crawler: perform HEAD before GET

Implements: https://todo.sr.ht/~sircmpwn/searchhut/8
This commit is contained in:
Drew DeVault 2022-07-09 18:59:23 +02:00
parent 759ad758af
commit baf82f9bb8
2 changed files with 20 additions and 1 deletions

View file

@ -97,6 +97,15 @@ func (c *Crawler) Get(ctx context.Context, url *url.URL) (*http.Response, error)
return c.Client.Do(req) return c.Client.Do(req)
} }
// Head issues an HTTP HEAD request for the given URL, tagging it with the
// crawler's User-Agent. The request is bound to ctx, so cancelling the
// context aborts the request. The caller is responsible for closing the
// response body (HEAD responses have no body payload, but the Close is
// still required to release the connection).
func (c *Crawler) Head(ctx context.Context, url *url.URL) (*http.Response, error) {
	// Use the http.MethodHead constant rather than a bare "HEAD" literal.
	req, err := http.NewRequestWithContext(ctx, http.MethodHead, url.String(), http.NoBody)
	if err != nil {
		return nil, err
	}
	// Set (not Add): User-Agent is a single-valued header.
	req.Header.Set("User-Agent", c.UserAgent)
	return c.Client.Do(req)
}
func (c *Crawler) Schedule(url *url.URL) { func (c *Crawler) Schedule(url *url.URL) {
if url.User != nil || url.Host != c.Domain || url.Scheme != "https" { if url.User != nil || url.Host != c.Domain || url.Scheme != "https" {
return return

View file

@ -24,7 +24,7 @@ type Metadata struct {
func (c *Crawler) Index(ctx context.Context, url *url.URL) error { func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
log.Println(url.String()) log.Println(url.String())
resp, err := c.Get(ctx, url) resp, err := c.Head(ctx, url)
if err != nil { if err != nil {
return err return err
} }
@ -44,6 +44,16 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
return nil return nil
} }
resp, err = c.Get(ctx, url)
if err != nil {
return err
}
defer resp.Body.Close()
if resp.StatusCode != 200 {
log.Printf("Unexpected status code %d", resp.StatusCode)
return nil
}
node, err := html.Parse(resp.Body) node, err := html.Parse(resp.Body)
if err != nil { if err != nil {
return err return err