crawler: perform HEAD before GET
Implements: https://todo.sr.ht/~sircmpwn/searchhut/8
This commit is contained in:
parent
759ad758af
commit
baf82f9bb8
2 changed files with 20 additions and 1 deletions
|
@ -97,6 +97,15 @@ func (c *Crawler) Get(ctx context.Context, url *url.URL) (*http.Response, error)
|
|||
return c.Client.Do(req)
|
||||
}
|
||||
|
||||
func (c *Crawler) Head(ctx context.Context, url *url.URL) (*http.Response, error) {
|
||||
req, err := http.NewRequestWithContext(ctx, "HEAD", url.String(), http.NoBody)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
req.Header.Add("User-Agent", c.UserAgent)
|
||||
return c.Client.Do(req)
|
||||
}
|
||||
|
||||
func (c *Crawler) Schedule(url *url.URL) {
|
||||
if url.User != nil || url.Host != c.Domain || url.Scheme != "https" {
|
||||
return
|
||||
|
|
|
@ -24,7 +24,7 @@ type Metadata struct {
|
|||
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
|
||||
log.Println(url.String())
|
||||
|
||||
resp, err := c.Get(ctx, url)
|
||||
resp, err := c.Head(ctx, url)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -44,6 +44,16 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
resp, err = c.Get(ctx, url)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode != 200 {
|
||||
log.Printf("Unexpected status code %d", resp.StatusCode)
|
||||
return nil
|
||||
}
|
||||
|
||||
node, err := html.Parse(resp.Body)
|
||||
if err != nil {
|
||||
return err
|
||||
|
|
Loading…
Reference in a new issue