crawler: perform HEAD before GET
Implements: https://todo.sr.ht/~sircmpwn/searchhut/8
This commit is contained in:
parent
759ad758af
commit
baf82f9bb8
2 changed files with 20 additions and 1 deletion
|
@ -97,6 +97,15 @@ func (c *Crawler) Get(ctx context.Context, url *url.URL) (*http.Response, error)
|
||||||
return c.Client.Do(req)
|
return c.Client.Do(req)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (c *Crawler) Head(ctx context.Context, url *url.URL) (*http.Response, error) {
|
||||||
|
req, err := http.NewRequestWithContext(ctx, "HEAD", url.String(), http.NoBody)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
req.Header.Add("User-Agent", c.UserAgent)
|
||||||
|
return c.Client.Do(req)
|
||||||
|
}
|
||||||
|
|
||||||
func (c *Crawler) Schedule(url *url.URL) {
|
func (c *Crawler) Schedule(url *url.URL) {
|
||||||
if url.User != nil || url.Host != c.Domain || url.Scheme != "https" {
|
if url.User != nil || url.Host != c.Domain || url.Scheme != "https" {
|
||||||
return
|
return
|
||||||
|
|
|
@ -24,7 +24,7 @@ type Metadata struct {
|
||||||
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
|
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
|
||||||
log.Println(url.String())
|
log.Println(url.String())
|
||||||
|
|
||||||
resp, err := c.Get(ctx, url)
|
resp, err := c.Head(ctx, url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
@ -44,6 +44,16 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
resp, err = c.Get(ctx, url)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer resp.Body.Close()
|
||||||
|
if resp.StatusCode != 200 {
|
||||||
|
log.Printf("Unexpected status code %d", resp.StatusCode)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
node, err := html.Parse(resp.Body)
|
node, err := html.Parse(resp.Body)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
|
Loading…
Reference in a new issue