package crawler

import (
	"context"
	"database/sql"
	"log"
	"mime"
	"net/http"
	"net/url"
	"strconv"
	"strings"
	"time"

	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"

	"git.sr.ht/~sircmpwn/searchhut/database"
)

// Metadata holds the details collected from a page's <head> which are
// relevant to indexing: its title, robots directives, author, description,
// canonical URL, and whether the page includes any <script> elements.
type Metadata struct {
	Title       *string
	Robots      []string
	Author      *string
	Description *string
	Canonical   *url.URL
	JavaScript  bool
}

// Index crawls the page at the given URL: it vets the response, parses the
// document, honors robots and canonical metadata, schedules outgoing links,
// and upserts the page into the database along with its full-text search
// vector.
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
	log.Println(url.String())

	// Vet the page with a cheap HEAD request before downloading it.
	resp, err := c.Head(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if !c.checkResponse(resp, url) {
		return nil
	}

	resp, err = c.Get(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if !c.checkResponse(resp, url) {
		return nil
	}

	node, err := html.Parse(resp.Body)
	if err != nil {
		return err
	}

	var (
		index  bool = true
		follow bool = true
	)
	var meta Metadata
	collectMetadata(node, &meta)
	for _, item := range meta.Robots {
		switch item {
		case "none":
			index = false
			follow = false
		case "noindex":
			index = false
		case "nofollow":
			follow = false
		}
	}

	if meta.Canonical != nil && meta.Canonical.String() != url.String() {
		// TODO: Should we check for and remove the non-canonical URL from the
		// database?
		log.Printf("Re-scheduling canonical URL: %s", meta.Canonical.String())
		c.Schedule(meta.Canonical)
		return nil
	}

	if follow {
		c.ScheduleLinks(url, node)
	}
	if !index {
		return nil
	}

	article, err := readability.FromDocument(node, url)
	if err != nil {
		return err
	}

	return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
		_, err := tx.ExecContext(ctx, `
			INSERT INTO page (
				domain_id,   -- $1
				last_index_date,
				weight,
				crawl_priority,
				crawl_delay,
				url,         -- $2
				checksum,    -- $3
				title,       -- $4
				author,      -- $5
				description, -- $6
				excerpt,     -- $7
				javascript,  -- $8
				fts_vector
				-- hostname     -- $9
				-- text_content -- $10
			) VALUES (
				$1, now(), 0, 0, '0s'::interval, $2, $3, $4, $5, $6, $7, $8,
				setweight(to_tsvector(coalesce($4, '')), 'A') ||
				setweight(to_tsvector(coalesce($5, '')), 'A') ||
				setweight(to_tsvector(coalesce($6, '')), 'A') ||
				setweight(to_tsvector(coalesce($9, '')), 'A') ||
				setweight(to_tsvector(coalesce($10, '')), 'D')
			)
			ON CONFLICT ON CONSTRAINT page_url_key
			DO UPDATE SET
				last_index_date = now(),
				checksum = $3,
				title = $4,
				author = $5,
				description = $6,
				excerpt = $7,
				javascript = $8,
				-- TODO: Maybe move this to a sub-query
				fts_vector =
					setweight(to_tsvector(coalesce($4, '')), 'A') ||
					setweight(to_tsvector(coalesce($5, '')), 'A') ||
					setweight(to_tsvector(coalesce($6, '')), 'A') ||
					setweight(to_tsvector(coalesce($9, '')), 'A') ||
					setweight(to_tsvector(coalesce($10, '')), 'D');`,
			c.DomainID, url.String(), []byte{}, meta.Title, meta.Author,
			meta.Description, article.Excerpt, meta.JavaScript,
			url.Host, article.TextContent)
		return err
	})
}
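// parseRetryAfter is a minimal sketch, not part of the original crawler,
// which isolates the Retry-After handling used by checkResponse below: per
// the HTTP spec the header value is either an HTTP-date or a delay in
// seconds, and this shows how both forms map to a time.Duration. The 3600s
// fallback mirrors the default used in checkResponse.
func parseRetryAfter(value string) time.Duration {
	// HTTP-date form, e.g. "Wed, 21 Oct 2015 07:28:00 GMT".
	if t, err := http.ParseTime(value); err == nil {
		return time.Until(t)
	}
	// Delay-seconds form, e.g. "120".
	seconds, err := strconv.Atoi(value)
	if err != nil {
		seconds = 3600 // unparseable header: back off for an hour
	}
	return time.Duration(seconds) * time.Second
}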
// checkResponse examines an HTTP response and reports whether the crawler
// should proceed with indexing it.
func (c *Crawler) checkResponse(resp *http.Response, url *url.URL) bool {
	switch resp.StatusCode {
	case http.StatusOK:
		// no-op
	case http.StatusTooManyRequests:
		retryAfter := resp.Header.Get("Retry-After")
		if retryAfter == "" {
			retryAfter = "3600"
		}
		// Retry-After is either an HTTP-date or a delay in seconds.
		t, err := http.ParseTime(retryAfter)
		if err != nil {
			seconds, err := strconv.Atoi(retryAfter)
			if err != nil {
				seconds = 3600
			}
			c.RetryAfter = time.Duration(seconds) * time.Second
		} else {
			c.RetryAfter = time.Until(t)
		}
		log.Printf("HTTP 429, pausing for %s", c.RetryAfter.String())
		c.Schedule(url)
		return false
	default:
		log.Printf("Unexpected status code %d", resp.StatusCode)
		return false
	}

	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		return false
	}
	if mt, _, err := mime.ParseMediaType(contentType); err != nil {
		return false
	} else if mt != "text/html" {
		return false
	}
	return true
}

// collectMetadata walks the parsed document and fills in meta from the first
// <title> element, <meta name="..."> tags, the canonical <link>, and the
// presence of any <script> elements.
func collectMetadata(node *html.Node, meta *Metadata) {
	if node.Type == html.ElementNode && node.Data == "title" && meta.Title == nil {
		title := collectText(node)
		meta.Title = &title
	}
	if node.Type == html.ElementNode && node.Data == "meta" {
		var (
			name    string
			content string
		)
		for _, attr := range node.Attr {
			if attr.Key == "name" {
				name = attr.Val
			}
			if attr.Key == "content" {
				content = attr.Val
			}
		}
		switch name {
		case "robots":
			// Directives are comma-separated and may be padded with
			// whitespace (e.g. "noindex, nofollow"), so trim each item or
			// the comparisons in Index will miss them.
			meta.Robots = strings.Split(content, ",")
			for i, item := range meta.Robots {
				meta.Robots[i] = strings.TrimSpace(item)
			}
		case "author":
			meta.Author = &content
		case "description":
			meta.Description = &content
		}
	}
	if node.Type == html.ElementNode && node.Data == "link" {
		var (
			rel  string
			href *url.URL
		)
		for _, attr := range node.Attr {
			if attr.Key == "rel" {
				rel = attr.Val
			}
			if attr.Key == "href" {
				href, _ = url.Parse(attr.Val)
			}
		}
		switch rel {
		case "canonical":
			meta.Canonical = href
		}
	}
	if node.Type == html.ElementNode && node.Data == "script" {
		meta.JavaScript = true
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		collectMetadata(child, meta)
	}
}

// collectText concatenates the contents of all text nodes beneath node.
func collectText(node *html.Node) string {
	text := ""
	if node.Type == html.TextNode {
		text += node.Data
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		text += collectText(child)
	}
	return text
}
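// collectMetadataExample is a minimal sketch, not part of the original
// crawler, showing what collectMetadata extracts from a parsed document.
// The HTML input here is a made-up example.
func collectMetadataExample() (Metadata, error) {
	const doc = `<html><head>
		<title>Example</title>
		<meta name="robots" content="noindex, nofollow">
		<meta name="description" content="An example page">
		<link rel="canonical" href="https://example.org/">
	</head><body><script></script></body></html>`

	node, err := html.Parse(strings.NewReader(doc))
	if err != nil {
		return Metadata{}, err
	}
	var meta Metadata
	collectMetadata(node, &meta)
	// At this point:
	//   *meta.Title       == "Example"
	//   meta.Robots       == []string{"noindex", "nofollow"}
	//   *meta.Description == "An example page"
	//   meta.Canonical    -> https://example.org/
	//   meta.JavaScript   == true (a <script> element is present)
	return meta, nil
}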