package crawler

import (
	"context"
	"database/sql"
	"log"
	"mime"
	"net/url"
	"strings"

	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"

	"git.sr.ht/~sircmpwn/searchhut/database"
)

// Metadata holds the subset of HTML <head> data relevant to indexing.
type Metadata struct {
	Title       *string
	Robots      []string
	Author      *string
	Description *string
}

// Index fetches a page, extracts its metadata and readable content, and
// upserts it into the page table. Links found on the page are scheduled
// for crawling unless the page's robots directives forbid it.
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
	log.Println(url.String())

	// Issue a HEAD request first so that non-HTML resources can be
	// skipped without downloading their bodies.
	resp, err := c.Head(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		log.Printf("Unexpected status code %d", resp.StatusCode)
		return nil
	}

	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		return nil
	}
	if mt, _, err := mime.ParseMediaType(contentType); err != nil {
		return nil
	} else if mt != "text/html" {
		return nil
	}

	resp, err = c.Get(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		log.Printf("Unexpected status code %d", resp.StatusCode)
		return nil
	}

	node, err := html.Parse(resp.Body)
	if err != nil {
		return err
	}

	var (
		index  bool = true
		follow bool = true
	)
	var meta Metadata
	collectMetadata(node, &meta)
	for _, item := range meta.Robots {
		// Directives are comma-separated and may carry surrounding
		// whitespace, e.g. "noindex, nofollow".
		switch strings.TrimSpace(item) {
		case "none":
			index = false
			follow = false
		case "noindex":
			index = false
		case "nofollow":
			follow = false
		}
	}

	if follow {
		c.ScheduleLinks(url, node)
	}
	if !index {
		return nil
	}

	article, err := readability.FromDocument(node, url)
	if err != nil {
		return err
	}

	return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
		_, err := tx.ExecContext(ctx, `
		INSERT INTO page (
			domain_id,        -- $1
			last_index_date,
			weight,
			crawl_priority,
			crawl_delay,
			url,              -- $2
			checksum,         -- $3
			title,            -- $4
			author,           -- $5
			description,      -- $6
			excerpt,          -- $7
			fts_vector
			-- text_content   -- $8
			-- hostname       -- $9
		) VALUES (
			$1, now(), 0, 0, '0s'::interval,
			$2, $3, $4, $5, $6, $7,
			setweight(to_tsvector(coalesce($4, '')), 'A') ||
			setweight(to_tsvector(coalesce($5, '')), 'A') ||
			setweight(to_tsvector(coalesce($6, '')), 'A') ||
			setweight(to_tsvector(coalesce($8, '')), 'A') ||
			setweight(to_tsvector(coalesce($9, '')), 'D')
		)
		ON CONFLICT ON CONSTRAINT page_url_key
		DO UPDATE SET
			last_index_date = now(),
			checksum = $3,
			title = $4,
			author = $5,
			description = $6,
			excerpt = $7,
			-- TODO: Maybe move this to a sub-query
			fts_vector =
				setweight(to_tsvector(coalesce($4, '')), 'A') ||
				setweight(to_tsvector(coalesce($5, '')), 'A') ||
				setweight(to_tsvector(coalesce($6, '')), 'A') ||
				setweight(to_tsvector(coalesce($8, '')), 'A') ||
				setweight(to_tsvector(coalesce($9, '')), 'D');`,
			c.DomainID, url.String(), []byte{}, meta.Title,
			meta.Author, meta.Description, article.Excerpt,
			article.TextContent, url.Host)
		return err
	})
}

// collectMetadata walks the parsed document and records the <title> and
// the relevant <meta name=...> elements in meta.
func collectMetadata(node *html.Node, meta *Metadata) {
	if node.Type == html.ElementNode && node.Data == "title" {
		title := collectText(node)
		meta.Title = &title
	}
	if node.Type == html.ElementNode && node.Data == "meta" {
		var (
			name    string
			content string
		)
		for _, attr := range node.Attr {
			if attr.Key == "name" {
				name = attr.Val
			}
			if attr.Key == "content" {
				content = attr.Val
			}
		}
		switch name {
		case "robots":
			meta.Robots = strings.Split(content, ",")
		case "author":
			meta.Author = &content
		case "description":
			meta.Description = &content
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		collectMetadata(child, meta)
	}
}

// collectText returns the concatenated text content of node and all of
// its descendants.
func collectText(node *html.Node) string {
	text := ""
	if node.Type == html.TextNode {
		text += node.Data
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		text += collectText(child)
	}
	return text
}