searchhut/crawler/index.go
2022-07-09 18:59:23 +02:00

180 lines
3.8 KiB
Go

package crawler
import (
"context"
"database/sql"
"log"
"mime"
"net/url"
"strings"
"github.com/go-shiori/go-readability"
"golang.org/x/net/html"
"git.sr.ht/~sircmpwn/searchhut/database"
)
// Metadata holds per-page information collected from an HTML document's
// <title> and <meta> tags by collectMetadata. Pointer fields are nil when
// the corresponding tag was absent from the document.
type Metadata struct {
// Title is the text content of the document's <title> element.
Title *string
// Robots holds the individual directives from <meta name="robots">,
// obtained by splitting its content attribute on commas.
Robots []string
// Author is the content of <meta name="author">, if present.
Author *string
// Description is the content of <meta name="description">, if present.
Description *string
}
// Index fetches the page at url, extracts its metadata and readable
// content, and upserts the result into the page table. A HEAD request is
// issued first so non-HTML resources are skipped without downloading
// their bodies. Non-200 responses and missing/unparsable content types
// are skipped silently (nil error); transport, parse, and database
// failures are returned to the caller. Robots meta directives are
// honored: "nofollow" suppresses link scheduling, "noindex" suppresses
// storage, and "none" suppresses both.
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
	log.Println(url.String())

	resp, err := c.Head(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		log.Printf("Unexpected status code %d", resp.StatusCode)
		return nil
	}

	// Only index HTML documents; anything else is skipped without error.
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		return nil
	}
	if mt, _, err := mime.ParseMediaType(contentType); err != nil {
		return nil
	} else if mt != "text/html" {
		return nil
	}

	resp, err = c.Get(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		log.Printf("Unexpected status code %d", resp.StatusCode)
		return nil
	}

	node, err := html.Parse(resp.Body)
	if err != nil {
		return err
	}

	var (
		index  bool = true
		follow bool = true
	)
	var meta Metadata
	collectMetadata(node, &meta)
	for _, item := range meta.Robots {
		switch item {
		case "none":
			index = false
			follow = false
		case "noindex":
			index = false
		case "nofollow":
			follow = false
		}
	}
	if follow {
		c.ScheduleLinks(url, node)
	}
	if !index {
		return nil
	}

	article, err := readability.FromDocument(node, url)
	if err != nil {
		return err
	}

	return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
		_, err := tx.ExecContext(ctx, `
		INSERT INTO page (
			domain_id, -- $1
			last_index_date,
			weight,
			crawl_priority,
			crawl_delay,
			url, -- $2
			checksum, -- $3
			title, -- $4
			author, -- $5
			description, -- $6
			excerpt, -- $7
			fts_vector
			-- text_content -- $8
			-- hostname -- $9
		) VALUES (
			$1, now(), 0, 0, '0s'::interval,
			$2, $3, $4, $5, $6, $7,
			setweight(to_tsvector(coalesce($4, '')), 'A') ||
			setweight(to_tsvector(coalesce($5, '')), 'A') ||
			setweight(to_tsvector(coalesce($6, '')), 'A') ||
			setweight(to_tsvector(coalesce($8, '')), 'A') ||
			setweight(to_tsvector(coalesce($9, '')), 'D')
		)
		ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
			last_index_date = now(),
			checksum = $3,
			title = $4,
			author = $5,
			description = $6,
			excerpt = $7,
			-- TODO: Maybe move this to a sub-query
			fts_vector =
				setweight(to_tsvector(coalesce($4, '')), 'A') ||
				setweight(to_tsvector(coalesce($5, '')), 'A') ||
				setweight(to_tsvector(coalesce($6, '')), 'A') ||
				setweight(to_tsvector(coalesce($8, '')), 'A') ||
				setweight(to_tsvector(coalesce($9, '')), 'D');`,
			// BUG FIX: $8 is text_content (tsvector weight A) and $9 is
			// hostname (weight D) per the query; the original passed
			// url.Host as $8 and article.TextContent as $9, inverting
			// both the stored columns and the search weighting.
			c.DomainID, url.String(), []byte{}, meta.Title, meta.Author,
			meta.Description, article.Excerpt, article.TextContent, url.Host)
		return err
	})
}
// collectMetadata walks the parsed HTML tree rooted at node and fills
// meta with the document's <title> text and the values of any
// <meta name="robots|author|description"> tags. When a tag appears more
// than once, the last occurrence in document order wins.
func collectMetadata(node *html.Node, meta *Metadata) {
	if node.Type == html.ElementNode && node.Data == "title" {
		title := collectText(node)
		meta.Title = &title
	}
	if node.Type == html.ElementNode && node.Data == "meta" {
		var (
			name    string
			content string
		)
		for _, attr := range node.Attr {
			if attr.Key == "name" {
				name = attr.Val
			}
			if attr.Key == "content" {
				content = attr.Val
			}
		}
		switch name {
		case "robots":
			// BUG FIX: robots directives are conventionally written with
			// whitespace and mixed case (e.g. "noindex, Nofollow"); a
			// plain comma split left items like " nofollow" that never
			// matched the directive switch in Index. Normalize each item.
			items := strings.Split(content, ",")
			for i, item := range items {
				items[i] = strings.ToLower(strings.TrimSpace(item))
			}
			meta.Robots = items
		case "author":
			meta.Author = &content
		case "description":
			meta.Description = &content
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		collectMetadata(child, meta)
	}
}
// collectText returns the concatenation of every text node in the
// subtree rooted at node, in document order.
func collectText(node *html.Node) string {
	var b strings.Builder
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.TextNode {
			b.WriteString(n.Data)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(node)
	return b.String()
}