2022-07-08 19:46:11 +02:00
|
|
|
package crawler
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2022-07-10 09:36:07 +02:00
|
|
|
"crypto/sha512"
|
2022-07-08 19:46:11 +02:00
|
|
|
"database/sql"
|
2022-07-10 09:36:07 +02:00
|
|
|
"io"
|
2022-07-08 19:46:11 +02:00
|
|
|
"log"
|
2022-07-08 20:13:32 +02:00
|
|
|
"mime"
|
2022-07-09 19:13:46 +02:00
|
|
|
"net/http"
|
2022-07-08 19:46:11 +02:00
|
|
|
"net/url"
|
2022-07-09 19:13:46 +02:00
|
|
|
"strconv"
|
2022-07-09 18:57:39 +02:00
|
|
|
"strings"
|
2022-07-09 19:13:46 +02:00
|
|
|
"time"
|
2022-07-08 19:46:11 +02:00
|
|
|
|
|
|
|
"github.com/go-shiori/go-readability"
|
2022-07-08 20:13:32 +02:00
|
|
|
"golang.org/x/net/html"
|
2022-07-08 19:46:11 +02:00
|
|
|
|
|
|
|
"git.sr.ht/~sircmpwn/searchhut/database"
|
|
|
|
)
|
|
|
|
|
// weights maps the numeric page weight computed in Index (0 through 3)
// onto PostgreSQL text-search weight labels, from least important ("D")
// to most important ("A").
var weights = [4]string{"D", "C", "B", "A"}
|
|
|
|
|
// Metadata holds document-level information extracted from an HTML page
// by collectMetadata. Pointer fields are nil when the page does not
// provide the corresponding value.
type Metadata struct {
	// Title is the text of the first <title> element, if any.
	Title *string
	// Robots is the comma-separated directive list from
	// <meta name="robots">, split into individual tokens.
	Robots []string
	// Author is the content of <meta name="author">, if present.
	Author *string
	// Description is the content of <meta name="description">, if present.
	Description *string
	// Canonical is the parsed href of <link rel="canonical">, if present.
	Canonical *url.URL
	// JavaScript reports whether the page contains any <script> element.
	JavaScript bool
}
|
|
|
|
|
2022-07-08 19:46:11 +02:00
|
|
|
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
|
|
|
|
log.Println(url.String())
|
|
|
|
|
2022-07-09 18:59:23 +02:00
|
|
|
resp, err := c.Head(ctx, url)
|
2022-07-08 20:30:09 +02:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2022-07-08 19:46:11 +02:00
|
|
|
defer resp.Body.Close()
|
2022-07-09 19:13:46 +02:00
|
|
|
if !c.checkResponse(resp, url) {
|
2022-07-08 20:13:32 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-07-09 18:59:23 +02:00
|
|
|
resp, err = c.Get(ctx, url)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer resp.Body.Close()
|
2022-07-10 09:36:07 +02:00
|
|
|
hash := sha512.New()
|
|
|
|
reader := io.TeeReader(resp.Body, hash)
|
|
|
|
|
2022-07-09 19:13:46 +02:00
|
|
|
if !c.checkResponse(resp, url) {
|
2022-07-09 18:59:23 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-07-10 09:36:07 +02:00
|
|
|
node, err := html.Parse(reader)
|
2022-07-08 19:46:11 +02:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2022-07-09 18:57:39 +02:00
|
|
|
|
|
|
|
var (
|
|
|
|
index bool = true
|
|
|
|
follow bool = true
|
|
|
|
)
|
|
|
|
|
|
|
|
var meta Metadata
|
|
|
|
collectMetadata(node, &meta)
|
|
|
|
for _, item := range meta.Robots {
|
|
|
|
switch item {
|
|
|
|
case "none":
|
|
|
|
index = false
|
|
|
|
follow = false
|
|
|
|
case "noindex":
|
|
|
|
index = false
|
|
|
|
case "nofollow":
|
|
|
|
follow = false
|
|
|
|
}
|
|
|
|
}
|
2022-07-09 19:06:28 +02:00
|
|
|
if meta.Canonical != nil && meta.Canonical.String() != url.String() {
|
|
|
|
// TODO: Should we check for and remove the non-canonical URL from the
|
|
|
|
// database?
|
|
|
|
log.Printf("Re-scheduling canonical URL: %s", meta.Canonical.String())
|
|
|
|
c.Schedule(meta.Canonical)
|
|
|
|
return nil
|
|
|
|
}
|
2022-07-09 18:57:39 +02:00
|
|
|
|
|
|
|
if follow {
|
|
|
|
c.ScheduleLinks(url, node)
|
|
|
|
}
|
|
|
|
if !index {
|
|
|
|
return nil
|
|
|
|
}
|
2022-07-08 19:46:11 +02:00
|
|
|
|
2022-07-08 20:13:32 +02:00
|
|
|
article, err := readability.FromDocument(node, url)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2022-07-08 19:46:11 +02:00
|
|
|
}
|
|
|
|
|
2022-07-11 14:59:35 +02:00
|
|
|
weight := 1
|
|
|
|
if c.Authoritative {
|
|
|
|
weight += 1
|
|
|
|
}
|
|
|
|
if url.Path == "" || url.Path == "/" {
|
|
|
|
weight += 1
|
|
|
|
}
|
|
|
|
|
2022-07-08 19:46:11 +02:00
|
|
|
return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
|
|
|
|
_, err := tx.ExecContext(ctx, `
|
|
|
|
INSERT INTO page (
|
2022-07-09 18:57:39 +02:00
|
|
|
domain_id, -- $1
|
2022-07-08 19:46:11 +02:00
|
|
|
last_index_date,
|
2022-07-10 10:13:11 +02:00
|
|
|
source,
|
2022-07-08 19:46:11 +02:00
|
|
|
weight,
|
|
|
|
crawl_priority,
|
|
|
|
crawl_delay,
|
2022-07-09 18:57:39 +02:00
|
|
|
url, -- $2
|
|
|
|
checksum, -- $3
|
|
|
|
title, -- $4
|
|
|
|
author, -- $5
|
2022-07-11 14:58:34 +02:00
|
|
|
description, -- $6
|
2022-07-09 18:57:39 +02:00
|
|
|
excerpt, -- $7
|
2022-07-10 09:07:37 +02:00
|
|
|
javascript, -- $8
|
2022-07-08 19:46:11 +02:00
|
|
|
fts_vector
|
2022-07-11 14:58:34 +02:00
|
|
|
-- hostname -- $9
|
|
|
|
-- domain_labels -- $10
|
|
|
|
-- text_content -- $11
|
2022-07-08 19:46:11 +02:00
|
|
|
) VALUES (
|
2022-07-10 10:13:11 +02:00
|
|
|
$1, now(), 'crawler', 0, 0, '0s'::interval,
|
2022-07-10 09:07:37 +02:00
|
|
|
$2, $3, $4, $5, $6, $7, $8,
|
2022-07-11 14:59:35 +02:00
|
|
|
setweight(to_tsvector(coalesce($4, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($5, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($6, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($9, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($10, '')), $12) ||
|
2022-07-11 14:58:34 +02:00
|
|
|
setweight(to_tsvector(coalesce($11, '')), 'D')
|
2022-07-08 19:46:11 +02:00
|
|
|
)
|
|
|
|
ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
|
|
|
|
last_index_date = now(),
|
|
|
|
checksum = $3,
|
|
|
|
title = $4,
|
|
|
|
author = $5,
|
2022-07-09 18:57:39 +02:00
|
|
|
description = $6,
|
|
|
|
excerpt = $7,
|
2022-07-10 09:07:37 +02:00
|
|
|
javascript = $8,
|
2022-07-08 19:46:11 +02:00
|
|
|
-- TODO: Maybe move this to a sub-query
|
|
|
|
fts_vector =
|
2022-07-11 14:59:35 +02:00
|
|
|
setweight(to_tsvector(coalesce($4, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($5, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($6, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($9, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($10, '')), $12) ||
|
2022-07-11 14:58:34 +02:00
|
|
|
setweight(to_tsvector(coalesce($11, '')), 'D');`,
|
2022-07-10 09:36:07 +02:00
|
|
|
c.DomainID, url.String(), hash.Sum([]byte{}), meta.Title,
|
|
|
|
meta.Author, meta.Description, article.Excerpt, meta.JavaScript,
|
2022-07-11 14:59:35 +02:00
|
|
|
url.Host, c.labels, article.TextContent, weights[weight])
|
2022-07-08 19:46:11 +02:00
|
|
|
return err
|
|
|
|
})
|
|
|
|
}
|
2022-07-09 18:57:39 +02:00
|
|
|
|
2022-07-09 19:13:46 +02:00
|
|
|
// Checks an HTTP response and returns true if the crawler should proceed.
|
|
|
|
func (c *Crawler) checkResponse(resp *http.Response, url *url.URL) bool {
|
|
|
|
switch resp.StatusCode {
|
|
|
|
case http.StatusOK:
|
|
|
|
// no-op
|
|
|
|
case http.StatusTooManyRequests:
|
|
|
|
retryAfter := resp.Header.Get("Retry-After")
|
|
|
|
if retryAfter == "" {
|
|
|
|
retryAfter = "3600"
|
|
|
|
}
|
2022-07-09 19:16:48 +02:00
|
|
|
t, err := http.ParseTime(retryAfter)
|
2022-07-09 19:13:46 +02:00
|
|
|
if err != nil {
|
2022-07-09 19:16:48 +02:00
|
|
|
seconds, err := strconv.Atoi(retryAfter)
|
|
|
|
if err != nil {
|
|
|
|
seconds = 3600
|
|
|
|
}
|
|
|
|
c.RetryAfter = time.Duration(seconds) * time.Second
|
|
|
|
} else {
|
|
|
|
c.RetryAfter = t.Sub(time.Now())
|
2022-07-09 19:13:46 +02:00
|
|
|
}
|
2022-07-09 19:16:48 +02:00
|
|
|
log.Printf("HTTP 429, pausing for %s", c.RetryAfter.String())
|
2022-07-09 19:13:46 +02:00
|
|
|
c.Schedule(url)
|
|
|
|
return false
|
|
|
|
default:
|
|
|
|
log.Printf("Unexpected status code %d", resp.StatusCode)
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
contentType := resp.Header.Get("Content-Type")
|
|
|
|
if contentType == "" {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
if mt, _, err := mime.ParseMediaType(contentType); err != nil {
|
|
|
|
return false
|
|
|
|
} else if mt != "text/html" {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2022-07-09 18:57:39 +02:00
|
|
|
func collectMetadata(node *html.Node, meta *Metadata) {
|
2022-07-10 09:07:37 +02:00
|
|
|
if node.Type == html.ElementNode && node.Data == "title" && meta.Title == nil {
|
2022-07-09 18:57:39 +02:00
|
|
|
title := collectText(node)
|
|
|
|
meta.Title = &title
|
|
|
|
}
|
2022-07-10 09:07:37 +02:00
|
|
|
|
2022-07-09 18:57:39 +02:00
|
|
|
if node.Type == html.ElementNode && node.Data == "meta" {
|
|
|
|
var (
|
|
|
|
name string
|
|
|
|
content string
|
|
|
|
)
|
|
|
|
for _, attr := range node.Attr {
|
|
|
|
if attr.Key == "name" {
|
|
|
|
name = attr.Val
|
|
|
|
}
|
|
|
|
if attr.Key == "content" {
|
|
|
|
content = attr.Val
|
|
|
|
}
|
|
|
|
}
|
|
|
|
switch name {
|
|
|
|
case "robots":
|
|
|
|
meta.Robots = strings.Split(content, ",")
|
|
|
|
case "author":
|
|
|
|
meta.Author = &content
|
|
|
|
case "description":
|
|
|
|
meta.Description = &content
|
|
|
|
}
|
|
|
|
}
|
2022-07-10 09:07:37 +02:00
|
|
|
|
2022-07-09 19:06:28 +02:00
|
|
|
if node.Type == html.ElementNode && node.Data == "link" {
|
|
|
|
var (
|
|
|
|
rel string
|
|
|
|
href *url.URL
|
|
|
|
)
|
|
|
|
for _, attr := range node.Attr {
|
|
|
|
if attr.Key == "rel" {
|
|
|
|
rel = attr.Val
|
|
|
|
}
|
|
|
|
if attr.Key == "href" {
|
|
|
|
href, _ = url.Parse(attr.Val)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
switch rel {
|
|
|
|
case "canonical":
|
|
|
|
meta.Canonical = href
|
|
|
|
}
|
|
|
|
}
|
2022-07-10 09:07:37 +02:00
|
|
|
|
|
|
|
if node.Type == html.ElementNode && node.Data == "script" {
|
|
|
|
meta.JavaScript = true
|
|
|
|
}
|
|
|
|
|
2022-07-09 18:57:39 +02:00
|
|
|
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
|
|
|
collectMetadata(child, meta)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func collectText(node *html.Node) string {
|
|
|
|
text := ""
|
|
|
|
if node.Type == html.TextNode {
|
|
|
|
text += node.Data
|
|
|
|
}
|
|
|
|
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
|
|
|
text += collectText(child)
|
|
|
|
}
|
|
|
|
return text
|
|
|
|
}
|