searchhut/crawler/index.go

281 lines
6 KiB
Go
Raw Normal View History

2022-07-08 19:46:11 +02:00
package crawler
import (
"context"
"crypto/sha512"
2022-07-08 19:46:11 +02:00
"database/sql"
"io"
2022-07-08 19:46:11 +02:00
"log"
"mime"
"net/http"
2022-07-08 19:46:11 +02:00
"net/url"
"strconv"
2022-07-09 18:57:39 +02:00
"strings"
"time"
2022-07-08 19:46:11 +02:00
"github.com/go-shiori/go-readability"
"github.com/mattn/go-runewidth"
"golang.org/x/net/html"
2022-07-08 19:46:11 +02:00
"git.sr.ht/~sircmpwn/searchhut/database"
)
// weights maps the numeric page weight computed in Index (0-3) to a
// PostgreSQL text-search weight label, lowest ("D") to highest ("A").
var weights = [...]string{"D", "C", "B", "A"}
2022-07-09 18:57:39 +02:00
// Metadata holds the document-level information extracted from an HTML
// page by collectMetadata. Pointer fields are nil when the corresponding
// element or attribute was absent from the document.
type Metadata struct {
	Title       *string  // text of the first <title> element
	Robots      []string // directives from <meta name="robots">
	Author      *string  // content of <meta name="author">
	Description *string  // content of <meta name="description">
	Canonical   *url.URL // href of <link rel="canonical">, if parsable
	JavaScript  bool     // true if the page contains any <script> element
}
// counterWriter is an io.Writer that discards its input, keeping only a
// running total of the number of bytes written through it.
type counterWriter struct {
	Length int
}

// Write records the size of p and reports it as fully consumed; it never
// fails.
func (c *counterWriter) Write(p []byte) (n int, err error) {
	n = len(p)
	c.Length += n
	return n, nil
}
2022-07-08 19:46:11 +02:00
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
log.Println(url.String())
resp, err := c.Head(ctx, url)
2022-07-08 20:30:09 +02:00
if err != nil {
return err
}
2022-07-08 19:46:11 +02:00
defer resp.Body.Close()
if !c.checkResponse(resp, url) {
return nil
}
resp, err = c.Get(ctx, url)
if err != nil {
return err
}
defer resp.Body.Close()
hash := sha512.New()
counter := counterWriter{}
reader := io.TeeReader(resp.Body, io.MultiWriter(&counter, hash))
if !c.checkResponse(resp, url) {
return nil
}
node, err := html.Parse(reader)
2022-07-08 19:46:11 +02:00
if err != nil {
return err
}
2022-07-09 18:57:39 +02:00
var (
index bool = true
follow bool = true
)
var meta Metadata
collectMetadata(node, &meta)
for _, item := range meta.Robots {
switch item {
case "none":
index = false
follow = false
case "noindex":
index = false
case "nofollow":
follow = false
}
}
if meta.Canonical != nil && meta.Canonical.String() != url.String() {
// TODO: Should we check for and remove the non-canonical URL from the
// database?
log.Printf("Re-scheduling canonical URL: %s", meta.Canonical.String())
c.Schedule(meta.Canonical)
return nil
}
2022-07-09 18:57:39 +02:00
if follow {
c.ScheduleLinks(url, node)
}
if !index {
return nil
}
2022-07-08 19:46:11 +02:00
article, err := readability.FromDocument(node, url)
if err != nil {
return err
2022-07-08 19:46:11 +02:00
}
weight := 1
if c.Authoritative {
weight += 1
}
if url.Path == "" || url.Path == "/" {
weight += 1
}
excerpt := runewidth.Truncate(article.Excerpt, 512, "…")
2022-07-08 19:46:11 +02:00
return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
_, err := tx.ExecContext(ctx, `
INSERT INTO page (
2022-07-09 18:57:39 +02:00
domain_id, -- $1
2022-07-08 19:46:11 +02:00
last_index_date,
2022-07-10 10:13:11 +02:00
source,
2022-07-09 18:57:39 +02:00
url, -- $2
page_size, -- $3
checksum, -- $4
title, -- $5
author, -- $6
description, -- $7
excerpt, -- $8
javascript, -- $9
2022-07-08 19:46:11 +02:00
fts_vector
-- hostname -- $10
-- domain_labels -- $11
-- text_content -- $12
2022-07-08 19:46:11 +02:00
) VALUES (
$1, now(), 'crawler',
$2, $3, $4, $5, $6, $7, $8, $9,
setweight(to_tsvector(coalesce($5, '')), $13) ||
setweight(to_tsvector(coalesce($6, '')), $13) ||
setweight(to_tsvector(coalesce($7, '')), $13) ||
setweight(to_tsvector(coalesce($10, '')), $13) ||
setweight(to_tsvector(coalesce($11, '')), $13) ||
setweight(to_tsvector(coalesce($12, '')), 'D')
2022-07-08 19:46:11 +02:00
)
ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
last_index_date = now(),
page_size = $3,
checksum = $4,
title = $5,
author = $6,
description = $7,
excerpt = $8,
javascript = $9,
2022-07-08 19:46:11 +02:00
-- TODO: Maybe move this to a sub-query
fts_vector =
setweight(to_tsvector(coalesce($5, '')), $13) ||
setweight(to_tsvector(coalesce($6, '')), $13) ||
setweight(to_tsvector(coalesce($7, '')), $13) ||
setweight(to_tsvector(coalesce($10, '')), $13) ||
setweight(to_tsvector(coalesce($11, '')), $13) ||
setweight(to_tsvector(coalesce($12, '')), 'D');`,
c.DomainID, url.String(), counter.Length, hash.Sum([]byte{}),
meta.Title, meta.Author, meta.Description, excerpt,
meta.JavaScript, url.Host, c.labels, article.TextContent,
weights[weight])
2022-07-08 19:46:11 +02:00
return err
})
}
2022-07-09 18:57:39 +02:00
// Checks an HTTP response and returns true if the crawler should proceed.
func (c *Crawler) checkResponse(resp *http.Response, url *url.URL) bool {
switch resp.StatusCode {
case http.StatusOK:
// no-op
case http.StatusTooManyRequests:
retryAfter := resp.Header.Get("Retry-After")
if retryAfter == "" {
retryAfter = "3600"
}
2022-07-09 19:16:48 +02:00
t, err := http.ParseTime(retryAfter)
if err != nil {
2022-07-09 19:16:48 +02:00
seconds, err := strconv.Atoi(retryAfter)
if err != nil {
seconds = 3600
}
c.RetryAfter = time.Duration(seconds) * time.Second
} else {
c.RetryAfter = t.Sub(time.Now())
}
2022-07-09 19:16:48 +02:00
log.Printf("HTTP 429, pausing for %s", c.RetryAfter.String())
c.Schedule(url)
return false
default:
log.Printf("Unexpected status code %d", resp.StatusCode)
return false
}
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
return false
}
if mt, _, err := mime.ParseMediaType(contentType); err != nil {
return false
} else if mt != "text/html" {
return false
}
return true
}
2022-07-09 18:57:39 +02:00
func collectMetadata(node *html.Node, meta *Metadata) {
if node.Type == html.ElementNode && node.Data == "title" && meta.Title == nil {
2022-07-09 18:57:39 +02:00
title := collectText(node)
meta.Title = &title
}
2022-07-09 18:57:39 +02:00
if node.Type == html.ElementNode && node.Data == "meta" {
var (
name string
content string
)
for _, attr := range node.Attr {
if attr.Key == "name" {
name = attr.Val
}
if attr.Key == "content" {
content = attr.Val
}
}
switch name {
case "robots":
meta.Robots = strings.Split(content, ",")
case "author":
meta.Author = &content
case "description":
meta.Description = &content
}
}
if node.Type == html.ElementNode && node.Data == "link" {
var (
rel string
href *url.URL
)
for _, attr := range node.Attr {
if attr.Key == "rel" {
rel = attr.Val
}
if attr.Key == "href" {
href, _ = url.Parse(attr.Val)
}
}
switch rel {
case "canonical":
meta.Canonical = href
}
}
if node.Type == html.ElementNode && node.Data == "script" {
meta.JavaScript = true
}
2022-07-09 18:57:39 +02:00
for child := node.FirstChild; child != nil; child = child.NextSibling {
collectMetadata(child, meta)
}
}
// collectText concatenates the contents of every text node in the tree
// rooted at node, in document order.
//
// A single strings.Builder is shared across the whole walk instead of
// concatenating strings at each recursion level, which avoids repeated
// intermediate allocations on large subtrees.
func collectText(node *html.Node) string {
	var b strings.Builder
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.TextNode {
			b.WriteString(n.Data)
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(node)
	return b.String()
}