searchhut/crawler/index.go

281 lines
6 KiB
Go
Raw Normal View History

2022-07-08 19:46:11 +02:00
package crawler
import (
"context"
"crypto/sha512"
2022-07-08 19:46:11 +02:00
"database/sql"
"io"
2022-07-08 19:46:11 +02:00
"log"
"mime"
"net/http"
2022-07-08 19:46:11 +02:00
"net/url"
"strconv"
2022-07-09 18:57:39 +02:00
"strings"
"time"
2022-07-08 19:46:11 +02:00
"github.com/go-shiori/go-readability"
"github.com/mattn/go-runewidth"
"golang.org/x/net/html"
2022-07-08 19:46:11 +02:00
"git.sr.ht/~sircmpwn/searchhut/database"
)
// weights maps the numeric page weight computed in Index (0-3) to a
// PostgreSQL text-search weight label, lowest ("D") to highest ("A").
var weights = [...]string{"D", "C", "B", "A"}
2022-07-09 18:57:39 +02:00
// Metadata holds the document-level information extracted from an HTML
// page by collectMetadata. Pointer fields are nil when the corresponding
// element or attribute was absent from the document.
type Metadata struct {
	Title       *string  // text of the first <title> element
	Robots      []string // directives from <meta name="robots">
	Author      *string  // content of <meta name="author">
	Description *string  // content of <meta name="description">
	Canonical   *url.URL // href of <link rel="canonical">, if parsable
	JavaScript  bool     // true if the page contains any <script> element
}
// counterWriter is an io.Writer that discards its input, keeping only a
// running total of the number of bytes written through it.
type counterWriter struct {
	Length int
}

// Write records the size of p and reports it as fully consumed; it never
// fails.
func (c *counterWriter) Write(p []byte) (n int, err error) {
	n = len(p)
	c.Length += n
	return n, nil
}
2022-07-08 19:46:11 +02:00
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
log.Println(url.String())
resp, err := c.Head(ctx, url)
2022-07-08 20:30:09 +02:00
if err != nil {
return err
}
2022-07-08 19:46:11 +02:00
defer resp.Body.Close()
if !c.checkResponse(resp, url) {
return nil
}
resp, err = c.Get(ctx, url)
if err != nil {
return err
}
defer resp.Body.Close()
hash := sha512.New()
counter := counterWriter{}
reader := io.TeeReader(resp.Body, io.MultiWriter(&counter, hash))
if !c.checkResponse(resp, url) {
return nil
}
node, err := html.Parse(reader)
2022-07-08 19:46:11 +02:00
if err != nil {
return err
}
2022-07-09 18:57:39 +02:00
var (
index bool = true
follow bool = true
)
var meta Metadata
collectMetadata(node, &meta)
for _, item := range meta.Robots {
switch item {
case "none":
index = false
follow = false
case "noindex":
index = false
case "nofollow":
follow = false
}
}
if meta.Canonical != nil && meta.Canonical.String() != url.String() {
// TODO: Should we check for and remove the non-canonical URL from the
// database?
log.Printf("Re-scheduling canonical URL: %s", meta.Canonical.String())
c.Schedule(meta.Canonical)
return nil
}
2022-07-09 18:57:39 +02:00
if follow {
c.ScheduleLinks(url, node)
}
if !index {
return nil
}
2022-07-08 19:46:11 +02:00
article, err := readability.FromDocument(node, url)
if err != nil {
return err
2022-07-08 19:46:11 +02:00
}
weight := 1
if c.Authoritative {
weight += 1
}
if url.Path == "" || url.Path == "/" {
weight += 1
}
excerpt := runewidth.Truncate(article.Excerpt, 512, "…")
2022-07-08 19:46:11 +02:00
return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
_, err := tx.ExecContext(ctx, `
INSERT INTO page (
2022-07-09 18:57:39 +02:00
domain_id, -- $1
2022-07-08 19:46:11 +02:00
last_index_date,
2022-07-10 10:13:11 +02:00
source,
2022-07-09 18:57:39 +02:00
url, -- $2
page_size, -- $3
checksum, -- $4
title, -- $5
author, -- $6
description, -- $7
excerpt, -- $8
javascript, -- $9
2022-07-08 19:46:11 +02:00
fts_vector
-- hostname -- $10
-- domain_labels -- $11
-- text_content -- $12
2022-07-08 19:46:11 +02:00
) VALUES (
$1, now(), 'crawler',
$2, $3, $4, $5, $6, $7, $8, $9,
setweight(to_tsvector(coalesce($5, '')), $13) ||
setweight(to_tsvector(coalesce($6, '')), $13) ||
setweight(to_tsvector(coalesce($7, '')), $13) ||
setweight(to_tsvector(coalesce($10, '')), $13) ||
setweight(to_tsvector(coalesce($11, '')), $13) ||
setweight(to_tsvector(coalesce($12, '')), 'D')
2022-07-08 19:46:11 +02:00
)
ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
last_index_date = now(),
page_size = $3,
checksum = $4,
title = $5,
author = $6,
description = $7,
excerpt = $8,
javascript = $9,
2022-07-08 19:46:11 +02:00
-- TODO: Maybe move this to a sub-query
fts_vector =
setweight(to_tsvector(coalesce($5, '')), $13) ||
setweight(to_tsvector(coalesce($6, '')), $13) ||
setweight(to_tsvector(coalesce($7, '')), $13) ||
setweight(to_tsvector(coalesce($10, '')), $13) ||
setweight(to_tsvector(coalesce($11, '')), $13) ||
setweight(to_tsvector(coalesce($12, '')), 'D');`,
c.DomainID, url.String(), counter.Length, hash.Sum([]byte{}),
meta.Title, meta.Author, meta.Description, excerpt,
meta.JavaScript, url.Host, c.labels, article.TextContent,
weights[weight])
2022-07-08 19:46:11 +02:00
return err
})
}
2022-07-09 18:57:39 +02:00
// Checks an HTTP response and returns true if the crawler should proceed.
func (c *Crawler) checkResponse(resp *http.Response, url *url.URL) bool {
switch resp.StatusCode {
case http.StatusOK:
// no-op
case http.StatusTooManyRequests:
retryAfter := resp.Header.Get("Retry-After")
if retryAfter == "" {
retryAfter = "3600"
}
2022-07-09 19:16:48 +02:00
t, err := http.ParseTime(retryAfter)
if err != nil {
2022-07-09 19:16:48 +02:00
seconds, err := strconv.Atoi(retryAfter)
if err != nil {
seconds = 3600
}
c.RetryAfter = time.Duration(seconds) * time.Second
} else {
c.RetryAfter = t.Sub(time.Now())
}
2022-07-09 19:16:48 +02:00
log.Printf("HTTP 429, pausing for %s", c.RetryAfter.String())
c.Schedule(url)
return false
default:
log.Printf("Unexpected status code %d", resp.StatusCode)
return false
}
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
return false
}
if mt, _, err := mime.ParseMediaType(contentType); err != nil {
return false
} else if mt != "text/html" {
return false
}
return true
}
2022-07-09 18:57:39 +02:00
func collectMetadata(node *html.Node, meta *Metadata) {
if node.Type == html.ElementNode && node.Data == "title" && meta.Title == nil {
2022-07-09 18:57:39 +02:00
title := collectText(node)
meta.Title = &title
}
2022-07-09 18:57:39 +02:00
if node.Type == html.ElementNode && node.Data == "meta" {
var (
name string
content string
)
for _, attr := range node.Attr {
if attr.Key == "name" {
name = attr.Val
}
if attr.Key == "content" {
content = attr.Val
}
}
switch name {
case "robots":
meta.Robots = strings.Split(content, ",")
case "author":
meta.Author = &content
case "description":
meta.Description = &content
}
}
if node.Type == html.ElementNode && node.Data == "link" {
var (
rel string
href *url.URL
)
for _, attr := range node.Attr {
if attr.Key == "rel" {
rel = attr.Val
}
if attr.Key == "href" {
href, _ = url.Parse(attr.Val)
}
}
switch rel {
case "canonical":
meta.Canonical = href
}
}
if node.Type == html.ElementNode && node.Data == "script" {
meta.JavaScript = true
}
2022-07-09 18:57:39 +02:00
for child := node.FirstChild; child != nil; child = child.NextSibling {
collectMetadata(child, meta)
}
}
// collectText concatenates the contents of every text node in the tree
// rooted at node, in document order.
//
// A single strings.Builder is shared across the whole walk instead of
// concatenating strings at each recursion level, which avoids repeated
// intermediate allocations on large subtrees.
func collectText(node *html.Node) string {
	var b strings.Builder
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.TextNode {
			b.WriteString(n.Data)
		}
		for child := n.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(node)
	return b.String()
}