searchhut/crawler/index.go

package crawler

import (
	"context"
	"database/sql"
	"log"
	"mime"
	"net/http"
	"net/url"
	"strconv"
	"strings"
	"time"

	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"

	"git.sr.ht/~sircmpwn/searchhut/database"
)
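
// Metadata holds metadata collected from a page's markup: the document
// title, robots directives, author, description, and canonical URL.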
type Metadata struct {
	Title       *string
	Robots      []string
	Author      *string
	Description *string
	Canonical   *url.URL
}
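
// Index fetches the page at url, schedules its outgoing links for crawling
// (subject to robots directives), and upserts the extracted content and
// metadata into the database.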
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
	log.Println(url.String())
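
	// Issue a HEAD request first so that error responses and non-HTML
	// content can be skipped without downloading the body.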
	resp, err := c.Head(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if !c.checkResponse(resp, url) {
		return nil
	}

	resp, err = c.Get(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if !c.checkResponse(resp, url) {
		return nil
	}

	node, err := html.Parse(resp.Body)
	if err != nil {
		return err
	}

	var (
		index  = true
		follow = true
	)
	var meta Metadata
	collectMetadata(node, &meta)
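
	// Honor the page's robots meta directives: "noindex" and "none" suppress
	// indexing, "nofollow" and "none" suppress link scheduling.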
	for _, item := range meta.Robots {
		// Directives are comma-separated and may carry surrounding
		// whitespace (e.g. "noindex, nofollow").
		switch strings.TrimSpace(item) {
		case "none":
			index = false
			follow = false
		case "noindex":
			index = false
		case "nofollow":
			follow = false
		}
	}
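
	// If the page declares a different canonical URL, schedule that URL
	// instead of indexing this one.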
	if meta.Canonical != nil && meta.Canonical.String() != url.String() {
		// TODO: Should we check for and remove the non-canonical URL from the
		// database?
		log.Printf("Re-scheduling canonical URL: %s", meta.Canonical.String())
		c.Schedule(meta.Canonical)
		return nil
	}

	if follow {
		c.ScheduleLinks(url, node)
	}
	if !index {
		return nil
	}

	article, err := readability.FromDocument(node, url)
	if err != nil {
		return err
	}
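
	// Upsert the page record, building the full-text search vector from the
	// title, author, description, extracted text content, and hostname.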
	return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
		_, err := tx.ExecContext(ctx, `
			INSERT INTO page (
				domain_id,       -- $1
				last_index_date,
				weight,
				crawl_priority,
				crawl_delay,
				url,             -- $2
				checksum,        -- $3
				title,           -- $4
				author,          -- $5
				description,     -- $6
				excerpt,         -- $7
				fts_vector
				-- text_content  -- $8
				-- hostname      -- $9
			) VALUES (
				$1, now(), 0, 0, '0s'::interval,
				$2, $3, $4, $5, $6, $7,
				setweight(to_tsvector(coalesce($4, '')), 'A') ||
				setweight(to_tsvector(coalesce($5, '')), 'A') ||
				setweight(to_tsvector(coalesce($6, '')), 'A') ||
				setweight(to_tsvector(coalesce($8, '')), 'A') ||
				setweight(to_tsvector(coalesce($9, '')), 'D')
			)
			ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
				last_index_date = now(),
				checksum = $3,
				title = $4,
				author = $5,
				description = $6,
				excerpt = $7,
				-- TODO: Maybe move this to a sub-query
				fts_vector =
					setweight(to_tsvector(coalesce($4, '')), 'A') ||
					setweight(to_tsvector(coalesce($5, '')), 'A') ||
					setweight(to_tsvector(coalesce($6, '')), 'A') ||
					setweight(to_tsvector(coalesce($8, '')), 'A') ||
					setweight(to_tsvector(coalesce($9, '')), 'D');`,
			c.DomainID, url.String(), []byte{}, meta.Title, meta.Author,
			meta.Description, article.Excerpt, article.TextContent, url.Host)
		return err
	})
}

// checkResponse checks an HTTP response and reports whether the crawler
// should proceed with this URL.
func (c *Crawler) checkResponse(resp *http.Response, url *url.URL) bool {
	switch resp.StatusCode {
	case http.StatusOK:
		// no-op
	case http.StatusTooManyRequests:
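		// Retry-After may be either an HTTP date or a number of seconds;
		// if it is missing or unparseable, back off for one hour.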
		retryAfter := resp.Header.Get("Retry-After")
		if retryAfter == "" {
			retryAfter = "3600"
		}
		t, err := http.ParseTime(retryAfter)
		if err != nil {
			seconds, err := strconv.Atoi(retryAfter)
			if err != nil {
				seconds = 3600
			}
			c.RetryAfter = time.Duration(seconds) * time.Second
		} else {
			c.RetryAfter = time.Until(t)
		}
		log.Printf("HTTP 429, pausing for %s", c.RetryAfter.String())
		c.Schedule(url)
		return false
	default:
		log.Printf("Unexpected status code %d", resp.StatusCode)
		return false
	}
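
	// Only index documents that are explicitly served as text/html.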
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		return false
	}
	if mt, _, err := mime.ParseMediaType(contentType); err != nil {
		return false
	} else if mt != "text/html" {
		return false
	}
	return true
}
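
// collectMetadata walks the parsed HTML tree and records the document title,
// robots directives, author, description, and canonical link in meta.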
func collectMetadata(node *html.Node, meta *Metadata) {
	if node.Type == html.ElementNode && node.Data == "title" {
		title := collectText(node)
		meta.Title = &title
	}
	if node.Type == html.ElementNode && node.Data == "meta" {
		var (
			name    string
			content string
		)
		for _, attr := range node.Attr {
			if attr.Key == "name" {
				name = attr.Val
			}
			if attr.Key == "content" {
				content = attr.Val
			}
		}
		switch name {
		case "robots":
			meta.Robots = strings.Split(content, ",")
		case "author":
			meta.Author = &content
		case "description":
			meta.Description = &content
		}
	}
	if node.Type == html.ElementNode && node.Data == "link" {
		var (
			rel  string
			href *url.URL
		)
		for _, attr := range node.Attr {
			if attr.Key == "rel" {
				rel = attr.Val
			}
			if attr.Key == "href" {
				href, _ = url.Parse(attr.Val)
			}
		}
		switch rel {
		case "canonical":
			meta.Canonical = href
		}
	}

	for child := node.FirstChild; child != nil; child = child.NextSibling {
		collectMetadata(child, meta)
	}
}
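
// collectText returns the concatenated contents of all text nodes beneath
// node, such as the text of a <title> element.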
func collectText(node *html.Node) string {
	text := ""
	if node.Type == html.TextNode {
		text += node.Data
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		text += collectText(child)
	}
	return text
}