2022-07-08 19:46:11 +02:00
|
|
|
package crawler
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
2022-07-10 09:36:07 +02:00
|
|
|
"crypto/sha512"
|
2022-07-08 19:46:11 +02:00
|
|
|
"database/sql"
|
2022-07-10 09:36:07 +02:00
|
|
|
"io"
|
2022-07-08 19:46:11 +02:00
|
|
|
"log"
|
2022-07-08 20:13:32 +02:00
|
|
|
"mime"
|
2022-07-09 19:13:46 +02:00
|
|
|
"net/http"
|
2022-07-08 19:46:11 +02:00
|
|
|
"net/url"
|
2022-07-09 19:13:46 +02:00
|
|
|
"strconv"
|
2022-07-09 18:57:39 +02:00
|
|
|
"strings"
|
2022-07-09 19:13:46 +02:00
|
|
|
"time"
|
2022-07-08 19:46:11 +02:00
|
|
|
|
|
|
|
"github.com/go-shiori/go-readability"
|
2022-07-08 20:13:32 +02:00
|
|
|
"golang.org/x/net/html"
|
2022-07-08 19:46:11 +02:00
|
|
|
|
|
|
|
"git.sr.ht/~sircmpwn/searchhut/database"
|
|
|
|
)
|
|
|
|
|
// weights maps the numeric page weight computed in Index (0 through 3)
// onto PostgreSQL text-search weight labels, from least important ("D")
// to most important ("A").
var weights = [4]string{"D", "C", "B", "A"}
|
|
|
|
|
// Metadata holds document-level information extracted from an HTML page
// by collectMetadata. Pointer fields are nil when the page does not
// provide the corresponding value.
type Metadata struct {
	// Title is the text of the first <title> element, if any.
	Title *string
	// Robots is the comma-separated directive list from
	// <meta name="robots">, split into individual tokens.
	Robots []string
	// Author is the content of <meta name="author">, if present.
	Author *string
	// Description is the content of <meta name="description">, if present.
	Description *string
	// Canonical is the parsed href of <link rel="canonical">, if present.
	Canonical *url.URL
	// JavaScript reports whether the page contains any <script> element.
	JavaScript bool
}
|
|
|
|
|
2022-07-08 19:46:11 +02:00
|
|
|
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
|
|
|
|
log.Println(url.String())
|
|
|
|
|
2022-07-09 18:59:23 +02:00
|
|
|
resp, err := c.Head(ctx, url)
|
2022-07-08 20:30:09 +02:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2022-07-08 19:46:11 +02:00
|
|
|
defer resp.Body.Close()
|
2022-07-09 19:13:46 +02:00
|
|
|
if !c.checkResponse(resp, url) {
|
2022-07-08 20:13:32 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-07-09 18:59:23 +02:00
|
|
|
resp, err = c.Get(ctx, url)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer resp.Body.Close()
|
2022-07-10 09:36:07 +02:00
|
|
|
hash := sha512.New()
|
|
|
|
reader := io.TeeReader(resp.Body, hash)
|
|
|
|
|
2022-07-09 19:13:46 +02:00
|
|
|
if !c.checkResponse(resp, url) {
|
2022-07-09 18:59:23 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2022-07-10 09:36:07 +02:00
|
|
|
node, err := html.Parse(reader)
|
2022-07-08 19:46:11 +02:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2022-07-09 18:57:39 +02:00
|
|
|
|
|
|
|
var (
|
|
|
|
index bool = true
|
|
|
|
follow bool = true
|
|
|
|
)
|
|
|
|
|
|
|
|
var meta Metadata
|
|
|
|
collectMetadata(node, &meta)
|
|
|
|
for _, item := range meta.Robots {
|
|
|
|
switch item {
|
|
|
|
case "none":
|
|
|
|
index = false
|
|
|
|
follow = false
|
|
|
|
case "noindex":
|
|
|
|
index = false
|
|
|
|
case "nofollow":
|
|
|
|
follow = false
|
|
|
|
}
|
|
|
|
}
|
2022-07-09 19:06:28 +02:00
|
|
|
if meta.Canonical != nil && meta.Canonical.String() != url.String() {
|
|
|
|
// TODO: Should we check for and remove the non-canonical URL from the
|
|
|
|
// database?
|
|
|
|
log.Printf("Re-scheduling canonical URL: %s", meta.Canonical.String())
|
|
|
|
c.Schedule(meta.Canonical)
|
|
|
|
return nil
|
|
|
|
}
|
2022-07-09 18:57:39 +02:00
|
|
|
|
|
|
|
if follow {
|
|
|
|
c.ScheduleLinks(url, node)
|
|
|
|
}
|
|
|
|
if !index {
|
|
|
|
return nil
|
|
|
|
}
|
2022-07-08 19:46:11 +02:00
|
|
|
|
2022-07-08 20:13:32 +02:00
|
|
|
article, err := readability.FromDocument(node, url)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
2022-07-08 19:46:11 +02:00
|
|
|
}
|
|
|
|
|
2022-07-11 14:59:35 +02:00
|
|
|
weight := 1
|
|
|
|
if c.Authoritative {
|
|
|
|
weight += 1
|
|
|
|
}
|
|
|
|
if url.Path == "" || url.Path == "/" {
|
|
|
|
weight += 1
|
|
|
|
}
|
|
|
|
|
2022-07-08 19:46:11 +02:00
|
|
|
return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
|
|
|
|
_, err := tx.ExecContext(ctx, `
|
|
|
|
INSERT INTO page (
|
2022-07-09 18:57:39 +02:00
|
|
|
domain_id, -- $1
|
2022-07-08 19:46:11 +02:00
|
|
|
last_index_date,
|
2022-07-10 10:13:11 +02:00
|
|
|
source,
|
2022-07-08 19:46:11 +02:00
|
|
|
weight,
|
|
|
|
crawl_priority,
|
|
|
|
crawl_delay,
|
2022-07-09 18:57:39 +02:00
|
|
|
url, -- $2
|
|
|
|
checksum, -- $3
|
|
|
|
title, -- $4
|
|
|
|
author, -- $5
|
2022-07-11 14:58:34 +02:00
|
|
|
description, -- $6
|
2022-07-09 18:57:39 +02:00
|
|
|
excerpt, -- $7
|
2022-07-10 09:07:37 +02:00
|
|
|
javascript, -- $8
|
2022-07-08 19:46:11 +02:00
|
|
|
fts_vector
|
2022-07-11 14:58:34 +02:00
|
|
|
-- hostname -- $9
|
|
|
|
-- domain_labels -- $10
|
|
|
|
-- text_content -- $11
|
2022-07-08 19:46:11 +02:00
|
|
|
) VALUES (
|
2022-07-10 10:13:11 +02:00
|
|
|
$1, now(), 'crawler', 0, 0, '0s'::interval,
|
2022-07-10 09:07:37 +02:00
|
|
|
$2, $3, $4, $5, $6, $7, $8,
|
2022-07-11 14:59:35 +02:00
|
|
|
setweight(to_tsvector(coalesce($4, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($5, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($6, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($9, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($10, '')), $12) ||
|
2022-07-11 14:58:34 +02:00
|
|
|
setweight(to_tsvector(coalesce($11, '')), 'D')
|
2022-07-08 19:46:11 +02:00
|
|
|
)
|
|
|
|
ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
|
|
|
|
last_index_date = now(),
|
|
|
|
checksum = $3,
|
|
|
|
title = $4,
|
|
|
|
author = $5,
|
2022-07-09 18:57:39 +02:00
|
|
|
description = $6,
|
|
|
|
excerpt = $7,
|
2022-07-10 09:07:37 +02:00
|
|
|
javascript = $8,
|
2022-07-08 19:46:11 +02:00
|
|
|
-- TODO: Maybe move this to a sub-query
|
|
|
|
fts_vector =
|
2022-07-11 14:59:35 +02:00
|
|
|
setweight(to_tsvector(coalesce($4, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($5, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($6, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($9, '')), $12) ||
|
|
|
|
setweight(to_tsvector(coalesce($10, '')), $12) ||
|
2022-07-11 14:58:34 +02:00
|
|
|
setweight(to_tsvector(coalesce($11, '')), 'D');`,
|
2022-07-10 09:36:07 +02:00
|
|
|
c.DomainID, url.String(), hash.Sum([]byte{}), meta.Title,
|
|
|
|
meta.Author, meta.Description, article.Excerpt, meta.JavaScript,
|
2022-07-11 14:59:35 +02:00
|
|
|
url.Host, c.labels, article.TextContent, weights[weight])
|
2022-07-08 19:46:11 +02:00
|
|
|
return err
|
|
|
|
})
|
|
|
|
}
|
2022-07-09 18:57:39 +02:00
|
|
|
|
2022-07-09 19:13:46 +02:00
|
|
|
// Checks an HTTP response and returns true if the crawler should proceed.
|
|
|
|
func (c *Crawler) checkResponse(resp *http.Response, url *url.URL) bool {
|
|
|
|
switch resp.StatusCode {
|
|
|
|
case http.StatusOK:
|
|
|
|
// no-op
|
|
|
|
case http.StatusTooManyRequests:
|
|
|
|
retryAfter := resp.Header.Get("Retry-After")
|
|
|
|
if retryAfter == "" {
|
|
|
|
retryAfter = "3600"
|
|
|
|
}
|
2022-07-09 19:16:48 +02:00
|
|
|
t, err := http.ParseTime(retryAfter)
|
2022-07-09 19:13:46 +02:00
|
|
|
if err != nil {
|
2022-07-09 19:16:48 +02:00
|
|
|
seconds, err := strconv.Atoi(retryAfter)
|
|
|
|
if err != nil {
|
|
|
|
seconds = 3600
|
|
|
|
}
|
|
|
|
c.RetryAfter = time.Duration(seconds) * time.Second
|
|
|
|
} else {
|
|
|
|
c.RetryAfter = t.Sub(time.Now())
|
2022-07-09 19:13:46 +02:00
|
|
|
}
|
2022-07-09 19:16:48 +02:00
|
|
|
log.Printf("HTTP 429, pausing for %s", c.RetryAfter.String())
|
2022-07-09 19:13:46 +02:00
|
|
|
c.Schedule(url)
|
|
|
|
return false
|
|
|
|
default:
|
|
|
|
log.Printf("Unexpected status code %d", resp.StatusCode)
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
contentType := resp.Header.Get("Content-Type")
|
|
|
|
if contentType == "" {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
if mt, _, err := mime.ParseMediaType(contentType); err != nil {
|
|
|
|
return false
|
|
|
|
} else if mt != "text/html" {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
|
2022-07-09 18:57:39 +02:00
|
|
|
func collectMetadata(node *html.Node, meta *Metadata) {
|
2022-07-10 09:07:37 +02:00
|
|
|
if node.Type == html.ElementNode && node.Data == "title" && meta.Title == nil {
|
2022-07-09 18:57:39 +02:00
|
|
|
title := collectText(node)
|
|
|
|
meta.Title = &title
|
|
|
|
}
|
2022-07-10 09:07:37 +02:00
|
|
|
|
2022-07-09 18:57:39 +02:00
|
|
|
if node.Type == html.ElementNode && node.Data == "meta" {
|
|
|
|
var (
|
|
|
|
name string
|
|
|
|
content string
|
|
|
|
)
|
|
|
|
for _, attr := range node.Attr {
|
|
|
|
if attr.Key == "name" {
|
|
|
|
name = attr.Val
|
|
|
|
}
|
|
|
|
if attr.Key == "content" {
|
|
|
|
content = attr.Val
|
|
|
|
}
|
|
|
|
}
|
|
|
|
switch name {
|
|
|
|
case "robots":
|
|
|
|
meta.Robots = strings.Split(content, ",")
|
|
|
|
case "author":
|
|
|
|
meta.Author = &content
|
|
|
|
case "description":
|
|
|
|
meta.Description = &content
|
|
|
|
}
|
|
|
|
}
|
2022-07-10 09:07:37 +02:00
|
|
|
|
2022-07-09 19:06:28 +02:00
|
|
|
if node.Type == html.ElementNode && node.Data == "link" {
|
|
|
|
var (
|
|
|
|
rel string
|
|
|
|
href *url.URL
|
|
|
|
)
|
|
|
|
for _, attr := range node.Attr {
|
|
|
|
if attr.Key == "rel" {
|
|
|
|
rel = attr.Val
|
|
|
|
}
|
|
|
|
if attr.Key == "href" {
|
|
|
|
href, _ = url.Parse(attr.Val)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
switch rel {
|
|
|
|
case "canonical":
|
|
|
|
meta.Canonical = href
|
|
|
|
}
|
|
|
|
}
|
2022-07-10 09:07:37 +02:00
|
|
|
|
|
|
|
if node.Type == html.ElementNode && node.Data == "script" {
|
|
|
|
meta.JavaScript = true
|
|
|
|
}
|
|
|
|
|
2022-07-09 18:57:39 +02:00
|
|
|
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
|
|
|
collectMetadata(child, meta)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func collectText(node *html.Node) string {
|
|
|
|
text := ""
|
|
|
|
if node.Type == html.TextNode {
|
|
|
|
text += node.Data
|
|
|
|
}
|
|
|
|
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
|
|
|
text += collectText(child)
|
|
|
|
}
|
|
|
|
return text
|
|
|
|
}
|