searchhut/crawler/index.go
commit 2971603710 by Umar Getagazov, 2022-07-11 17:48:46 +02:00

Put domain labels minus eTLD into the text index

Before, only the hostname (say, harelang.org) was indexed, and no
results appeared for a "harelang" query. Now, all domain labels (minus
the eTLD) are indexed separately (for example, "docs" and "harelang"
for "docs.harelang.org"), and such queries work. The eTLD is removed
using data from Mozilla's Public Suffix List (https://publicsuffix.org).
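
This file consumes the precomputed labels (c.labels) rather than
deriving them itself. As a rough sketch of the derivation, assuming
golang.org/x/net/publicsuffix (which embeds Mozilla's list; the helper
below is illustrative, not the crawler's actual code):

	package main

	import (
		"fmt"
		"strings"

		"golang.org/x/net/publicsuffix"
	)

	// domainLabels returns host's labels with the public suffix (eTLD)
	// removed, e.g. ["docs", "harelang"] for "docs.harelang.org".
	func domainLabels(host string) []string {
		suffix, _ := publicsuffix.PublicSuffix(host)
		if host == suffix {
			// The host is itself a public suffix; nothing to index.
			return nil
		}
		return strings.Split(strings.TrimSuffix(host, "."+suffix), ".")
	}

	func main() {
		fmt.Println(domainLabels("docs.harelang.org")) // [docs harelang]
	}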

package crawler

import (
	"context"
	"crypto/sha512"
	"database/sql"
	"io"
	"log"
	"mime"
	"net/http"
	"net/url"
	"strconv"
	"strings"
	"time"

	"github.com/go-shiori/go-readability"
	"golang.org/x/net/html"

	"git.sr.ht/~sircmpwn/searchhut/database"
)
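
// Metadata holds the indexing-relevant details collected from a page's
// markup.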
type Metadata struct {
	Title       *string
	Robots      []string
	Author      *string
	Description *string
	Canonical   *url.URL
	JavaScript  bool
}
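
// Index fetches the page at url, extracts its text and metadata, and
// upserts the result into the page table, honoring robots directives
// and canonical links along the way.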
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
	log.Println(url.String())

	// Probe with HEAD first so that non-HTML and rate-limited URLs can
	// be skipped without downloading the body.
	resp, err := c.Head(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if !c.checkResponse(resp, url) {
		return nil
	}

	resp, err = c.Get(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
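
	// Tee the body through a SHA-512 hash so that the checksum and the
	// parsed document come from a single read of the response.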
	hash := sha512.New()
	reader := io.TeeReader(resp.Body, hash)
	if !c.checkResponse(resp, url) {
		return nil
	}

	node, err := html.Parse(reader)
	if err != nil {
		return err
	}

	var (
		index  bool = true
		follow bool = true
	)
	var meta Metadata
	collectMetadata(node, &meta)
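
	// Honor robots meta directives; values such as "noindex, nofollow"
	// carry whitespace after the comma, hence the TrimSpace below.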
	for _, item := range meta.Robots {
		switch strings.TrimSpace(item) {
		case "none":
			index = false
			follow = false
		case "noindex":
			index = false
		case "nofollow":
			follow = false
		}
	}
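
	// Prefer the canonical URL if the page names a different one.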
	if meta.Canonical != nil && meta.Canonical.String() != url.String() {
		// TODO: Should we check for and remove the non-canonical URL
		// from the database?
		log.Printf("Re-scheduling canonical URL: %s", meta.Canonical.String())
		c.Schedule(meta.Canonical)
		return nil
	}
	if follow {
		c.ScheduleLinks(url, node)
	}
	if !index {
		return nil
	}

	article, err := readability.FromDocument(node, url)
	if err != nil {
		return err
	}
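
	// Upsert the page. The search vector weighs the title, author,
	// description, hostname, and domain labels (minus eTLD) at 'A', and
	// the readability-extracted text at 'D'.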
	return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
		_, err := tx.ExecContext(ctx, `
			INSERT INTO page (
				domain_id, -- $1
				last_index_date,
				source,
				weight,
				crawl_priority,
				crawl_delay,
				url, -- $2
				checksum, -- $3
				title, -- $4
				author, -- $5
				description, -- $6
				excerpt, -- $7
				javascript, -- $8
				fts_vector
				-- hostname -- $9
				-- domain_labels -- $10
				-- text_content -- $11
			) VALUES (
				$1, now(), 'crawler', 0, 0, '0s'::interval,
				$2, $3, $4, $5, $6, $7, $8,
				setweight(to_tsvector(coalesce($4, '')), 'A') ||
				setweight(to_tsvector(coalesce($5, '')), 'A') ||
				setweight(to_tsvector(coalesce($6, '')), 'A') ||
				setweight(to_tsvector(coalesce($9, '')), 'A') ||
				setweight(to_tsvector(coalesce($10, '')), 'A') ||
				setweight(to_tsvector(coalesce($11, '')), 'D')
			)
			ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
				last_index_date = now(),
				checksum = $3,
				title = $4,
				author = $5,
				description = $6,
				excerpt = $7,
				javascript = $8,
				-- TODO: Maybe move this to a sub-query
				fts_vector =
					setweight(to_tsvector(coalesce($4, '')), 'A') ||
					setweight(to_tsvector(coalesce($5, '')), 'A') ||
					setweight(to_tsvector(coalesce($6, '')), 'A') ||
					setweight(to_tsvector(coalesce($9, '')), 'A') ||
					setweight(to_tsvector(coalesce($10, '')), 'A') ||
					setweight(to_tsvector(coalesce($11, '')), 'D');`,
			c.DomainID, url.String(), hash.Sum(nil), meta.Title,
			meta.Author, meta.Description, article.Excerpt, meta.JavaScript,
			url.Host, c.labels, article.TextContent)
		return err
	})
}
// checkResponse reports whether the crawler should proceed with this
// response, backing off on HTTP 429 and skipping non-HTML content.
func (c *Crawler) checkResponse(resp *http.Response, url *url.URL) bool {
	switch resp.StatusCode {
	case http.StatusOK:
		// no-op
	case http.StatusTooManyRequests:
		// Retry-After may be either an HTTP date or a number of seconds;
		// fall back to an hour if it is absent or unparseable.
		retryAfter := resp.Header.Get("Retry-After")
		if retryAfter == "" {
			retryAfter = "3600"
		}
		t, err := http.ParseTime(retryAfter)
		if err != nil {
			seconds, err := strconv.Atoi(retryAfter)
			if err != nil {
				seconds = 3600
			}
			c.RetryAfter = time.Duration(seconds) * time.Second
		} else {
			c.RetryAfter = time.Until(t)
		}
		log.Printf("HTTP 429, pausing for %s", c.RetryAfter.String())
		c.Schedule(url)
		return false
	default:
		log.Printf("Unexpected status code %d", resp.StatusCode)
		return false
	}

	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		return false
	}
	if mt, _, err := mime.ParseMediaType(contentType); err != nil {
		return false
	} else if mt != "text/html" {
		return false
	}
	return true
}
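
// collectMetadata walks the parsed document and records the title,
// robots directives, author, description, canonical link, and whether
// the page contains any <script> elements.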
func collectMetadata(node *html.Node, meta *Metadata) {
	if node.Type == html.ElementNode && node.Data == "title" && meta.Title == nil {
		title := collectText(node)
		meta.Title = &title
	}
	if node.Type == html.ElementNode && node.Data == "meta" {
		var (
			name    string
			content string
		)
		for _, attr := range node.Attr {
			if attr.Key == "name" {
				name = attr.Val
			}
			if attr.Key == "content" {
				content = attr.Val
			}
		}
		switch name {
		case "robots":
			meta.Robots = strings.Split(content, ",")
		case "author":
			meta.Author = &content
		case "description":
			meta.Description = &content
		}
	}
	if node.Type == html.ElementNode && node.Data == "link" {
		var (
			rel  string
			href *url.URL
		)
		for _, attr := range node.Attr {
			if attr.Key == "rel" {
				rel = attr.Val
			}
			if attr.Key == "href" {
				href, _ = url.Parse(attr.Val)
			}
		}
		switch rel {
		case "canonical":
			meta.Canonical = href
		}
	}
	if node.Type == html.ElementNode && node.Data == "script" {
		meta.JavaScript = true
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		collectMetadata(child, meta)
	}
}
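
// collectText returns the concatenated text of all text nodes beneath
// node.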
func collectText(node *html.Node) string {
	text := ""
	if node.Type == html.TextNode {
		text += node.Data
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		text += collectText(child)
	}
	return text
}