2971603710
Before, only the hostname (say, harelang.org) was indexed, and no results appeared for a "harelang" query. Now, all domain labels (minus the eTLD) are indexed separately (for example, "docs" and "harelang" for "docs.harelang.org"), and such queries work. eTLD is removed using the data from Mozilla's Public Suffix List (https://publicsuffix.org).
258 lines
5.6 KiB
Go
258 lines
5.6 KiB
Go
package crawler
|
|
|
|
import (
|
|
"context"
|
|
"crypto/sha512"
|
|
"database/sql"
|
|
"io"
|
|
"log"
|
|
"mime"
|
|
"net/http"
|
|
"net/url"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/go-shiori/go-readability"
|
|
"golang.org/x/net/html"
|
|
|
|
"git.sr.ht/~sircmpwn/searchhut/database"
|
|
)
|
|
|
|
// Metadata holds the document-level information that collectMetadata
// gathers while walking a parsed HTML tree.
type Metadata struct {
	// Title is the text content of the first <title> element encountered,
	// or nil if the document has none.
	Title *string
	// Robots holds the comma-separated pieces of the content attribute of a
	// <meta name="robots"> element.
	Robots []string
	// Author is the content of <meta name="author">, or nil if absent.
	Author *string
	// Description is the content of <meta name="description">, or nil if
	// absent.
	Description *string
	// Canonical is the parsed href of <link rel="canonical">. It is nil when
	// no such link was seen or when its href could not be parsed.
	Canonical *url.URL
	// JavaScript is true if the document contains at least one <script>
	// element.
	JavaScript bool
}
|
|
|
|
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
|
|
log.Println(url.String())
|
|
|
|
resp, err := c.Head(ctx, url)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer resp.Body.Close()
|
|
if !c.checkResponse(resp, url) {
|
|
return nil
|
|
}
|
|
|
|
resp, err = c.Get(ctx, url)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer resp.Body.Close()
|
|
hash := sha512.New()
|
|
reader := io.TeeReader(resp.Body, hash)
|
|
|
|
if !c.checkResponse(resp, url) {
|
|
return nil
|
|
}
|
|
|
|
node, err := html.Parse(reader)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var (
|
|
index bool = true
|
|
follow bool = true
|
|
)
|
|
|
|
var meta Metadata
|
|
collectMetadata(node, &meta)
|
|
for _, item := range meta.Robots {
|
|
switch item {
|
|
case "none":
|
|
index = false
|
|
follow = false
|
|
case "noindex":
|
|
index = false
|
|
case "nofollow":
|
|
follow = false
|
|
}
|
|
}
|
|
if meta.Canonical != nil && meta.Canonical.String() != url.String() {
|
|
// TODO: Should we check for and remove the non-canonical URL from the
|
|
// database?
|
|
log.Printf("Re-scheduling canonical URL: %s", meta.Canonical.String())
|
|
c.Schedule(meta.Canonical)
|
|
return nil
|
|
}
|
|
|
|
if follow {
|
|
c.ScheduleLinks(url, node)
|
|
}
|
|
if !index {
|
|
return nil
|
|
}
|
|
|
|
article, err := readability.FromDocument(node, url)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
|
|
_, err := tx.ExecContext(ctx, `
|
|
INSERT INTO page (
|
|
domain_id, -- $1
|
|
last_index_date,
|
|
source,
|
|
weight,
|
|
crawl_priority,
|
|
crawl_delay,
|
|
url, -- $2
|
|
checksum, -- $3
|
|
title, -- $4
|
|
author, -- $5
|
|
description, -- $6
|
|
excerpt, -- $7
|
|
javascript, -- $8
|
|
fts_vector
|
|
-- hostname -- $9
|
|
-- domain_labels -- $10
|
|
-- text_content -- $11
|
|
) VALUES (
|
|
$1, now(), 'crawler', 0, 0, '0s'::interval,
|
|
$2, $3, $4, $5, $6, $7, $8,
|
|
setweight(to_tsvector(coalesce($4, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($5, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($6, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($9, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($10, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($11, '')), 'D')
|
|
)
|
|
ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
|
|
last_index_date = now(),
|
|
checksum = $3,
|
|
title = $4,
|
|
author = $5,
|
|
description = $6,
|
|
excerpt = $7,
|
|
javascript = $8,
|
|
-- TODO: Maybe move this to a sub-query
|
|
fts_vector =
|
|
setweight(to_tsvector(coalesce($4, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($5, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($6, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($9, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($10, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($11, '')), 'D');`,
|
|
c.DomainID, url.String(), hash.Sum([]byte{}), meta.Title,
|
|
meta.Author, meta.Description, article.Excerpt, meta.JavaScript,
|
|
url.Host, c.labels, article.TextContent)
|
|
return err
|
|
})
|
|
}
|
|
|
|
// Checks an HTTP response and returns true if the crawler should proceed.
|
|
func (c *Crawler) checkResponse(resp *http.Response, url *url.URL) bool {
|
|
switch resp.StatusCode {
|
|
case http.StatusOK:
|
|
// no-op
|
|
case http.StatusTooManyRequests:
|
|
retryAfter := resp.Header.Get("Retry-After")
|
|
if retryAfter == "" {
|
|
retryAfter = "3600"
|
|
}
|
|
t, err := http.ParseTime(retryAfter)
|
|
if err != nil {
|
|
seconds, err := strconv.Atoi(retryAfter)
|
|
if err != nil {
|
|
seconds = 3600
|
|
}
|
|
c.RetryAfter = time.Duration(seconds) * time.Second
|
|
} else {
|
|
c.RetryAfter = t.Sub(time.Now())
|
|
}
|
|
log.Printf("HTTP 429, pausing for %s", c.RetryAfter.String())
|
|
c.Schedule(url)
|
|
return false
|
|
default:
|
|
log.Printf("Unexpected status code %d", resp.StatusCode)
|
|
return false
|
|
}
|
|
|
|
contentType := resp.Header.Get("Content-Type")
|
|
if contentType == "" {
|
|
return false
|
|
}
|
|
if mt, _, err := mime.ParseMediaType(contentType); err != nil {
|
|
return false
|
|
} else if mt != "text/html" {
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
func collectMetadata(node *html.Node, meta *Metadata) {
|
|
if node.Type == html.ElementNode && node.Data == "title" && meta.Title == nil {
|
|
title := collectText(node)
|
|
meta.Title = &title
|
|
}
|
|
|
|
if node.Type == html.ElementNode && node.Data == "meta" {
|
|
var (
|
|
name string
|
|
content string
|
|
)
|
|
for _, attr := range node.Attr {
|
|
if attr.Key == "name" {
|
|
name = attr.Val
|
|
}
|
|
if attr.Key == "content" {
|
|
content = attr.Val
|
|
}
|
|
}
|
|
switch name {
|
|
case "robots":
|
|
meta.Robots = strings.Split(content, ",")
|
|
case "author":
|
|
meta.Author = &content
|
|
case "description":
|
|
meta.Description = &content
|
|
}
|
|
}
|
|
|
|
if node.Type == html.ElementNode && node.Data == "link" {
|
|
var (
|
|
rel string
|
|
href *url.URL
|
|
)
|
|
for _, attr := range node.Attr {
|
|
if attr.Key == "rel" {
|
|
rel = attr.Val
|
|
}
|
|
if attr.Key == "href" {
|
|
href, _ = url.Parse(attr.Val)
|
|
}
|
|
}
|
|
switch rel {
|
|
case "canonical":
|
|
meta.Canonical = href
|
|
}
|
|
}
|
|
|
|
if node.Type == html.ElementNode && node.Data == "script" {
|
|
meta.JavaScript = true
|
|
}
|
|
|
|
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
|
collectMetadata(child, meta)
|
|
}
|
|
}
|
|
|
|
func collectText(node *html.Node) string {
|
|
text := ""
|
|
if node.Type == html.TextNode {
|
|
text += node.Data
|
|
}
|
|
for child := node.FirstChild; child != nil; child = child.NextSibling {
|
|
text += collectText(child)
|
|
}
|
|
return text
|
|
}
|