searchhut/crawler/index.go
2022-07-09 18:59:23 +02:00

180 lines
3.8 KiB
Go

package crawler
import (
"context"
"database/sql"
"log"
"mime"
"net/url"
"strings"
"github.com/go-shiori/go-readability"
"golang.org/x/net/html"
"git.sr.ht/~sircmpwn/searchhut/database"
)
// Metadata holds per-page information collected from an HTML document's
// <title> and <meta> tags by collectMetadata. Pointer fields are nil when
// the corresponding tag was absent from the document.
type Metadata struct {
// Title is the text content of the document's <title> element.
Title *string
// Robots holds the individual directives from <meta name="robots">,
// obtained by splitting its content attribute on commas.
Robots []string
// Author is the content of <meta name="author">, if present.
Author *string
// Description is the content of <meta name="description">, if present.
Description *string
}
// Index fetches the page at url, extracts its metadata and readable
// content, and upserts the result into the page table. A HEAD request is
// issued first so non-HTML resources are skipped without downloading
// their bodies. Non-200 responses and missing/unparsable content types
// are skipped silently (nil error); transport, parse, and database
// failures are returned to the caller. Robots meta directives are
// honored: "nofollow" suppresses link scheduling, "noindex" suppresses
// storage, and "none" suppresses both.
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
	log.Println(url.String())

	resp, err := c.Head(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		log.Printf("Unexpected status code %d", resp.StatusCode)
		return nil
	}

	// Only index HTML documents; anything else is skipped without error.
	contentType := resp.Header.Get("Content-Type")
	if contentType == "" {
		return nil
	}
	if mt, _, err := mime.ParseMediaType(contentType); err != nil {
		return nil
	} else if mt != "text/html" {
		return nil
	}

	resp, err = c.Get(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		log.Printf("Unexpected status code %d", resp.StatusCode)
		return nil
	}

	node, err := html.Parse(resp.Body)
	if err != nil {
		return err
	}

	var (
		index  bool = true
		follow bool = true
	)
	var meta Metadata
	collectMetadata(node, &meta)
	for _, item := range meta.Robots {
		switch item {
		case "none":
			index = false
			follow = false
		case "noindex":
			index = false
		case "nofollow":
			follow = false
		}
	}
	if follow {
		c.ScheduleLinks(url, node)
	}
	if !index {
		return nil
	}

	article, err := readability.FromDocument(node, url)
	if err != nil {
		return err
	}

	return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
		_, err := tx.ExecContext(ctx, `
		INSERT INTO page (
			domain_id, -- $1
			last_index_date,
			weight,
			crawl_priority,
			crawl_delay,
			url, -- $2
			checksum, -- $3
			title, -- $4
			author, -- $5
			description, -- $6
			excerpt, -- $7
			fts_vector
			-- text_content -- $8
			-- hostname -- $9
		) VALUES (
			$1, now(), 0, 0, '0s'::interval,
			$2, $3, $4, $5, $6, $7,
			setweight(to_tsvector(coalesce($4, '')), 'A') ||
			setweight(to_tsvector(coalesce($5, '')), 'A') ||
			setweight(to_tsvector(coalesce($6, '')), 'A') ||
			setweight(to_tsvector(coalesce($8, '')), 'A') ||
			setweight(to_tsvector(coalesce($9, '')), 'D')
		)
		ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
			last_index_date = now(),
			checksum = $3,
			title = $4,
			author = $5,
			description = $6,
			excerpt = $7,
			-- TODO: Maybe move this to a sub-query
			fts_vector =
				setweight(to_tsvector(coalesce($4, '')), 'A') ||
				setweight(to_tsvector(coalesce($5, '')), 'A') ||
				setweight(to_tsvector(coalesce($6, '')), 'A') ||
				setweight(to_tsvector(coalesce($8, '')), 'A') ||
				setweight(to_tsvector(coalesce($9, '')), 'D');`,
			// BUG FIX: $8 is text_content (tsvector weight A) and $9 is
			// hostname (weight D) per the query; the original passed
			// url.Host as $8 and article.TextContent as $9, inverting
			// both the stored columns and the search weighting.
			c.DomainID, url.String(), []byte{}, meta.Title, meta.Author,
			meta.Description, article.Excerpt, article.TextContent, url.Host)
		return err
	})
}
// collectMetadata walks the parsed HTML tree rooted at node and fills
// meta with the document's <title> text and the values of any
// <meta name="robots|author|description"> tags. When a tag appears more
// than once, the last occurrence in document order wins.
func collectMetadata(node *html.Node, meta *Metadata) {
	if node.Type == html.ElementNode && node.Data == "title" {
		title := collectText(node)
		meta.Title = &title
	}
	if node.Type == html.ElementNode && node.Data == "meta" {
		var (
			name    string
			content string
		)
		for _, attr := range node.Attr {
			if attr.Key == "name" {
				name = attr.Val
			}
			if attr.Key == "content" {
				content = attr.Val
			}
		}
		switch name {
		case "robots":
			// BUG FIX: robots directives are conventionally written with
			// whitespace and mixed case (e.g. "noindex, Nofollow"); a
			// plain comma split left items like " nofollow" that never
			// matched the directive switch in Index. Normalize each item.
			items := strings.Split(content, ",")
			for i, item := range items {
				items[i] = strings.ToLower(strings.TrimSpace(item))
			}
			meta.Robots = items
		case "author":
			meta.Author = &content
		case "description":
			meta.Description = &content
		}
	}
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		collectMetadata(child, meta)
	}
}
// collectText returns the concatenation of every text node in the
// subtree rooted at node, in document order.
func collectText(node *html.Node) string {
	var b strings.Builder
	var walk func(*html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.TextNode {
			b.WriteString(n.Data)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(node)
	return b.String()
}