searchhut/crawler/index.go

88 lines
1.9 KiB
Go
Raw Normal View History

2022-07-08 19:46:11 +02:00
package crawler
import (
"context"
"database/sql"
"log"
"mime"
2022-07-08 19:46:11 +02:00
"net/url"
"github.com/go-shiori/go-readability"
"golang.org/x/net/html"
2022-07-08 19:46:11 +02:00
"git.sr.ht/~sircmpwn/searchhut/database"
)
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
log.Println(url.String())
resp, err := c.Get(ctx, url)
2022-07-08 20:30:09 +02:00
if err != nil {
return err
}
2022-07-08 19:46:11 +02:00
defer resp.Body.Close()
2022-07-08 20:30:09 +02:00
if resp.StatusCode != 200 {
log.Printf("Unexpected status code %d", resp.StatusCode)
return nil
}
contentType := resp.Header.Get("Content-Type")
if contentType == "" {
return nil
}
if mt, _, err := mime.ParseMediaType(contentType); err != nil {
return nil
} else if mt != "text/html" {
return nil
}
node, err := html.Parse(resp.Body)
2022-07-08 19:46:11 +02:00
if err != nil {
return err
}
c.ScheduleLinks(url, node)
2022-07-08 19:46:11 +02:00
article, err := readability.FromDocument(node, url)
if err != nil {
return err
2022-07-08 19:46:11 +02:00
}
return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
// TODO: add more stuff
_, err := tx.ExecContext(ctx, `
INSERT INTO page (
domain_id,
last_index_date,
weight,
crawl_priority,
crawl_delay,
url,
checksum,
title,
author,
excerpt,
fts_vector
) VALUES (
$1, now(), 0, 0, '0s'::interval,
$2, $3, $4, $5, $6,
setweight(to_tsvector(coalesce($4, '')), 'A') ||
setweight(to_tsvector(coalesce($5, '')), 'A') ||
setweight(to_tsvector(coalesce($7, '')), 'B')
)
ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
last_index_date = now(),
checksum = $3,
title = $4,
author = $5,
excerpt = $6,
-- TODO: Maybe move this to a sub-query
fts_vector =
setweight(to_tsvector(coalesce($4, '')), 'A') ||
setweight(to_tsvector(coalesce($5, '')), 'A') ||
setweight(to_tsvector(coalesce($7, '')), 'B');`,
c.DomainID, url.String(), []byte{}, article.Title,
article.Byline, article.Excerpt, article.TextContent)
return err
})
}