searchhut/crawler/index.go

package crawler

import (
	"context"
	"database/sql"
	"log"
	"net/url"

	"git.sr.ht/~sircmpwn/searchhut/database"
	"github.com/go-shiori/go-readability"
)
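
// Index fetches the page at url, extracts its readable content with
// go-readability, schedules any discovered links for crawling, and records
// the result in the page table.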
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
	log.Println(url.String())

	resp, err := c.Get(ctx, url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	article, err := readability.FromReader(resp.Body, url)
	if err != nil {
		return err
	}

	if article.Node != nil {
		c.ScheduleLinks(url, article.Node)
	}
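
	// Upsert the page row, rebuilding the full-text search vector from the
	// page title, byline, and extracted text.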
	return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
		// TODO: add more stuff
		_, err := tx.ExecContext(ctx, `
			INSERT INTO page (
				domain_id,
				last_index_date,
				weight,
				crawl_priority,
				crawl_delay,
				url,
				checksum,
				title,
				author,
				excerpt,
				fts_vector
			) VALUES (
				$1, now(), 0, 0, '0s'::interval,
				$2, $3, $4, $5, $6,
				setweight(to_tsvector(coalesce($4, '')), 'A') ||
				setweight(to_tsvector(coalesce($5, '')), 'A') ||
				setweight(to_tsvector(coalesce($7, '')), 'B')
			)
			ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
				last_index_date = now(),
				checksum = $3,
				title = $4,
				author = $5,
				excerpt = $6,
				-- TODO: Maybe move this to a sub-query
				fts_vector =
					setweight(to_tsvector(coalesce($4, '')), 'A') ||
					setweight(to_tsvector(coalesce($5, '')), 'A') ||
					setweight(to_tsvector(coalesce($7, '')), 'B');`,
			c.DomainID, url.String(), []byte{}, article.Title,
			article.Byline, article.Excerpt, article.TextContent)
		return err
	})
}