65 lines
1.5 KiB
Go
65 lines
1.5 KiB
Go
package crawler
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"log"
|
|
"net/url"
|
|
|
|
"github.com/go-shiori/go-readability"
|
|
|
|
"git.sr.ht/~sircmpwn/searchhut/database"
|
|
)
|
|
|
|
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
|
|
log.Println(url.String())
|
|
|
|
resp, err := c.Get(ctx, url)
|
|
defer resp.Body.Close()
|
|
article, err := readability.FromReader(resp.Body, url)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if article.Node != nil {
|
|
c.ScheduleLinks(url, article.Node)
|
|
}
|
|
|
|
return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
|
|
// TODO: add more stuff
|
|
_, err := tx.ExecContext(ctx, `
|
|
INSERT INTO page (
|
|
domain_id,
|
|
last_index_date,
|
|
weight,
|
|
crawl_priority,
|
|
crawl_delay,
|
|
url,
|
|
checksum,
|
|
title,
|
|
author,
|
|
excerpt,
|
|
fts_vector
|
|
) VALUES (
|
|
$1, now(), 0, 0, '0s'::interval,
|
|
$2, $3, $4, $5, $6,
|
|
setweight(to_tsvector(coalesce($4, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($5, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($7, '')), 'B')
|
|
)
|
|
ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
|
|
last_index_date = now(),
|
|
checksum = $3,
|
|
title = $4,
|
|
author = $5,
|
|
excerpt = $6,
|
|
-- TODO: Maybe move this to a sub-query
|
|
fts_vector =
|
|
setweight(to_tsvector(coalesce($4, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($5, '')), 'A') ||
|
|
setweight(to_tsvector(coalesce($7, '')), 'B');`,
|
|
c.DomainID, url.String(), []byte{}, article.Title,
|
|
article.Byline, article.Excerpt, article.TextContent)
|
|
return err
|
|
})
|
|
}
|