Rank authoritative websites and index pages higher

Implements: https://todo.sr.ht/~sircmpwn/searchhut/23
This commit is contained in:
Umar Getagazov 2022-07-11 19:59:35 +07:00 committed by Drew DeVault
parent 72649f0f0e
commit a7e6fba60f
2 changed files with 42 additions and 29 deletions

View file

@@ -20,15 +20,16 @@ import (
)
type Crawler struct {
Client *http.Client
Domain string
DomainID int
Exclude []*regexp.Regexp
Delay time.Duration
RetryAfter time.Duration
Robots *robotstxt.Group
UserAgent string
Start time.Time
Client *http.Client
Domain string
DomainID int
Authoritative bool
Exclude []*regexp.Regexp
Delay time.Duration
RetryAfter time.Duration
Robots *robotstxt.Group
UserAgent string
Start time.Time
db *sql.DB
seen map[string]struct{}
@@ -38,12 +39,13 @@ type Crawler struct {
func NewCrawler(ua string, db *sql.DB, domain string) *Crawler {
var (
domainID int
exclPatsStr pq.StringArray
exclPats = []*regexp.Regexp{}
domainID int
authoritative bool
exclPatsStr pq.StringArray
exclPats = []*regexp.Regexp{}
)
row := db.QueryRow(`SELECT id, exclusion_patterns FROM domain WHERE hostname = $1`, domain)
if err := row.Scan(&domainID, &exclPatsStr); err != nil {
row := db.QueryRow(`SELECT id, authoritative, exclusion_patterns FROM domain WHERE hostname = $1`, domain)
if err := row.Scan(&domainID, &authoritative, &exclPatsStr); err != nil {
log.Fatal(err)
}
@@ -60,10 +62,11 @@ func NewCrawler(ua string, db *sql.DB, domain string) *Crawler {
}
return &Crawler{
Client: client,
Domain: domain,
DomainID: domainID,
Exclude: exclPats,
Client: client,
Domain: domain,
DomainID: domainID,
Authoritative: authoritative,
Exclude: exclPats,
// TODO: Dynamic crawl delay based on remote performance
Delay: 5 * time.Second,
Robots: nil,

View file

@@ -19,6 +19,8 @@ import (
"git.sr.ht/~sircmpwn/searchhut/database"
)
var weights = [...]string{"D", "C", "B", "A"}
type Metadata struct {
Title *string
Robots []string
@@ -95,6 +97,14 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
return err
}
weight := 1
if c.Authoritative {
weight += 1
}
if url.Path == "" || url.Path == "/" {
weight += 1
}
return database.WithTx(ctx, nil, func(tx *sql.Tx) error {
_, err := tx.ExecContext(ctx, `
INSERT INTO page (
@@ -118,11 +128,11 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
) VALUES (
$1, now(), 'crawler', 0, 0, '0s'::interval,
$2, $3, $4, $5, $6, $7, $8,
setweight(to_tsvector(coalesce($4, '')), 'A') ||
setweight(to_tsvector(coalesce($5, '')), 'A') ||
setweight(to_tsvector(coalesce($6, '')), 'A') ||
setweight(to_tsvector(coalesce($9, '')), 'A') ||
setweight(to_tsvector(coalesce($10, '')), 'A') ||
setweight(to_tsvector(coalesce($4, '')), $12) ||
setweight(to_tsvector(coalesce($5, '')), $12) ||
setweight(to_tsvector(coalesce($6, '')), $12) ||
setweight(to_tsvector(coalesce($9, '')), $12) ||
setweight(to_tsvector(coalesce($10, '')), $12) ||
setweight(to_tsvector(coalesce($11, '')), 'D')
)
ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
@@ -135,15 +145,15 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
javascript = $8,
-- TODO: Maybe move this to a sub-query
fts_vector =
setweight(to_tsvector(coalesce($4, '')), 'A') ||
setweight(to_tsvector(coalesce($5, '')), 'A') ||
setweight(to_tsvector(coalesce($6, '')), 'A') ||
setweight(to_tsvector(coalesce($9, '')), 'A') ||
setweight(to_tsvector(coalesce($10, '')), 'A') ||
setweight(to_tsvector(coalesce($4, '')), $12) ||
setweight(to_tsvector(coalesce($5, '')), $12) ||
setweight(to_tsvector(coalesce($6, '')), $12) ||
setweight(to_tsvector(coalesce($9, '')), $12) ||
setweight(to_tsvector(coalesce($10, '')), $12) ||
setweight(to_tsvector(coalesce($11, '')), 'D');`,
c.DomainID, url.String(), hash.Sum([]byte{}), meta.Title,
meta.Author, meta.Description, article.Excerpt, meta.JavaScript,
url.Host, c.labels, article.TextContent)
url.Host, c.labels, article.TextContent, weights[weight])
return err
})
}