Store page size in the database

Implements: https://todo.sr.ht/~sircmpwn/searchhut/33
This commit is contained in:
Umar Getagazov 2022-07-11 20:03:27 +07:00 committed by Drew DeVault
parent 7f555e21f5
commit cbd3732deb
3 changed files with 51 additions and 33 deletions

View file

@@ -30,6 +30,15 @@ type Metadata struct {
JavaScript bool
}
// counterWriter is an io.Writer that discards its input while keeping a
// running total of the bytes written to it. Tee'd next to a hash writer
// it measures the size of a response body as the body is consumed.
type counterWriter struct {
	Length int
}

// Write adds len(p) to c.Length and reports the whole slice as written.
// It never returns an error.
func (c *counterWriter) Write(p []byte) (int, error) {
	n := len(p)
	c.Length += n
	return n, nil
}
func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
log.Println(url.String())
@@ -48,7 +57,8 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
}
defer resp.Body.Close()
hash := sha512.New()
reader := io.TeeReader(resp.Body, hash)
counter := counterWriter{}
reader := io.TeeReader(resp.Body, io.MultiWriter(&counter, hash))
if !c.checkResponse(resp, url) {
return nil
@@ -112,45 +122,48 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
last_index_date,
source,
url, -- $2
checksum, -- $3
title, -- $4
author, -- $5
description, -- $6
excerpt, -- $7
javascript, -- $8
page_size, -- $3
checksum, -- $4
title, -- $5
author, -- $6
description, -- $7
excerpt, -- $8
javascript, -- $9
fts_vector
-- hostname -- $9
-- domain_labels -- $10
-- text_content -- $11
-- hostname -- $10
-- domain_labels -- $11
-- text_content -- $12
) VALUES (
$1, now(), 'crawler',
$2, $3, $4, $5, $6, $7, $8,
setweight(to_tsvector(coalesce($4, '')), $12) ||
setweight(to_tsvector(coalesce($5, '')), $12) ||
setweight(to_tsvector(coalesce($6, '')), $12) ||
setweight(to_tsvector(coalesce($9, '')), $12) ||
setweight(to_tsvector(coalesce($10, '')), $12) ||
setweight(to_tsvector(coalesce($11, '')), 'D')
$2, $3, $4, $5, $6, $7, $8, $9,
setweight(to_tsvector(coalesce($5, '')), $13) ||
setweight(to_tsvector(coalesce($6, '')), $13) ||
setweight(to_tsvector(coalesce($7, '')), $13) ||
setweight(to_tsvector(coalesce($10, '')), $13) ||
setweight(to_tsvector(coalesce($11, '')), $13) ||
setweight(to_tsvector(coalesce($12, '')), 'D')
)
ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
last_index_date = now(),
checksum = $3,
title = $4,
author = $5,
description = $6,
excerpt = $7,
javascript = $8,
page_size = $3,
checksum = $4,
title = $5,
author = $6,
description = $7,
excerpt = $8,
javascript = $9,
-- TODO: Maybe move this to a sub-query
fts_vector =
setweight(to_tsvector(coalesce($4, '')), $12) ||
setweight(to_tsvector(coalesce($5, '')), $12) ||
setweight(to_tsvector(coalesce($6, '')), $12) ||
setweight(to_tsvector(coalesce($9, '')), $12) ||
setweight(to_tsvector(coalesce($10, '')), $12) ||
setweight(to_tsvector(coalesce($11, '')), 'D');`,
c.DomainID, url.String(), hash.Sum([]byte{}), meta.Title,
meta.Author, meta.Description, article.Excerpt, meta.JavaScript,
url.Host, c.labels, article.TextContent, weights[weight])
setweight(to_tsvector(coalesce($5, '')), $13) ||
setweight(to_tsvector(coalesce($6, '')), $13) ||
setweight(to_tsvector(coalesce($7, '')), $13) ||
setweight(to_tsvector(coalesce($10, '')), $13) ||
setweight(to_tsvector(coalesce($11, '')), $13) ||
setweight(to_tsvector(coalesce($12, '')), 'D');`,
c.DomainID, url.String(), counter.Length, hash.Sum([]byte{}),
meta.Title, meta.Author, meta.Description, article.Excerpt,
meta.JavaScript, url.Host, c.labels, article.TextContent,
weights[weight])
return err
})
}

View file

@@ -88,6 +88,7 @@ for _, elem in parser:
last_index_date,
source,
url,
page_size,
checksum,
title,
excerpt,
@@ -98,13 +99,15 @@ for _, elem in parser:
) VALUES (
%(domain_id)s,
now(), 'import/mediawiki',
%(url)s, %(checksum)s, %(title)s, %(excerpt)s, false,
%(url)s, %(page_size)s, %(checksum)s,
%(title)s, %(excerpt)s, false,
setweight(to_tsvector(%(title)s), 'A') ||
setweight(to_tsvector('en.wikipedia.org'), 'A') ||
setweight(to_tsvector(%(content)s), 'D')
)
ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
last_index_date = now(),
page_size = %(page_size)s,
checksum = %(checksum)s,
title = %(title)s,
excerpt = %(excerpt)s,
@@ -116,6 +119,7 @@ for _, elem in parser:
""", {
"domain_id": domain_id,
"url": page.url,
"page_size": len(page.content),
"checksum": checksum,
"title": page.title,
"content": content,

View file

@@ -13,6 +13,7 @@ CREATE TABLE page (
domain_id integer NOT NULL REFERENCES domain(id),
source varchar NOT NULL,
url text NOT NULL UNIQUE,
page_size integer NOT NULL,
checksum bytea NOT NULL UNIQUE,
last_index_date timestamptz NOT NULL,
fts_vector tsvector NOT NULL,