From cbd3732deba02c4c5e41c7fe6c2b7fd55dcfb61c Mon Sep 17 00:00:00 2001
From: Umar Getagazov
Date: Mon, 11 Jul 2022 20:03:27 +0700
Subject: [PATCH] Store page size in the database

Implements: https://todo.sr.ht/~sircmpwn/searchhut/33
---
 crawler/index.go         | 77 +++++++++++++++++++++++-----------------
 import/mediawiki/main.py |  6 +++-
 schema.sql               |  1 +
 3 files changed, 51 insertions(+), 33 deletions(-)

diff --git a/crawler/index.go b/crawler/index.go
index 2d7d05a..8520481 100644
--- a/crawler/index.go
+++ b/crawler/index.go
@@ -30,6 +30,15 @@ type Metadata struct {
 	JavaScript bool
 }
 
+type counterWriter struct {
+	Length int
+}
+
+func (c *counterWriter) Write(p []byte) (int, error) {
+	c.Length += len(p)
+	return len(p), nil
+}
+
 func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
 	log.Println(url.String())
 
@@ -48,7 +57,8 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
 	}
 	defer resp.Body.Close()
 	hash := sha512.New()
-	reader := io.TeeReader(resp.Body, hash)
+	counter := counterWriter{}
+	reader := io.TeeReader(resp.Body, io.MultiWriter(&counter, hash))
 
 	if !c.checkResponse(resp, url) {
 		return nil
@@ -112,45 +122,48 @@ func (c *Crawler) Index(ctx context.Context, url *url.URL) error {
 			last_index_date,
 			source,
 			url, -- $2
-			checksum, -- $3
-			title, -- $4
-			author, -- $5
-			description, -- $6
-			excerpt, -- $7
-			javascript, -- $8
+			page_size, -- $3
+			checksum, -- $4
+			title, -- $5
+			author, -- $6
+			description, -- $7
+			excerpt, -- $8
+			javascript, -- $9
 			fts_vector
-			-- hostname -- $9
-			-- domain_labels -- $10
-			-- text_content -- $11
+			-- hostname -- $10
+			-- domain_labels -- $11
+			-- text_content -- $12
 		) VALUES (
 			$1, now(), 'crawler',
-			$2, $3, $4, $5, $6, $7, $8,
-			setweight(to_tsvector(coalesce($4, '')), $12) ||
-			setweight(to_tsvector(coalesce($5, '')), $12) ||
-			setweight(to_tsvector(coalesce($6, '')), $12) ||
-			setweight(to_tsvector(coalesce($9, '')), $12) ||
-			setweight(to_tsvector(coalesce($10, '')), $12) ||
-			setweight(to_tsvector(coalesce($11, '')), 'D')
+			$2, $3, $4, $5, $6, $7, $8, $9,
+			setweight(to_tsvector(coalesce($5, '')), $13) ||
+			setweight(to_tsvector(coalesce($6, '')), $13) ||
+			setweight(to_tsvector(coalesce($7, '')), $13) ||
+			setweight(to_tsvector(coalesce($10, '')), $13) ||
+			setweight(to_tsvector(coalesce($11, '')), $13) ||
+			setweight(to_tsvector(coalesce($12, '')), 'D')
 		)
 		ON CONFLICT ON CONSTRAINT page_url_key
 		DO UPDATE SET
 			last_index_date = now(),
-			checksum = $3,
-			title = $4,
-			author = $5,
-			description = $6,
-			excerpt = $7,
-			javascript = $8,
+			page_size = $3,
+			checksum = $4,
+			title = $5,
+			author = $6,
+			description = $7,
+			excerpt = $8,
+			javascript = $9,
 			-- TODO: Maybe move this to a sub-query
 			fts_vector =
-			setweight(to_tsvector(coalesce($4, '')), $12) ||
-			setweight(to_tsvector(coalesce($5, '')), $12) ||
-			setweight(to_tsvector(coalesce($6, '')), $12) ||
-			setweight(to_tsvector(coalesce($9, '')), $12) ||
-			setweight(to_tsvector(coalesce($10, '')), $12) ||
-			setweight(to_tsvector(coalesce($11, '')), 'D');`,
-		c.DomainID, url.String(), hash.Sum([]byte{}), meta.Title,
-		meta.Author, meta.Description, article.Excerpt, meta.JavaScript,
-		url.Host, c.labels, article.TextContent, weights[weight])
+			setweight(to_tsvector(coalesce($5, '')), $13) ||
+			setweight(to_tsvector(coalesce($6, '')), $13) ||
+			setweight(to_tsvector(coalesce($7, '')), $13) ||
+			setweight(to_tsvector(coalesce($10, '')), $13) ||
+			setweight(to_tsvector(coalesce($11, '')), $13) ||
+			setweight(to_tsvector(coalesce($12, '')), 'D');`,
+		c.DomainID, url.String(), counter.Length, hash.Sum([]byte{}),
+		meta.Title, meta.Author, meta.Description, article.Excerpt,
+		meta.JavaScript, url.Host, c.labels, article.TextContent,
+		weights[weight])
 		return err
 	})
 }
diff --git a/import/mediawiki/main.py b/import/mediawiki/main.py
index 9c660bc..7984645 100644
--- a/import/mediawiki/main.py
+++ b/import/mediawiki/main.py
@@ -88,6 +88,7 @@ for _, elem in parser:
             last_index_date,
             source,
             url,
+            page_size,
             checksum,
             title,
             excerpt,
@@ -98,13 +99,15 @@ for _, elem in parser:
         ) VALUES (
             %(domain_id)s, now(), 'import/mediawiki',
-            %(url)s, %(checksum)s, %(title)s, %(excerpt)s, false,
+            %(url)s, %(page_size)s, %(checksum)s,
+            %(title)s, %(excerpt)s, false,
             setweight(to_tsvector(%(title)s), 'A') ||
             setweight(to_tsvector('en.wikipedia.org'), 'A') ||
             setweight(to_tsvector(%(content)s), 'D')
         )
         ON CONFLICT ON CONSTRAINT page_url_key
         DO UPDATE SET
             last_index_date = now(),
+            page_size = %(page_size)s,
             checksum = %(checksum)s,
             title = %(title)s,
             excerpt = %(excerpt)s,
@@ -116,6 +119,7 @@ for _, elem in parser:
     """, {
         "domain_id": domain_id,
         "url": page.url,
+        "page_size": len(page.content),
        "checksum": checksum,
         "title": page.title,
         "content": content,
diff --git a/schema.sql b/schema.sql
index bb1e423..cdcc9e1 100644
--- a/schema.sql
+++ b/schema.sql
@@ -13,6 +13,7 @@ CREATE TABLE page (
 	domain_id integer NOT NULL REFERENCES domain(id),
 	source varchar NOT NULL,
 	url text NOT NULL UNIQUE,
+	page_size integer NOT NULL,
 	checksum bytea NOT NULL UNIQUE,
 	last_index_date timestamptz NOT NULL,
 	fts_vector tsvector NOT NULL,