import/*: fix page_size issues

This commit is contained in:
Drew DeVault 2022-07-13 10:24:26 +02:00
parent 69a9e20a0a
commit 9473f3b49b
2 changed files with 6 additions and 1 deletions

View file

@@ -43,6 +43,7 @@ for _, elem in parser:
sha.update(page.text.encode()) sha.update(page.text.encode())
checksum = sha.digest() checksum = sha.digest()
page_size = len(page.text)
excerpt = page.text.split("\n")[0] excerpt = page.text.split("\n")[0]
# XXX: Could be better for non-English languages # XXX: Could be better for non-English languages
if len(excerpt) > 512: if len(excerpt) > 512:
@@ -61,6 +62,7 @@ for _, elem in parser:
title, title,
excerpt, excerpt,
javascript, javascript,
page_size,
fts_vector fts_vector
-- text_content -- text_content
-- hostname -- hostname
@@ -68,6 +70,7 @@ for _, elem in parser:
%(domain_id)s, %(domain_id)s,
now(), 'import/cve', now(), 'import/cve',
%(url)s, %(checksum)s, %(title)s, %(excerpt)s, true, %(url)s, %(checksum)s, %(title)s, %(excerpt)s, true,
%(page_size)s,
setweight(to_tsvector(%(title)s), 'A') || setweight(to_tsvector(%(title)s), 'A') ||
setweight(to_tsvector('cve.org'), 'A') || setweight(to_tsvector('cve.org'), 'A') ||
setweight(to_tsvector(%(content)s), 'D') setweight(to_tsvector(%(content)s), 'D')
@@ -77,6 +80,7 @@ for _, elem in parser:
checksum = %(checksum)s, checksum = %(checksum)s,
title = %(title)s, title = %(title)s,
excerpt = %(excerpt)s, excerpt = %(excerpt)s,
page_size = %(page_size)s,
-- TODO: Maybe move this to a sub-query -- TODO: Maybe move this to a sub-query
fts_vector = fts_vector =
setweight(to_tsvector(%(title)s), 'A') || setweight(to_tsvector(%(title)s), 'A') ||
@@ -89,6 +93,7 @@ for _, elem in parser:
"title": page.name, "title": page.name,
"content": page.text, "content": page.text,
"excerpt": excerpt, "excerpt": excerpt,
"page_size": page_size,
}) })
conn.commit() conn.commit()
except psycopg2.errors.UniqueViolation: except psycopg2.errors.UniqueViolation:

View file

@@ -119,7 +119,7 @@ for _, elem in parser:
""", { """, {
"domain_id": domain_id, "domain_id": domain_id,
"url": page.url, "url": page.url,
"page_size": len(page.content), "page_size": len(page.markup),
"checksum": checksum, "checksum": checksum,
"title": page.title, "title": page.title,
"content": content, "content": content,