import/*: fix page_size issues
This commit is contained in:
parent
69a9e20a0a
commit
9473f3b49b
2 changed files with 6 additions and 1 deletions
|
@@ -43,6 +43,7 @@ for _, elem in parser:
|
|||
sha.update(page.text.encode())
|
||||
checksum = sha.digest()
|
||||
|
||||
page_size = len(page.text)
|
||||
excerpt = page.text.split("\n")[0]
|
||||
# XXX: Could be better for non-English languages
|
||||
if len(excerpt) > 512:
|
||||
|
@@ -61,6 +62,7 @@ for _, elem in parser:
|
|||
title,
|
||||
excerpt,
|
||||
javascript,
|
||||
page_size,
|
||||
fts_vector
|
||||
-- text_content
|
||||
-- hostname
|
||||
|
@@ -68,6 +70,7 @@ for _, elem in parser:
|
|||
%(domain_id)s,
|
||||
now(), 'import/cve',
|
||||
%(url)s, %(checksum)s, %(title)s, %(excerpt)s, true,
|
||||
%(page_size)s,
|
||||
setweight(to_tsvector(%(title)s), 'A') ||
|
||||
setweight(to_tsvector('cve.org'), 'A') ||
|
||||
setweight(to_tsvector(%(content)s), 'D')
|
||||
|
@@ -77,6 +80,7 @@ for _, elem in parser:
|
|||
checksum = %(checksum)s,
|
||||
title = %(title)s,
|
||||
excerpt = %(excerpt)s,
|
||||
page_size = %(page_size)s,
|
||||
-- TODO: Maybe move this to a sub-query
|
||||
fts_vector =
|
||||
setweight(to_tsvector(%(title)s), 'A') ||
|
||||
|
@@ -89,6 +93,7 @@ for _, elem in parser:
|
|||
"title": page.name,
|
||||
"content": page.text,
|
||||
"excerpt": excerpt,
|
||||
"page_size": page_size,
|
||||
})
|
||||
conn.commit()
|
||||
except psycopg2.errors.UniqueViolation:
|
||||
|
|
|
@@ -119,7 +119,7 @@ for _, elem in parser:
|
|||
""", {
|
||||
"domain_id": domain_id,
|
||||
"url": page.url,
|
||||
- "page_size": len(page.content),
|
||||
+ "page_size": len(page.markup),
|
||||
"checksum": checksum,
|
||||
"title": page.title,
|
||||
"content": content,
|
||||
|
|
Loading…
Reference in a new issue