import/*: fix page_size issues
This commit is contained in:
parent
69a9e20a0a
commit
9473f3b49b
2 changed files with 6 additions and 1 deletion
|
@ -43,6 +43,7 @@ for _, elem in parser:
|
||||||
sha.update(page.text.encode())
|
sha.update(page.text.encode())
|
||||||
checksum = sha.digest()
|
checksum = sha.digest()
|
||||||
|
|
||||||
|
page_size = len(page.text)
|
||||||
excerpt = page.text.split("\n")[0]
|
excerpt = page.text.split("\n")[0]
|
||||||
# XXX: Could be better for non-English languages
|
# XXX: Could be better for non-English languages
|
||||||
if len(excerpt) > 512:
|
if len(excerpt) > 512:
|
||||||
|
@ -61,6 +62,7 @@ for _, elem in parser:
|
||||||
title,
|
title,
|
||||||
excerpt,
|
excerpt,
|
||||||
javascript,
|
javascript,
|
||||||
|
page_size,
|
||||||
fts_vector
|
fts_vector
|
||||||
-- text_content
|
-- text_content
|
||||||
-- hostname
|
-- hostname
|
||||||
|
@ -68,6 +70,7 @@ for _, elem in parser:
|
||||||
%(domain_id)s,
|
%(domain_id)s,
|
||||||
now(), 'import/cve',
|
now(), 'import/cve',
|
||||||
%(url)s, %(checksum)s, %(title)s, %(excerpt)s, true,
|
%(url)s, %(checksum)s, %(title)s, %(excerpt)s, true,
|
||||||
|
%(page_size)s,
|
||||||
setweight(to_tsvector(%(title)s), 'A') ||
|
setweight(to_tsvector(%(title)s), 'A') ||
|
||||||
setweight(to_tsvector('cve.org'), 'A') ||
|
setweight(to_tsvector('cve.org'), 'A') ||
|
||||||
setweight(to_tsvector(%(content)s), 'D')
|
setweight(to_tsvector(%(content)s), 'D')
|
||||||
|
@ -77,6 +80,7 @@ for _, elem in parser:
|
||||||
checksum = %(checksum)s,
|
checksum = %(checksum)s,
|
||||||
title = %(title)s,
|
title = %(title)s,
|
||||||
excerpt = %(excerpt)s,
|
excerpt = %(excerpt)s,
|
||||||
|
page_size = %(page_size)s,
|
||||||
-- TODO: Maybe move this to a sub-query
|
-- TODO: Maybe move this to a sub-query
|
||||||
fts_vector =
|
fts_vector =
|
||||||
setweight(to_tsvector(%(title)s), 'A') ||
|
setweight(to_tsvector(%(title)s), 'A') ||
|
||||||
|
@ -89,6 +93,7 @@ for _, elem in parser:
|
||||||
"title": page.name,
|
"title": page.name,
|
||||||
"content": page.text,
|
"content": page.text,
|
||||||
"excerpt": excerpt,
|
"excerpt": excerpt,
|
||||||
|
"page_size": page_size,
|
||||||
})
|
})
|
||||||
conn.commit()
|
conn.commit()
|
||||||
except psycopg2.errors.UniqueViolation:
|
except psycopg2.errors.UniqueViolation:
|
||||||
|
|
|
@ -119,7 +119,7 @@ for _, elem in parser:
|
||||||
""", {
|
""", {
|
||||||
"domain_id": domain_id,
|
"domain_id": domain_id,
|
||||||
"url": page.url,
|
"url": page.url,
|
||||||
"page_size": len(page.content),
|
"page_size": len(page.markup),
|
||||||
"checksum": checksum,
|
"checksum": checksum,
|
||||||
"title": page.title,
|
"title": page.title,
|
||||||
"content": content,
|
"content": content,
|
||||||
|
|
Loading…
Reference in a new issue