searchhut/import/cve.org/main.py
2022-07-11 19:11:37 +02:00

98 lines
3.4 KiB
Python
Executable file

#!/usr/bin/env python
# copied from mediawiki parser
# Usage: Download https://cve.mitre.org/data/downloads/allitems.xml and pass it to stdin
import configparser
import hashlib
import psycopg2
import sys
import xml.etree.ElementTree as ET
# TODO:
# - Update domain last_index_date and crawl_duration
# ? Apply weighting heuristics (primarily for Main_Page)
# Read the database connection string from the [searchhut] section of
# config.ini, resolved relative to the current working directory.
conf = configparser.ConfigParser()
conf.read("config.ini") # TODO: Search multiple paths
cs = conf["searchhut"]["connection-string"]
conn = psycopg2.connect(cs)

# Every imported page is attached to the existing www.cve.org domain row.
with conn.cursor() as cur:
    cur.execute("SELECT id FROM domain WHERE hostname = %s", ("www.cve.org",))
    row = cur.fetchone()
# fetchone() returns None when the query matched nothing; stop with a clear
# message instead of an opaque TypeError from subscripting None.
if row is None:
    sys.exit("No row for hostname www.cve.org in the domain table; "
             "add the domain before running this import")
domain_id = row[0]

# Parse the XML dump from stdin incrementally. With the default arguments
# iterparse only reports "end" events, i.e. fully parsed elements.
parser = ET.iterparse(sys.stdin.buffer)
class Page:
    """One CVE entry built from an ``item`` element of the CVE XML dump.

    Attributes:
        name: Value of the element's ``name`` attribute.
        text: Text of the element's ``desc`` child, or ``None`` when there
            is no such child or the child carries no text.
        url: Address of the record on www.cve.org, derived from ``name``.
    """

    # Fully qualified tag of the description child element.
    _DESC_TAG = "{http://cve.mitre.org/cve/downloads/1.0}desc"

    def __init__(self, elem):
        """Extract name, description text and URL from ``elem``.

        Raises:
            KeyError: If ``elem`` has no ``name`` attribute.
        """
        self.name = elem.attrib["name"]
        desc_node = elem.find(self._DESC_TAG)
        self.text = None if desc_node is None else desc_node.text
        # Maybe it should use an alternative frontend similar to
        # gemini://hacktivis.me/cgi-bin/cve?{name} instead?
        self.url = f"https://www.cve.org/CVERecord?id={self.name}"
# Fully qualified tag of the elements that describe one CVE entry.
_ITEM_TAG = "{http://cve.mitre.org/cve/downloads/1.0}item"

# Maximum number of characters kept from the first line of a description.
_EXCERPT_LIMIT = 512

# Insert a page, or refresh the stored fields when a row with the same URL
# already exists (page_url_key). The statement is identical for every
# record, so it is defined once outside the loop.
_UPSERT_PAGE_SQL = """
    INSERT INTO page (
        domain_id,
        last_index_date,
        source,
        url,
        checksum,
        title,
        excerpt,
        javascript,
        fts_vector
        -- text_content
        -- hostname
    ) VALUES (
        %(domain_id)s,
        now(), 'import/cve',
        %(url)s, %(checksum)s, %(title)s, %(excerpt)s, true,
        setweight(to_tsvector(%(title)s), 'A') ||
        setweight(to_tsvector('cve.org'), 'A') ||
        setweight(to_tsvector(%(content)s), 'D')
    )
    ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
        last_index_date = now(),
        checksum = %(checksum)s,
        title = %(title)s,
        excerpt = %(excerpt)s,
        -- TODO: Maybe move this to a sub-query
        fts_vector =
            setweight(to_tsvector(%(title)s), 'A') ||
            setweight(to_tsvector('cve.org'), 'A') ||
            setweight(to_tsvector(%(content)s), 'D');
"""

# Walk the parsed elements and upsert one page row per CVE item.
for _, elem in parser:
    if elem.tag != _ITEM_TAG:
        # Child elements must stay intact until their parent item has been
        # read, so anything that is not an item is left untouched.
        continue

    page = Page(elem)
    # Page.text is None when the item has no description text; fall back to
    # an empty string so the entry is still indexed under its CVE id instead
    # of aborting the import with an AttributeError.
    text = page.text or ""
    checksum = hashlib.sha512(text.encode()).digest()

    excerpt = text.split("\n")[0]
    # XXX: Could be better for non-English languages
    if len(excerpt) > _EXCERPT_LIMIT:
        # Mark the cut with an ellipsis; previously an empty string was
        # appended here, which left truncated excerpts unmarked.
        excerpt = excerpt[:_EXCERPT_LIMIT] + "…"

    print(page.url)
    try:
        with conn.cursor() as cur:
            cur.execute(_UPSERT_PAGE_SQL, {
                "domain_id": domain_id,
                "url": page.url,
                "checksum": checksum,
                "title": page.name,
                "content": text,
                "excerpt": excerpt,
            })
        conn.commit()
    except psycopg2.errors.UniqueViolation:
        # Best effort: give up on this record, reset the transaction and
        # carry on with the next one.
        conn.rollback()

    # Drop the item's children, attributes and text now that it is stored,
    # so processed entries do not keep their content in memory.
    elem.clear()
    del elem