2022-07-11 17:00:06 +02:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# copied from mediawiki parser
|
|
|
|
# Usage: Download https://cve.mitre.org/data/downloads/allitems.xml and pass it to stdin
|
|
|
|
import configparser
|
|
|
|
import hashlib
|
|
|
|
import psycopg2
|
|
|
|
import sys
|
|
|
|
import xml.etree.ElementTree as ET
|
|
|
|
|
|
|
|
# TODO:
|
|
|
|
# - Update domain last_index_date and crawl_duration
|
|
|
|
# ? Apply weighting heuristics (primarily for Main_Page)
|
|
|
|
|
|
|
|
# Load the searchhut configuration and open the database connection.
conf = configparser.ConfigParser()
# ConfigParser.read silently ignores files it cannot open and returns the
# list of files actually read — without this check a missing config.ini
# surfaces later as an opaque KeyError on the section lookup.
read_ok = conf.read("config.ini")  # TODO: Search multiple paths
if not read_ok:
    sys.exit("error: config.ini not found")
cs = conf["searchhut"]["connection-string"]
conn = psycopg2.connect(cs)
|
|
|
|
|
|
|
|
# Look up the database id of the www.cve.org domain; every imported page
# is attached to it.
with conn.cursor() as cur:
    cur.execute("SELECT id FROM domain WHERE hostname = %s", ("www.cve.org",))
    row = cur.fetchone()
    if row is None:
        # Without this check a missing domain row crashes with
        # "TypeError: 'NoneType' object is not subscriptable".
        sys.exit("error: no 'www.cve.org' row in the domain table")
    domain_id = row[0]
|
|
|
|
|
|
|
|
# Incrementally parse the allitems.xml dump from stdin; iterparse yields
# (event, element) pairs as each element is completed, so the whole dump
# never has to be held in memory at once.
parser = ET.iterparse(sys.stdin.buffer)
|
|
|
|
|
|
|
|
class Page:
    """One CVE record pulled out of a parsed <item> element."""

    def __init__(self, elem):
        # Record id, e.g. "CVE-2020-1234", from the item's name attribute.
        self.name = elem.attrib["name"]

        # Description text, or None when the record has no <desc> child.
        desc = elem.find("{http://cve.mitre.org/cve/downloads/1.0}desc")
        self.text = desc.text if desc is not None else None

        # Maybe it should use an alternative frontend similar to gemini://hacktivis.me/cgi-bin/cve?{name} instead?
        self.url = f"https://www.cve.org/CVERecord?id={self.name}"
|
|
|
|
|
|
|
|
# Stream over the parsed XML, upserting each <item> element as a page row.
for _, elem in parser:
    if elem.tag == "{http://cve.mitre.org/cve/downloads/1.0}item":
        page = Page(elem)

        # Page.text is None for records without a <desc> child; there is
        # nothing to index for those, and the hashing/excerpt code below
        # would otherwise crash with an AttributeError on None.
        if page.text is not None:
            sha = hashlib.sha512()
            sha.update(page.text.encode())
            checksum = sha.digest()

            # First line of the description serves as the excerpt.
            excerpt = page.text.split("\n")[0]
            # XXX: Could be better for non-English languages
            if len(excerpt) > 512:
                excerpt = excerpt[:512] + "…"

            print(page.url)
            try:
                with conn.cursor() as cur:
                    cur.execute("""
                    INSERT INTO page (
                        domain_id,
                        last_index_date,
                        source,
                        url,
                        checksum,
                        title,
                        excerpt,
                        javascript,
                        fts_vector
                        -- text_content
                        -- hostname
                    ) VALUES (
                        %(domain_id)s,
                        now(), 'import/cve',
                        %(url)s, %(checksum)s, %(title)s, %(excerpt)s, true,
                        setweight(to_tsvector(%(title)s), 'A') ||
                        setweight(to_tsvector('cve.org'), 'A') ||
                        setweight(to_tsvector(%(content)s), 'D')
                    )
                    ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
                        last_index_date = now(),
                        checksum = %(checksum)s,
                        title = %(title)s,
                        excerpt = %(excerpt)s,
                        -- TODO: Maybe move this to a sub-query
                        fts_vector =
                            setweight(to_tsvector(%(title)s), 'A') ||
                            setweight(to_tsvector('cve.org'), 'A') ||
                            setweight(to_tsvector(%(content)s), 'D');
                    """, {
                        "domain_id": domain_id,
                        "url": page.url,
                        "checksum": checksum,
                        "title": page.name,
                        "content": page.text,
                        "excerpt": excerpt,
                    })
                conn.commit()
            except psycopg2.errors.UniqueViolation:
                # Should not happen given the ON CONFLICT clause, but keep
                # the import running if it ever fires.
                conn.rollback()

    # Drop the element's children/text so the streaming parse stays in
    # roughly constant memory regardless of dump size.
    elem.clear()
    del elem
|