import/cve.org: New importer
This commit is contained in:
parent
fde8b75efd
commit
062e63437a
1 changed file with 92 additions and 0 deletions
92
import/cve.org/main.py
Executable file
92
import/cve.org/main.py
Executable file
|
@ -0,0 +1,92 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# copied from mediawiki parser
|
||||||
|
# Usage: Download https://cve.mitre.org/data/downloads/allitems.xml and pass it to stdin
|
||||||
|
import configparser
|
||||||
|
import hashlib
|
||||||
|
import psycopg2
|
||||||
|
import sys
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
# TODO:
|
||||||
|
# - Update domain last_index_date and crawl_duration
|
||||||
|
# ? Apply weighting heuristics (primarily for Main_Page)
|
||||||
|
|
||||||
|
# Read the searchhut configuration and connect to its database, then look
# up the row id for the www.cve.org domain that every imported page row
# will reference.
conf = configparser.ConfigParser()
conf.read("config.ini")  # TODO: Search multiple paths

conn = psycopg2.connect(conf["searchhut"]["connection-string"])

with conn.cursor() as cur:
    cur.execute("SELECT id FROM domain WHERE hostname = %s", ("www.cve.org",))
    (domain_id,) = cur.fetchone()

# Stream-parse allitems.xml from stdin; iterparse yields elements as their
# closing tags are seen, so the whole tree never has to fit in memory.
parser = ET.iterparse(sys.stdin.buffer)
|
||||||
|
|
||||||
|
class Page:
    """One CVE record, built from an <item> element of allitems.xml.

    Attributes:
        name: the CVE identifier taken from the item's ``name`` attribute.
        text: the record's description text, or None when the item has no
            ``<desc>`` child.
        url: the canonical www.cve.org page for this record.
    """

    def __init__(self, elem):
        self.name = elem.attrib["name"]

        desc = elem.find("{http://cve.mitre.org/cve/downloads/1.0}desc")
        self.text = desc.text if desc is not None else None

        # Maybe it should use an alternative frontend similar to gemini://hacktivis.me/cgi-bin/cve?{name} instead?
        self.url = f"https://www.cve.org/CVERecord?id={self.name}"
|
||||||
|
|
||||||
|
# Index every fully-parsed <item> element (one per CVE record).
for _, elem in parser:
    if elem.tag != "{http://cve.mitre.org/cve/downloads/1.0}item":
        continue

    page = Page(elem)

    # Items without a <desc> child give page.text == None; index them with
    # an empty body instead of crashing on None.encode(), and so the
    # to_tsvector(%(content)s) terms below get '' rather than NULL.
    content = page.text or ""

    # Content checksum lets the indexer detect unchanged pages later.
    sha = hashlib.sha512()
    sha.update(content.encode())
    checksum = sha.digest()

    print(page.url)
    try:
        with conn.cursor() as cur:
            cur.execute("""
                INSERT INTO page (
                    domain_id,
                    last_index_date,
                    source,
                    url,
                    checksum,
                    title,
                    excerpt,
                    javascript,
                    fts_vector
                    -- text_content
                    -- hostname
                ) VALUES (
                    %(domain_id)s,
                    now(), 'import/cve',
                    %(url)s, %(checksum)s, %(title)s, %(content)s, true,
                    setweight(to_tsvector(%(title)s), 'A') ||
                    setweight(to_tsvector('cve.org'), 'A') ||
                    setweight(to_tsvector(%(content)s), 'D')
                )
                ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
                    last_index_date = now(),
                    checksum = %(checksum)s,
                    title = %(title)s,
                    excerpt = %(content)s,
                    -- TODO: Maybe move this to a sub-query
                    fts_vector =
                        setweight(to_tsvector(%(title)s), 'A') ||
                        setweight(to_tsvector('cve.org'), 'A') ||
                        setweight(to_tsvector(%(content)s), 'D');
            """, {
                "domain_id": domain_id,
                "url": page.url,
                "checksum": checksum,
                "title": page.name,
                "content": content,
            })
        conn.commit()
    except psycopg2.errors.UniqueViolation:
        # ON CONFLICT covers page_url_key, but a racing insert can still
        # surface a unique violation; skip the record and keep going.
        conn.rollback()

    # Drop the processed subtree so iterparse doesn't accumulate the
    # whole document in memory.
    elem.clear()
    del elem
|
Loading…
Reference in a new issue