#!/usr/bin/env python
"""Import CVE records from a cve.mitre.org XML dump into the searchhut index.

Provenance: this script was introduced by the patch
"import/cve.org: New importer" (commit
062e63437a3b46ac3d252325e88e466fd1ccd50c, Haelwenn (lanodan) Monnier,
Mon, 11 Jul 2022 17:00:06 +0200) as import/cve.org/main.py.
It was copied from the mediawiki parser.

Usage: Download https://cve.mitre.org/data/downloads/allitems.xml and pass
it to stdin.
"""
import configparser
import hashlib
import sys
import xml.etree.ElementTree as ET

# TODO:
# - Update domain last_index_date and crawl_duration
# ? Apply weighting heuristics (primarily for Main_Page)

# Fully-qualified tag names used by the CVE download format.
CVE_NS = "{http://cve.mitre.org/cve/downloads/1.0}"
ITEM_TAG = CVE_NS + "item"
DESC_TAG = CVE_NS + "desc"

# Insert a page, or refresh the indexed fields if the URL is already known.
UPSERT_PAGE_SQL = """
    INSERT INTO page (
        domain_id,
        last_index_date,
        source,
        url,
        checksum,
        title,
        excerpt,
        javascript,
        fts_vector
        -- text_content
        -- hostname
    ) VALUES (
        %(domain_id)s,
        now(), 'import/cve',
        %(url)s, %(checksum)s, %(title)s, %(content)s, true,
        setweight(to_tsvector(%(title)s), 'A') ||
        setweight(to_tsvector('cve.org'), 'A') ||
        setweight(to_tsvector(%(content)s), 'D')
    )
    ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
        last_index_date = now(),
        checksum = %(checksum)s,
        title = %(title)s,
        excerpt = %(content)s,
        -- TODO: Maybe move this to a sub-query
        fts_vector =
            setweight(to_tsvector(%(title)s), 'A') ||
            setweight(to_tsvector('cve.org'), 'A') ||
            setweight(to_tsvector(%(content)s), 'D');
"""


class Page:
    """A single CVE record built from an ``<item>`` element of the dump.

    Attributes:
        name: value of the item's ``name`` attribute (used as page title).
        text: text of the item's ``<desc>`` child, or ``None`` when the
            child is missing or has no text.
        url: cve.org record URL derived from ``name``.
    """

    def __init__(self, elem):
        # A missing "name" attribute raises KeyError, as in the original.
        self.name = elem.attrib["name"]

        desc = elem.find(DESC_TAG)
        self.text = desc.text if desc is not None else None

        # Maybe it should use an alternative frontend similar to
        # gemini://hacktivis.me/cgi-bin/cve?{name} instead?
        self.url = f"https://www.cve.org/CVERecord?id={self.name}"


def page_checksum(text):
    """Return the raw SHA-512 digest of *text* encoded as UTF-8.

    ``None`` is hashed as the empty string so that records without a
    description no longer abort the import with ``AttributeError``.
    """
    return hashlib.sha512((text or "").encode()).digest()


def iter_pages(source):
    """Yield a :class:`Page` for every ``<item>`` element parsed from *source*.

    *source* is a filename or binary file object accepted by
    ``xml.etree.ElementTree.iterparse``. Parsing is incremental, and each
    item element is cleared once the consumer asks for the next page,
    which releases its attributes and children.
    """
    for _, elem in ET.iterparse(source):
        if elem.tag != ITEM_TAG:
            continue
        # Page copies what it needs, so clearing afterwards is safe.
        yield Page(elem)
        elem.clear()


def main():
    """Read the CVE dump from stdin and upsert every record into ``page``.

    The connection string is read from ``config.ini`` (section
    ``searchhut``, key ``connection-string``). Each record is committed
    individually; a unique-constraint violation rolls back that record
    only and the import continues.
    """
    # Imported here so the parsing helpers above stay importable on systems
    # without the database driver. The explicit submodule import makes sure
    # psycopg2.errors is loaded before it is referenced below.
    import psycopg2
    import psycopg2.errors

    conf = configparser.ConfigParser()
    conf.read("config.ini")  # TODO: Search multiple paths
    cs = conf["searchhut"]["connection-string"]
    conn = psycopg2.connect(cs)

    try:
        with conn.cursor() as cur:
            cur.execute(
                "SELECT id FROM domain WHERE hostname = %s", ("www.cve.org",)
            )
            row = cur.fetchone()
        if row is None:
            # Previously an opaque TypeError from indexing None.
            sys.exit("domain 'www.cve.org' not found in the domain table")
        domain_id = row[0]

        for page in iter_pages(sys.stdin.buffer):
            # Normalise a missing description to "" for the bound parameter.
            # In PostgreSQL a NULL here would propagate through to_tsvector
            # and || and leave fts_vector NULL.
            content = page.text or ""

            print(page.url)
            try:
                with conn.cursor() as cur:
                    cur.execute(UPSERT_PAGE_SQL, {
                        "domain_id": domain_id,
                        "url": page.url,
                        "checksum": page_checksum(content),
                        "title": page.name,
                        "content": content,
                    })
                conn.commit()
            except psycopg2.errors.UniqueViolation:
                conn.rollback()
    finally:
        conn.close()


if __name__ == "__main__":
    main()