import/cve.org: New importer
This commit is contained in:
parent
fde8b75efd
commit
062e63437a
1 changed file with 92 additions and 0 deletions
92
import/cve.org/main.py
Executable file
92
import/cve.org/main.py
Executable file
|
@ -0,0 +1,92 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# copied from mediawiki parser
|
||||||
|
# Usage: Download https://cve.mitre.org/data/downloads/allitems.xml and pass it to stdin
|
||||||
|
import configparser
|
||||||
|
import hashlib
|
||||||
|
import psycopg2
|
||||||
|
import sys
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
# TODO:
|
||||||
|
# - Update domain last_index_date and crawl_duration
|
||||||
|
# ? Apply weighting heuristics (primarily for Main_Page)
|
||||||
|
|
||||||
|
# Read the searchhut configuration and connect to its database, then look
# up the row id for the www.cve.org domain that every imported page row
# will reference.
conf = configparser.ConfigParser()
conf.read("config.ini")  # TODO: Search multiple paths

conn = psycopg2.connect(conf["searchhut"]["connection-string"])

with conn.cursor() as cur:
    cur.execute("SELECT id FROM domain WHERE hostname = %s", ("www.cve.org",))
    (domain_id,) = cur.fetchone()

# Stream-parse allitems.xml from stdin; iterparse yields elements as their
# closing tags are seen, so the whole tree never has to fit in memory.
parser = ET.iterparse(sys.stdin.buffer)
|
||||||
|
|
||||||
|
class Page:
    """One CVE record, built from an <item> element of allitems.xml.

    Attributes:
        name: the CVE identifier taken from the item's ``name`` attribute.
        text: the record's description text, or None when the item has no
            ``<desc>`` child.
        url: the canonical www.cve.org page for this record.
    """

    def __init__(self, elem):
        self.name = elem.attrib["name"]

        desc = elem.find("{http://cve.mitre.org/cve/downloads/1.0}desc")
        self.text = desc.text if desc is not None else None

        # Maybe it should use an alternative frontend similar to gemini://hacktivis.me/cgi-bin/cve?{name} instead?
        self.url = f"https://www.cve.org/CVERecord?id={self.name}"
|
||||||
|
|
||||||
|
# Index every fully-parsed <item> element (one per CVE record).
for _, elem in parser:
    if elem.tag != "{http://cve.mitre.org/cve/downloads/1.0}item":
        continue

    page = Page(elem)

    # Items without a <desc> child give page.text == None; index them with
    # an empty body instead of crashing on None.encode(), and so the
    # to_tsvector(%(content)s) terms below get '' rather than NULL.
    content = page.text or ""

    # Content checksum lets the indexer detect unchanged pages later.
    sha = hashlib.sha512()
    sha.update(content.encode())
    checksum = sha.digest()

    print(page.url)
    try:
        with conn.cursor() as cur:
            cur.execute("""
                INSERT INTO page (
                    domain_id,
                    last_index_date,
                    source,
                    url,
                    checksum,
                    title,
                    excerpt,
                    javascript,
                    fts_vector
                    -- text_content
                    -- hostname
                ) VALUES (
                    %(domain_id)s,
                    now(), 'import/cve',
                    %(url)s, %(checksum)s, %(title)s, %(content)s, true,
                    setweight(to_tsvector(%(title)s), 'A') ||
                    setweight(to_tsvector('cve.org'), 'A') ||
                    setweight(to_tsvector(%(content)s), 'D')
                )
                ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
                    last_index_date = now(),
                    checksum = %(checksum)s,
                    title = %(title)s,
                    excerpt = %(content)s,
                    -- TODO: Maybe move this to a sub-query
                    fts_vector =
                        setweight(to_tsvector(%(title)s), 'A') ||
                        setweight(to_tsvector('cve.org'), 'A') ||
                        setweight(to_tsvector(%(content)s), 'D');
            """, {
                "domain_id": domain_id,
                "url": page.url,
                "checksum": checksum,
                "title": page.name,
                "content": content,
            })
        conn.commit()
    except psycopg2.errors.UniqueViolation:
        # ON CONFLICT covers page_url_key, but a racing insert can still
        # surface a unique violation; skip the record and keep going.
        conn.rollback()

    # Drop the processed subtree so iterparse doesn't accumulate the
    # whole document in memory.
    elem.clear()
    del elem
|
Loading…
Reference in a new issue