import bz2
import configparser
import hashlib
import mwparserfromhell as mw
import psycopg2
import sys
import xml.etree.ElementTree as ET
from urllib.parse import quote

# TODO:
# - Update domain last_index_date and crawl_duration
# - Better handling of wikitext templates
# - Non-English pages
# - Non-Wikipedia mediawikis
# - Apply weighting heuristics (primarily for Main_Page)

conf = configparser.ConfigParser()
conf.read("config.ini")  # TODO: Search multiple paths
cs = conf["searchhut"]["connection-string"]
conn = psycopg2.connect(cs)

with conn.cursor() as cur:
    cur.execute("SELECT id FROM domain WHERE hostname = %s", ("en.wikipedia.org",))
    domain_id = cur.fetchone()[0]

unzipped = bz2.open(sys.stdin.buffer)
parser = ET.iterparse(unzipped)


class Page:
    def __init__(self, elem):
        title = elem.find("{http://www.mediawiki.org/xml/export-0.10/}title")
        self.title = title.text
        redirect = elem.find("{http://www.mediawiki.org/xml/export-0.10/}redirect")
        self.redirect = redirect
        rev = elem.find("{http://www.mediawiki.org/xml/export-0.10/}revision")
        model = rev.find("{http://www.mediawiki.org/xml/export-0.10/}model")
        self.model = model.text
        text = rev.find("{http://www.mediawiki.org/xml/export-0.10/}text")
        if text is not None:
            self.markup = text.text
            self.content = mw.parse(text.text).strip_code()
        else:
            self.markup = None
            self.content = None
        # TODO: Construct me from
        norm = quote(self.title.replace(" ", "_"))
        self.url = f"https://en.wikipedia.org/wiki/{norm}"


for _, elem in parser:
    if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}page":
        page = Page(elem)
        if (page.model != "wikitext"
                or page.redirect is not None
                or page.content is None):
            continue

        sha = hashlib.sha512()
        sha.update(page.content.encode())
        checksum = sha.digest()

        excerpt = page.content.split("\n")[0]
        # XXX: Could be better for non-English languages
        if len(excerpt) > 512:
            excerpt = excerpt[:512] + "…"

        print(page.url)
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO page (
                        domain_id,
                        last_index_date,
                        source,
                        weight,
                        crawl_priority,
                        crawl_delay,
                        url,
                        checksum,
                        title,
                        excerpt,
                        javascript,
                        fts_vector
                        -- text_content
                        -- hostname
                    ) VALUES (
                        %(domain_id)s,
                        now(),
                        'import/mediawiki',
                        0,
                        0,
                        '0s'::interval,
                        %(url)s,
                        %(checksum)s,
                        %(title)s,
                        %(excerpt)s,
                        false,
                        setweight(to_tsvector(%(title)s), 'A') ||
                            setweight(to_tsvector('en.wikipedia.org'), 'A') ||
                            setweight(to_tsvector(%(content)s), 'D')
                    )
                    ON CONFLICT ON CONSTRAINT page_url_key
                    DO UPDATE SET
                        last_index_date = now(),
                        checksum = %(checksum)s,
                        title = %(title)s,
                        excerpt = %(excerpt)s,
                        -- TODO: Maybe move this to a sub-query
                        fts_vector = setweight(to_tsvector(%(title)s), 'A') ||
                            setweight(to_tsvector('en.wikipedia.org'), 'A') ||
                            setweight(to_tsvector(%(content)s), 'D');
                """, {
                    "domain_id": domain_id,
                    "url": page.url,
                    "checksum": checksum,
                    "title": page.title,
                    "content": page.content,
                    "excerpt": excerpt,
                })
            conn.commit()
        except psycopg2.errors.UniqueViolation:
            conn.rollback()