diff --git a/import/mediawiki/main.py b/import/mediawiki/main.py
new file mode 100644
index 0000000..e3d3a7b
--- /dev/null
+++ b/import/mediawiki/main.py
@@ -0,0 +1,113 @@
+import bz2
+import configparser
+import hashlib
+import mwparserfromhell as mw
+import psycopg2
+import sys
+import xml.etree.ElementTree as ET
+from urllib.parse import quote
+# TODO:
+# - Update domain last_index_date and crawl_duration
+# - Better handling of wikitext templates
+# - Non-English pages
+# - Non-Wikipedia mediawikis
+# - Apply weighting heuristics (primarily for Main_Page)
+
+conf = configparser.ConfigParser()
+conf.read("config.ini") # TODO: Search multiple paths
+cs = conf["searchhut"]["connection-string"]
+conn = psycopg2.connect(cs)
+
+with conn.cursor() as cur:
+    cur.execute("SELECT id FROM domain WHERE hostname = %s", ("en.wikipedia.org",))
+    domain_id = cur.fetchone()[0]
+
+unzipped = bz2.open(sys.stdin.buffer)
+parser = ET.iterparse(unzipped)
+
+class Page:
+    def __init__(self, elem):
+        title = elem.find("{http://www.mediawiki.org/xml/export-0.10/}title")
+        self.title = title.text
+        redirect = elem.find("{http://www.mediawiki.org/xml/export-0.10/}redirect")
+        self.redirect = redirect
+
+        rev = elem.find("{http://www.mediawiki.org/xml/export-0.10/}revision")
+        model = rev.find("{http://www.mediawiki.org/xml/export-0.10/}model")
+        self.model = model.text
+        text = rev.find("{http://www.mediawiki.org/xml/export-0.10/}text")
+        if text is not None:
+            self.markup = text.text
+            self.content = mw.parse(text.text).strip_code()
+        else:
+            self.markup = None
+            self.content = None
+
+        # TODO: Construct me from
+        norm = quote(self.title.replace(" ", "_"))
+        self.url = f"https://en.wikipedia.org/wiki/{norm}"
+
+for _, elem in parser:
+    if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}page":
+        page = Page(elem)
+        if (page.model != "wikitext"
+                or page.redirect is not None
+                or page.content is None):
+            continue
+        sha = hashlib.sha512()
+        sha.update(page.content.encode())
+        checksum = sha.digest()
+
+        excerpt = page.content.split("\n")[0]
+        # XXX: Could be better for non-English languages
+        if len(excerpt) > 512:
+            excerpt = excerpt[:512] + "…"
+
+        print(page.url)
+        try:
+            with conn.cursor() as cur:
+                cur.execute("""
+                    INSERT INTO page (
+                        domain_id,
+                        last_index_date,
+                        source,
+                        weight,
+                        crawl_priority,
+                        crawl_delay,
+                        url,
+                        checksum,
+                        title,
+                        excerpt,
+                        javascript,
+                        fts_vector
+                        -- text_content
+                        -- hostname
+                    ) VALUES (
+                        %(domain_id)s,
+                        now(), 'import/mediawiki', 0, 0, '0s'::interval,
+                        %(url)s, %(checksum)s, %(title)s, %(excerpt)s, false,
+                        setweight(to_tsvector(%(title)s), 'A') ||
+                        setweight(to_tsvector('en.wikipedia.org'), 'A') ||
+                        setweight(to_tsvector(%(content)s), 'D')
+                    )
+                    ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
+                        last_index_date = now(),
+                        checksum = %(checksum)s,
+                        title = %(title)s,
+                        excerpt = %(excerpt)s,
+                        -- TODO: Maybe move this to a sub-query
+                        fts_vector =
+                            setweight(to_tsvector(%(title)s), 'A') ||
+                            setweight(to_tsvector('en.wikipedia.org'), 'A') ||
+                            setweight(to_tsvector(%(content)s), 'D');
+                """, {
+                    "domain_id": domain_id,
+                    "url": page.url,
+                    "checksum": checksum,
+                    "title": page.title,
+                    "content": page.content,
+                    "excerpt": excerpt,
+                })
+            conn.commit()
+        except psycopg2.errors.UniqueViolation:
+            conn.rollback()
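
For reference, a minimal sketch of how the Page class added above behaves on a tiny export fragment. The sample XML, the page it describes, and the expected outputs in the comments are illustrative assumptions, not part of the patch:

    # Sketch only: exercises Page from main.py on a hypothetical export fragment.
    import xml.etree.ElementTree as ET

    NS = "{http://www.mediawiki.org/xml/export-0.10/}"
    sample = """
    <mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/">
      <page>
        <title>Example page</title>
        <revision>
          <model>wikitext</model>
          <text>'''Example''' page body.</text>
        </revision>
      </page>
    </mediawiki>
    """
    root = ET.fromstring(sample)
    elem = root.find(NS + "page")
    page = Page(elem)       # Page as defined in main.py above
    print(page.title)       # "Example page"
    print(page.url)         # "https://en.wikipedia.org/wiki/Example_page"
    print(page.content)     # "Example page body." (wiki markup stripped)

Since this sample has no <redirect> element and uses the wikitext model, it would pass the filter in the main loop and be inserted.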
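The INSERT weights the title and hostname at 'A' and the body text at 'D', so title matches should dominate ranking. A hedged sketch of how the resulting fts_vector might be queried follows; the ranking query is an assumption about how the page table could be searched, not something this patch adds:

    # Sketch only: rank pages against the weighted fts_vector written by this script.
    with conn.cursor() as cur:
        cur.execute("""
            SELECT url, title, ts_rank(fts_vector, query) AS rank
            FROM page, plainto_tsquery(%s) AS query
            WHERE fts_vector @@ query
            ORDER BY rank DESC
            LIMIT 10;
        """, ("full text search",))
        for url, title, rank in cur.fetchall():
            print(f"{rank:.3f}  {title}  {url}")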