searchhut/import/mediawiki/main.py

import bz2
import configparser
import hashlib
import mwparserfromhell as mw
import psycopg2
import sys
import xml.etree.ElementTree as ET
from urllib.parse import quote
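
# Streams a bzip2-compressed MediaWiki XML export from stdin, strips the
# wikitext from each article, and upserts the plain text into the searchhut
# "page" table. The database connection string is read from config.ini.
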
# TODO:
# - Update domain last_index_date and crawl_duration
# - Better handling of wikitext templates
# - Non-English pages
# - Non-Wikipedia mediawikis
# - Apply weighting heuristics (primarily for Main_Page)
conf = configparser.ConfigParser()
conf.read("config.ini") # TODO: Search multiple paths
cs = conf["searchhut"]["connection-string"]
conn = psycopg2.connect(cs)
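
# Each indexed page references a row in the domain table; look up the row id
# for en.wikipedia.org once, up front.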
with conn.cursor() as cur:
cur.execute("SELECT id FROM domain WHERE hostname = %s", ("en.wikipedia.org",))
domain_id = cur.fetchone()[0]
unzipped = bz2.open(sys.stdin.buffer)
parser = ET.iterparse(unzipped)
class Page:
    def __init__(self, elem):
        title = elem.find("{http://www.mediawiki.org/xml/export-0.10/}title")
        self.title = title.text
        # A <redirect> child element marks the page as a redirect; redirects
        # are skipped by the import loop below.
        redirect = elem.find("{http://www.mediawiki.org/xml/export-0.10/}redirect")
        self.redirect = redirect
        rev = elem.find("{http://www.mediawiki.org/xml/export-0.10/}revision")
        model = rev.find("{http://www.mediawiki.org/xml/export-0.10/}model")
        self.model = model.text
        text = rev.find("{http://www.mediawiki.org/xml/export-0.10/}text")
        if text is not None:
            self.markup = text.text
            # strip_code() reduces the wikitext markup to plain text for indexing
            self.content = mw.parse(text.text).strip_code()
        else:
            self.markup = None
            self.content = None
        # TODO: Construct me from <siteinfo>
        norm = quote(self.title.replace(" ", "_"))
        self.url = f"https://en.wikipedia.org/wiki/{norm}"

for _, elem in parser:
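    # iterparse yields (event, element) pairs as each element's closing tag is
    # read; only completed <page> elements are of interest here.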
    if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}page":
        page = Page(elem)
        # Only plain wikitext articles are indexed; redirects, empty pages,
        # and pages with a non-wikitext content model are skipped.
        if (page.model != "wikitext"
                or page.redirect is not None
                or page.content is None):
            continue
        sha = hashlib.sha512()
        sha.update(page.content.encode())
        checksum = sha.digest()
        # The first line of the stripped text becomes the search result excerpt.
        excerpt = page.content.split("\n")[0]
        # XXX: Could be better for non-English languages
        if len(excerpt) > 512:
            excerpt = excerpt[:512] + "…"
        print(page.url)
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO page (
                        domain_id,
                        last_index_date,
                        source,
                        weight,
                        crawl_priority,
                        crawl_delay,
                        url,
                        checksum,
                        title,
                        excerpt,
                        javascript,
                        fts_vector
                        -- text_content
                        -- hostname
                    ) VALUES (
                        %(domain_id)s,
                        now(), 'import/mediawiki', 0, 0, '0s'::interval,
                        %(url)s, %(checksum)s, %(title)s, %(excerpt)s, false,
                        setweight(to_tsvector(%(title)s), 'A') ||
                        setweight(to_tsvector('en.wikipedia.org'), 'A') ||
                        setweight(to_tsvector(%(content)s), 'D')
                    )
                    ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
                        last_index_date = now(),
                        checksum = %(checksum)s,
                        title = %(title)s,
                        excerpt = %(excerpt)s,
                        -- TODO: Maybe move this to a sub-query
                        fts_vector =
                            setweight(to_tsvector(%(title)s), 'A') ||
                            setweight(to_tsvector('en.wikipedia.org'), 'A') ||
                            setweight(to_tsvector(%(content)s), 'D');
                """, {
                    "domain_id": domain_id,
                    "url": page.url,
                    "checksum": checksum,
                    "title": page.title,
                    "content": page.content,
                    "excerpt": excerpt,
                })
            conn.commit()
        except psycopg2.errors.UniqueViolation:
            conn.rollback()