import bz2
import configparser
import hashlib
import mwparserfromhell as mw
import psycopg2
import sys
import xml.etree.ElementTree as ET
from urllib.parse import quote

# TODO:
# - Update domain last_index_date and crawl_duration
# - Better handling of wikitext templates
# - Non-English pages
# - Non-Wikipedia mediawikis
# - Apply weighting heuristics (primarily for Main_Page)

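# Load the database connection string from config.ini and connect to the
# search index database (the [searchhut] config section).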
conf = configparser.ConfigParser()
conf.read("config.ini") # TODO: Search multiple paths
cs = conf["searchhut"]["connection-string"]
conn = psycopg2.connect(cs)
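
# Look up the domain row for en.wikipedia.org; it is expected to exist
# already, and its id links every imported page to that domain.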
with conn.cursor() as cur:
    cur.execute("SELECT id FROM domain WHERE hostname = %s", ("en.wikipedia.org",))
    domain_id = cur.fetchone()[0]
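
# Stream the bz2-compressed XML dump from stdin and parse it incrementally.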
unzipped = bz2.open(sys.stdin.buffer)
parser = ET.iterparse(unzipped)
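
# One <page> element from the dump: title, redirect marker, content model,
# and the wikitext, both raw and stripped down to plain text.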
class Page:
    def __init__(self, elem):
        title = elem.find("{http://www.mediawiki.org/xml/export-0.10/}title")
        self.title = title.text
        redirect = elem.find("{http://www.mediawiki.org/xml/export-0.10/}redirect")
        self.redirect = redirect

        rev = elem.find("{http://www.mediawiki.org/xml/export-0.10/}revision")
        model = rev.find("{http://www.mediawiki.org/xml/export-0.10/}model")
        self.model = model.text
        text = rev.find("{http://www.mediawiki.org/xml/export-0.10/}text")
        if text is not None:
            self.markup = text.text
            self.content = mw.parse(text.text).strip_code()
        else:
            self.markup = None
            self.content = None

        # TODO: Construct me from <siteinfo>
        norm = quote(self.title.replace(" ", "_"))
        self.url = f"https://en.wikipedia.org/wiki/{norm}"
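
# Walk the dump page by page and index each regular wikitext article,
# skipping redirects, non-wikitext pages, and pages with no text.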
for _, elem in parser:
    if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}page":
        page = Page(elem)
        if (page.model != "wikitext"
                or page.redirect is not None
                or page.content is None):
            continue
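        # SHA-512 checksum of the plain-text content, stored alongside the page.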
        sha = hashlib.sha512()
        sha.update(page.content.encode())
        checksum = sha.digest()

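        # The excerpt is the first line of the article, capped at 512 characters.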
        excerpt = page.content.split("\n")[0]
        # XXX: Could be better for non-English languages
        if len(excerpt) > 512:
            excerpt = excerpt[:512] + "…"

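        # Upsert the page row; the full-text search vector gives the title and
        # hostname weight 'A' and the article body weight 'D'.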
        print(page.url)
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO page (
                        domain_id,
                        last_index_date,
                        source,
                        weight,
                        crawl_priority,
                        crawl_delay,
                        url,
                        checksum,
                        title,
                        excerpt,
                        javascript,
                        fts_vector
                        -- text_content
                        -- hostname
                    ) VALUES (
                        %(domain_id)s,
                        now(), 'import/mediawiki', 0, 0, '0s'::interval,
                        %(url)s, %(checksum)s, %(title)s, %(excerpt)s, false,
                        setweight(to_tsvector(%(title)s), 'A') ||
                        setweight(to_tsvector('en.wikipedia.org'), 'A') ||
                        setweight(to_tsvector(%(content)s), 'D')
                    )
                    ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
                        last_index_date = now(),
                        checksum = %(checksum)s,
                        title = %(title)s,
                        excerpt = %(excerpt)s,
                        -- TODO: Maybe move this to a sub-query
                        fts_vector =
                            setweight(to_tsvector(%(title)s), 'A') ||
                            setweight(to_tsvector('en.wikipedia.org'), 'A') ||
                            setweight(to_tsvector(%(content)s), 'D');
                """, {
                    "domain_id": domain_id,
                    "url": page.url,
                    "checksum": checksum,
                    "title": page.title,
                    "content": page.content,
                    "excerpt": excerpt,
                })
            conn.commit()
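        # ON CONFLICT handles re-imports of known URLs; if a unique violation
        # slips through anyway, roll back so the connection stays usable.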
        except psycopg2.errors.UniqueViolation:
            conn.rollback()