import/mediawiki: initial commit
parent e44770b9b7
commit c8762965ac
1 changed file with 113 additions and 0 deletions
import/mediawiki/main.py · 113 additions · new file

@@ -0,0 +1,113 @@
import bz2
import configparser
import hashlib
import mwparserfromhell as mw
import psycopg2
import sys
import xml.etree.ElementTree as ET
from urllib.parse import quote

# TODO:
# - Update domain last_index_date and crawl_duration
# - Better handling of wikitext templates
# - Non-English pages
# - Non-Wikipedia mediawikis
# - Apply weighting heuristics (primarily for Main_Page)

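# The database connection string comes from config.ini. Judging from the keys
# read below, the file is assumed to look something like:
#   [searchhut]
#   connection-string = postgresql://user@localhost/searchhut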
conf = configparser.ConfigParser()
conf.read("config.ini")  # TODO: Search multiple paths
cs = conf["searchhut"]["connection-string"]
conn = psycopg2.connect(cs)

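# Look up the existing domain row for en.wikipedia.org; every imported page
# is attached to this domain_id.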
with conn.cursor() as cur:
    cur.execute("SELECT id FROM domain WHERE hostname = %s", ("en.wikipedia.org",))
    domain_id = cur.fetchone()[0]

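# The dump is read as a bzip2-compressed MediaWiki XML export on stdin and
# parsed incrementally, so the whole dump never has to be held in memory.
# Assumed invocation: python3 main.py < enwiki-latest-pages-articles.xml.bz2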
unzipped = bz2.open(sys.stdin.buffer)
parser = ET.iterparse(unzipped)

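# Wrapper around a <page> element from the export, extracting the fields the
# importer needs: title, redirect marker, content model, raw wikitext, and
# the text with markup stripped.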
class Page:
    def __init__(self, elem):
        title = elem.find("{http://www.mediawiki.org/xml/export-0.10/}title")
        self.title = title.text
        redirect = elem.find("{http://www.mediawiki.org/xml/export-0.10/}redirect")
        self.redirect = redirect

        rev = elem.find("{http://www.mediawiki.org/xml/export-0.10/}revision")
        model = rev.find("{http://www.mediawiki.org/xml/export-0.10/}model")
        self.model = model.text
        text = rev.find("{http://www.mediawiki.org/xml/export-0.10/}text")
        if text is not None:
            self.markup = text.text
            self.content = mw.parse(text.text).strip_code()
        else:
            self.markup = None
            self.content = None

        # TODO: Construct me from <siteinfo>
        norm = quote(self.title.replace(" ", "_"))
        self.url = f"https://en.wikipedia.org/wiki/{norm}"

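# Stream over completed elements and index each <page> that is a plain
# wikitext article, skipping redirects, non-wikitext models, and pages
# without text.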
for _, elem in parser:
    if elem.tag == "{http://www.mediawiki.org/xml/export-0.10/}page":
        page = Page(elem)
        if (page.model != "wikitext"
                or page.redirect is not None
                or page.content is None):
            continue
        sha = hashlib.sha512()
        sha.update(page.content.encode())
        checksum = sha.digest()

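        # The excerpt shown for the page is the first line of the stripped
        # text, truncated to 512 characters.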
        excerpt = page.content.split("\n")[0]
        # XXX: Could be better for non-English languages
        if len(excerpt) > 512:
            excerpt = excerpt[:512] + "…"

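        # Upsert the page row: insert it, or on a URL conflict refresh the
        # index date, checksum, title, excerpt, and full-text search vector.
        # The tsvector weights the title and hostname ('A') above the body
        # text ('D').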
        print(page.url)
        try:
            with conn.cursor() as cur:
                cur.execute("""
                    INSERT INTO page (
                        domain_id,
                        last_index_date,
                        source,
                        weight,
                        crawl_priority,
                        crawl_delay,
                        url,
                        checksum,
                        title,
                        excerpt,
                        javascript,
                        fts_vector
                        -- text_content
                        -- hostname
                    ) VALUES (
                        %(domain_id)s,
                        now(), 'import/mediawiki', 0, 0, '0s'::interval,
                        %(url)s, %(checksum)s, %(title)s, %(excerpt)s, false,
                        setweight(to_tsvector(%(title)s), 'A') ||
                        setweight(to_tsvector('en.wikipedia.org'), 'A') ||
                        setweight(to_tsvector(%(content)s), 'D')
                    )
                    ON CONFLICT ON CONSTRAINT page_url_key DO UPDATE SET
                        last_index_date = now(),
                        checksum = %(checksum)s,
                        title = %(title)s,
                        excerpt = %(excerpt)s,
                        -- TODO: Maybe move this to a sub-query
                        fts_vector =
                            setweight(to_tsvector(%(title)s), 'A') ||
                            setweight(to_tsvector('en.wikipedia.org'), 'A') ||
                            setweight(to_tsvector(%(content)s), 'D');
                """, {
                    "domain_id": domain_id,
                    "url": page.url,
                    "checksum": checksum,
                    "title": page.title,
                    "content": page.content,
                    "excerpt": excerpt,
                })
            conn.commit()
        except psycopg2.errors.UniqueViolation:
            conn.rollback()