import/mediawiki: use namespace IDs for filtering

Updates the mediawiki importer to filter pages by namespace ID instead
of matching on the prefix of the article title. Namespace names are
localized and can be customized per wiki, whereas the IDs are stable, so
this better supports other language versions and non-Wikipedia wikis.

Signed-off-by: Taavi Väänänen <hi@taavi.wtf>
This commit is contained in:
Taavi Väänänen 2022-07-12 16:15:14 +03:00 committed by Drew DeVault
parent 82d73c6e31
commit 00a37d0b48

View file

@ -31,6 +31,8 @@ class Page:
self.title = title.text
redirect = elem.find("{http://www.mediawiki.org/xml/export-0.10/}redirect")
self.redirect = redirect
namespace = elem.find("{http://www.mediawiki.org/xml/export-0.10/}ns")
self.namespace = int(namespace.text)
rev = elem.find("{http://www.mediawiki.org/xml/export-0.10/}revision")
model = rev.find("{http://www.mediawiki.org/xml/export-0.10/}model")
@ -54,12 +56,19 @@ for _, elem in parser:
elem.clear()
del elem
continue
if (page.title.startswith("Wikipedia:")
or page.title.startswith("Template:")
or page.title.startswith("File:")):
# see https://www.mediawiki.org/wiki/Manual:Namespace for MW core namespaces,
# although larger wikis usually have custom namespaces that can be listed via the API:
# https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=namespaces&formatversion=2
if page.namespace in (
4, # NS_PROJECT (aka Wikipedia:)
6, # NS_FILE
10, # NS_TEMPLATE
):
elem.clear()
del elem
continue
content = mw.parse(page.markup).strip_code()
sha = hashlib.sha512()
sha.update(content.encode())