From 00a37d0b48cec3e590db90de9099caeca822b691 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Taavi=20V=C3=A4=C3=A4n=C3=A4nen?= Date: Tue, 12 Jul 2022 16:15:14 +0300 Subject: [PATCH] import/mediawiki: use namespace IDs for filtering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates the mediawiki importer to use the namespace IDs for filtering instead of matching for the beginning of the article title. This better supports other language versions and non-Wikipedia wikis. Signed-off-by: Taavi Väänänen --- import/mediawiki/main.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/import/mediawiki/main.py b/import/mediawiki/main.py index ea126d3..9c660bc 100644 --- a/import/mediawiki/main.py +++ b/import/mediawiki/main.py @@ -31,6 +31,8 @@ class Page: self.title = title.text redirect = elem.find("{http://www.mediawiki.org/xml/export-0.10/}redirect") self.redirect = redirect + namespace = elem.find("{http://www.mediawiki.org/xml/export-0.10/}ns") + self.namespace = int(namespace.text) rev = elem.find("{http://www.mediawiki.org/xml/export-0.10/}revision") model = rev.find("{http://www.mediawiki.org/xml/export-0.10/}model") @@ -54,12 +56,19 @@ for _, elem in parser: elem.clear() del elem continue - if (page.title.startswith("Wikipedia:") - or page.title.startswith("Template:") - or page.title.startswith("File:")): + + # see https://www.mediawiki.org/wiki/Manual:Namespace for MW core namespaces, + # although larger wikis usually have custom namespaces that can be listed via the API: + # https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=namespaces&formatversion=2 + if page.namespace in ( + 4, # NS_PROJECT (aka Wikipedia:) + 6, # NS_FILE + 10, # NS_TEMPLATE + ): elem.clear() del elem continue + content = mw.parse(page.markup).strip_code() sha = hashlib.sha512() sha.update(content.encode())