import/mediawiki: use namespace IDs for filtering
Updates the mediawiki importer to filter pages by namespace ID instead of matching on the beginning of the article title. This better supports other language versions and non-Wikipedia wikis. Signed-off-by: Taavi Väänänen <hi@taavi.wtf>
This commit is contained in:
parent
82d73c6e31
commit
00a37d0b48
1 changed file with 12 additions and 3 deletions
|
@ -31,6 +31,8 @@ class Page:
|
|||
self.title = title.text
|
||||
redirect = elem.find("{http://www.mediawiki.org/xml/export-0.10/}redirect")
|
||||
self.redirect = redirect
|
||||
namespace = elem.find("{http://www.mediawiki.org/xml/export-0.10/}ns")
|
||||
self.namespace = int(namespace.text)
|
||||
|
||||
rev = elem.find("{http://www.mediawiki.org/xml/export-0.10/}revision")
|
||||
model = rev.find("{http://www.mediawiki.org/xml/export-0.10/}model")
|
||||
|
@ -54,12 +56,19 @@ for _, elem in parser:
|
|||
elem.clear()
|
||||
del elem
|
||||
continue
|
||||
if (page.title.startswith("Wikipedia:")
|
||||
or page.title.startswith("Template:")
|
||||
or page.title.startswith("File:")):
|
||||
|
||||
# see https://www.mediawiki.org/wiki/Manual:Namespace for MW core namespaces,
|
||||
# although larger wikis usually have custom namespaces that can be listed via the API:
|
||||
# https://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=namespaces&formatversion=2
|
||||
if page.namespace in (
|
||||
4, # NS_PROJECT (aka Wikipedia:)
|
||||
6, # NS_FILE
|
||||
10, # NS_TEMPLATE
|
||||
):
|
||||
elem.clear()
|
||||
del elem
|
||||
continue
|
||||
|
||||
content = mw.parse(page.markup).strip_code()
|
||||
sha = hashlib.sha512()
|
||||
sha.update(content.encode())
|
||||
|
|
Loading…
Reference in a new issue