Created
January 11, 2019 03:54
-
-
Save onlurking/5dc00d9473bab4cb1d14b49d11951791 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from xmltodict import parse | |
from collections import namedtuple | |
# Load the whole MediaWiki XML dump into nested mappings. The file is opened
# in binary mode, so the raw bytes are handed to the XML parser as-is.
with open("vimwikiacom-20190108-current.xml", "rb") as file:
    # force_list keeps 'page' a list even when the dump contains only one
    # <page> element; without it a single page would come back as a bare
    # mapping and the list operations further down would misbehave.
    wiki = parse(file, xml_attribs=True, force_list=('page',))

# All <page> entries found under the <mediawiki> root element.
pages = wiki['mediawiki']['page']
def extract_pages(section, all_pages=None):
    """Return the pages whose title starts with *section*.

    Args:
        section: Title prefix to match; a string or a tuple of strings, as
            accepted by ``str.startswith``.
        all_pages: Optional iterable of page mappings (each with a
            ``'title'`` key) to search. When omitted, the module-level
            ``pages`` list is used.

    Returns:
        A new list with the matching page mappings, in their original order.
    """
    if all_pages is None:
        # The module-level list is looked up at call time, so any later
        # rebinding of ``pages`` changes what this default searches.
        all_pages = pages
    return [page for page in all_pages if page['title'].startswith(section)]
# TODO: write code to read the namespaces from the wiki dump
# Title prefixes of the pages to leave out; hard-coded for now.
page_types = (
    'VimTip',
    'Talk:',
    'User:',
    'User talk:',
    'Vim Tips Wiki:',
    'Vim Tips Wiki talk:',
    'Template talk:',
    'File:',
    'MediaWiki:',
    'MediaWiki talk:',
    'Template:',
    'Help:',
    'Help talk:',
    'Category:',
    'Category talk:',
    'Forum:',
    'Script:',
    'Script talk:',
)

# Rebind ``pages`` to the entries whose title starts with none of the
# prefixes above (``str.startswith`` accepts a tuple of candidates).
pages = [
    entry
    for entry in pages
    if not entry['title'].startswith(page_types)
]
# Record type for one parsed wiki page. Built once at import time rather than
# on every parse_page() call, so all results share a single class.
Page = namedtuple('Page', 'title content comments last_updated id')


def parse_page(page):
    """Flatten one page mapping from the dump into a ``Page`` record.

    Args:
        page: Mapping with ``'title'`` and ``'id'`` keys plus a ``'revision'``
            mapping that holds ``'timestamp'`` and, optionally, ``'text'``
            and ``'comment'``.

    Returns:
        A ``Page`` namedtuple ``(title, content, comments, last_updated, id)``.
        ``content`` is ``''`` when the revision carries no text, and
        ``comments`` is ``None`` when the revision has no ``'comment'`` entry.

    Raises:
        KeyError: If ``'title'``, ``'id'``, ``'revision'`` or the revision's
            ``'timestamp'`` is missing.
    """
    revision = page['revision']

    # The text node can be a mapping with the body under '#text' (when the
    # element carries attributes), a bare string, or absent/None for an
    # empty page; normalise all of these to a plain string.
    text = revision.get('text')
    if isinstance(text, dict):
        text = text.get('#text')

    return Page(
        title=page['title'],
        content=text if text is not None else '',
        # Not every revision has an edit summary, so do not require one.
        comments=revision.get('comment'),
        last_updated=revision['timestamp'],
        id=page['id'],
    )
# Quick manual check: take the entry at index 500 of the filtered page list,
# parse it, and print its text.
_sample = pages[500]
n = parse_page(_sample)
print(n.content)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment