Skip to content

Instantly share code, notes, and snippets.

@onlurking
Created January 11, 2019 03:54
Show Gist options
  • Save onlurking/5dc00d9473bab4cb1d14b49d11951791 to your computer and use it in GitHub Desktop.
Save onlurking/5dc00d9473bab4cb1d14b49d11951791 to your computer and use it in GitHub Desktop.
from xmltodict import parse
from collections import namedtuple
# Read the whole MediaWiki XML export into nested mappings. The file is opened
# in binary mode and handed to the XML parser as a file object.
with open("vimwikiacom-20190108-current.xml", "rb") as file:
    wiki = parse(file, xml_attribs=True)

# Everything stored under <mediawiki><page> in the dump.
# NOTE(review): presumably a list of per-page mappings -- confirm for tiny dumps.
pages = wiki['mediawiki']['page']
def extract_pages(section, candidates=None):
    """Return the pages whose title begins with ``section``.

    Args:
        section: Title prefix to match. It is handed straight to
            ``str.startswith``, so a tuple of prefixes is accepted as well.
        candidates: Optional iterable of page mappings (each with a
            ``'title'`` key) to search. When omitted, the module-level
            ``pages`` is searched, as it is bound at call time.

    Returns:
        A new list holding the matching pages in their original order.
    """
    if candidates is None:
        # Look the global up lazily so a later rebinding of ``pages`` is seen.
        candidates = pages
    return [page for page in candidates if page['title'].startswith(section)]
# TODO: write code to read the namespaces from the wiki dump
# Hard-coded title prefixes; any page whose title starts with one is dropped.
page_types = (
    'VimTip',
    'Talk:',
    'User:',
    'User talk:',
    'Vim Tips Wiki:',
    'Vim Tips Wiki talk:',
    'Template talk:',
    'File:',
    'MediaWiki:',
    'MediaWiki talk:',
    'Template:',
    'Help:',
    'Help talk:',
    'Category:',
    'Category talk:',
    'Forum:',
    'Script:',
    'Script talk:',
)

# str.startswith takes a tuple, so a single call tests every prefix at once.
pages = [
    entry
    for entry in pages
    if not entry['title'].startswith(page_types)
]
# Record type for one wiki page. Built once at import time so that every
# result of parse_page shares the same class.
Page = namedtuple('Page', 'title content comments last_updated id')


def parse_page(page):
    """Flatten one page mapping from the dump into a ``Page`` record.

    Args:
        page: Mapping with ``'title'``, ``'id'`` and a ``'revision'`` mapping
            that holds ``'timestamp'`` and, optionally, ``'comment'`` and
            ``'text'``.
            NOTE(review): assumes a single revision per page (a mapping, not
            a list of revisions) -- confirm for full-history dumps.

    Returns:
        Page: ``(title, content, comments, last_updated, id)``. ``content``
        is ``''`` when the revision carries no text, and ``comments`` is
        ``None`` when the revision has no comment.

    Raises:
        KeyError: If ``'title'``, ``'id'``, ``'revision'`` or the revision's
            ``'timestamp'`` is missing.
    """
    revision = page['revision']

    text = revision.get('text')
    if isinstance(text, dict):
        # Element parsed together with its attributes: the body, when there
        # is one, sits under '#text'; an empty element has no such key.
        content = text.get('#text') or ''
    else:
        # Plain string (no attributes) or None (empty/missing element).
        content = text or ''

    return Page(
        title=page['title'],
        content=content,
        # Not every revision has an edit summary; a missing one becomes None.
        comments=revision.get('comment'),
        last_updated=revision['timestamp'],
        id=page['id'],
    )
# Ad-hoc check: convert the entry at index 500 of `pages` into a Page record
# and print its content field. The index is hard-coded, so this raises
# IndexError when `pages` holds 500 or fewer entries.
n = parse_page(pages[500])
print(n.content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment