Skip to content

Instantly share code, notes, and snippets.

@WillKoehrsen
Last active October 9, 2018 05:04
Show Gist options
  • Save WillKoehrsen/1c3afe1786cb36296b82892bd3a914f4 to your computer and use it in GitHub Desktop.
Save WillKoehrsen/1c3afe1786cb36296b82892bd3a914f4 to your computer and use it in GitHub Desktop.
import xml.sax
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects (title, text) pairs from Wiki XML.

    Character data found inside ``<title>`` and ``<text>`` elements is
    buffered while the element is open. Every closing ``</page>`` appends
    a ``(title, text)`` tuple to ``self._pages``. A page that lacks one of
    the two elements records an empty string for it.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        # Chunks of character data for the element currently being captured.
        self._buffer = None
        # Captured element name -> full text, for the page being parsed.
        self._values = {}
        # Name of the element being captured, or None when outside one.
        self._current_tag = None
        # Completed (title, text) tuples, in document order.
        self._pages = []

    def characters(self, content):
        """Buffer a chunk of character data while inside a captured element."""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start capturing when a ``title`` or ``text`` element opens."""
        if name in ('title', 'text'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Store captured text on close; emit a page tuple on ``</page>``."""
        if name == self._current_tag:
            # The parser may deliver contiguous text in several chunks
            # (e.g. around entity references), so concatenate without a
            # separator to reproduce the original text exactly.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing so text between elements is not accumulated.
            self._current_tag = None
            self._buffer = None
        if name == 'page':
            self._pages.append(
                (self._values.get('title', ''), self._values.get('text', ''))
            )
            # Start the next page clean so it cannot inherit stale values.
            self._values = {}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment