Created
March 1, 2011 19:12
-
-
Save vpetro/849682 to your computer and use it in GitHub Desktop.
SAX-based parser to extract blocks of elements as Python dictionaries
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime, timedelta
from xml.sax import ContentHandler, parseString
class Parser(ContentHandler):
    """SAX handler that extracts repeated blocks of elements as dictionaries.

    Every element named ``block_name`` starts a new block. The text of each
    element closed inside the block is recorded under the element's name, and
    once the document ends each block is turned into a dictionary. A name that
    occurs several times inside one block maps to a list of its values, in
    document order.

    Args:
        block_name: Name of the element that delimits one block.
        attrs: Optional mapping of element name to an attribute name. When a
            listed element is opened inside a block and carries that
            attribute, its value is recorded under the key
            ``"<element>_<attribute>"``.

    After parsing, the resulting list of dictionaries is stored in
    ``self._blocks``.
    """

    def __init__(self, block_name, attrs=None):
        ContentHandler.__init__(self)
        self._block_name = block_name
        # Name of the most recently opened non-block element.
        self._name = None
        # Finished blocks: lists of (name, value) pairs until endDocument
        # converts them into dictionaries.
        self._blocks = []
        # Pairs collected for the block being parsed; None outside a block.
        self._current_block = None
        # Character data accumulated since the last recorded element.
        self._content = ""
        self._attrs = attrs

    def startElement(self, name, attrs):
        """Open a new block or remember which element is being read."""
        if self._block_name == name:
            self._current_block = []
        else:
            self._name = name

        # Attributes can only be recorded while a block is open.
        if self._current_block is None or not self._attrs:
            return

        # Look up the element that is actually being opened (not a stale name
        # left over from a previous element) and tolerate a missing attribute.
        attr_name = self._attrs.get(name)
        if attr_name is not None and attr_name in attrs:
            self._current_block.append(
                ("%s_%s" % (name, attr_name), attrs[attr_name])
            )

    def characters(self, content):
        """Accumulate character data; text outside any block is ignored."""
        if self._current_block is not None:
            self._content += content

    def endElement(self, name):
        """Record the closed element's text and finish the block if it ends."""
        if self._current_block is None:
            # Closing tag outside any block (e.g. the document root).
            return

        if self._name == name and len(self._content) > 0:
            self._current_block.append((self._name, self._content.strip()))
            self._content = ""

        if self._block_name == name:
            self._blocks.append(self._current_block)
            self._current_block = None
            # Trailing text of this block must not leak into the next one.
            self._content = ""

    def endDocument(self):
        """Convert every collected block into a dictionary."""
        result = []
        for block in self._blocks:
            if isinstance(block, dict):
                # Already converted by an earlier endDocument call.
                result.append(block)
                continue

            merged = {}
            for name, value in block:
                if name not in merged:
                    merged[name] = value
                elif isinstance(merged[name], list):
                    merged[name].append(value)
                else:
                    # Second occurrence: promote the single value to a list.
                    merged[name] = [merged[name], value]
            result.append(merged)
        self._blocks = result
def parse_date(date_string):
    """Convert a date string into a ``datetime`` at midnight.

    Args:
        date_string: ``None``, an empty/blank string, the literal
            ``"ongoing"``, or a ``YYYY-MM-DD`` date optionally followed by
            whitespace and a time portion (the time portion is ignored).

    Returns:
        ``None`` for ``None`` or blank input; midnight seven days from today
        for ``"ongoing"``; otherwise the parsed date as a ``datetime``.

    Raises:
        ValueError: If the date portion is not three ``-``-separated integers
            forming a valid calendar date.
    """
    if date_string is None:
        return None

    # Splitting on whitespace separates the date from an optional time part
    # and yields an empty list for empty or blank input.
    parts = date_string.split()
    if not parts:
        return None

    if date_string.strip() == "ongoing":
        # Real calendar arithmetic: adding 7 to the day number overflows near
        # the end of a month, timedelta rolls over months and years correctly.
        today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
        return today + timedelta(days=7)

    # Only the first portion is the date; anything after it is the time.
    year, month, day = parts[0].split('-')
    return datetime(int(year), int(month), int(day))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment