Created
September 13, 2018 18:28
-
-
Save snarkmaster/e426af8e7f22dcd4890704d86f9b12f9 to your computer and use it in GitHub Desktop.
Parsing RPM location href from the repo's primary XML
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class _RpmLocationParser(AbstractContextManager):
    '''
    Parses through -primary.xml.gz and extracts `href` from each `location`.

    Feed the gzip-compressed bytes in arbitrary-sized chunks via `feed()`,
    and iterate over what it returns to receive the hrefs discovered so far.
    Use the object as a context manager so that the XML parser gets closed.

    NB: This XML parsing takes ~4 seconds, while it takes ~1.5 seconds to
    unpack -primary.sqlite.bz2 and `SELECT location_href FROM packages`.
    '''

    def __init__(self):
        # Adding 16 to `wbits` makes zlib expect a gzip header and trailer.
        self.decompressor = zlib.decompressobj(wbits=zlib.MAX_WBITS + 16)
        # 'start' is enough to read attributes; 'end' is subscribed to only
        # so that finished elements can be cleared to conserve RAM.
        self.xml_parser = XMLPullParser(['start', 'end'])
        # ElementTree mangles the tags thus: '{xml_namespace}tag_name'
        self.location_re = re.compile('({[^}]+}|)location')

    # This context manager does not suppress exceptions.
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        # Closing the parser breaks some circular refs to conserve RAM.
        self.xml_parser.close()

    def feed(self, chunk: bytes) -> Iterator[str]:
        '''
        Decompresses `chunk`, feeds it to the XML parser, and yields the
        `href` attribute of every `location` element that has started.

        This is a generator: the chunk is only consumed once the caller
        starts iterating, so the result must be iterated to completion.

        Raises `KeyError` if a `location` element has no `href` attribute.
        '''
        self.xml_parser.feed(self.decompressor.decompress(chunk))
        for event, elt in self.xml_parser.read_events():
            if event == 'start':
                # `fullmatch` so that tags merely starting with `location`
                # (e.g. `locations`) are not mistaken for the real thing.
                if self.location_re.fullmatch(elt.tag):
                    yield elt.attrib['href']
            else:
                # The element is complete and its 'start' event was already
                # handled above, so drop its children, text and attributes
                # instead of keeping the whole document tree in memory.
                elt.clear()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I managed to make this ~2x faster by using
Element.clear()
and reducing the chunk size that I feed into XMLPullParser
: