Last active
June 29, 2023 12:32
-
-
Save hoshiyosan/9cc71270de895d181f8f35d7c663f9d1 to your computer and use it in GitHub Desktop.
Iterate over a large XML file hosted on a web server. The download is performed once; subsequent runs read the locally cached file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io
import logging
import os
import xml.etree.ElementTree
from typing import Generator, Iterable, Optional

import requests
# Root logger: emit everything from DEBUG upwards as "<LEVEL> :: <message>".
logging.basicConfig(
    format="%(levelname)s :: %(message)s",
    level=logging.DEBUG,
)

# Module-level logger used by every function in this file.
LOG = logging.getLogger(__name__)
class BytesStream(io.RawIOBase):
    """Read-only, file-like wrapper around an iterable of ``bytes`` chunks.

    Lets code that expects an object with a ``read()`` method (for example
    ``xml.etree.ElementTree.iterparse``) consume data that is produced lazily,
    chunk by chunk.

    :param generator: Iterable (typically a generator) yielding ``bytes``.
    """

    def __init__(self, generator: Iterable[bytes]):
        super().__init__()
        # Keep an iterator so plain iterables (lists, tuples) work as well.
        self.generator = iter(generator)
        # Bytes already pulled from the iterator but not yet handed out.
        self._pending = b""

    def readable(self) -> bool:
        """Report that this stream supports reading."""
        return True

    def _next_chunk(self) -> bytes:
        """Return the next non-empty chunk, or ``b""`` once exhausted."""
        # Empty chunks are skipped: returning one from read() would be
        # mistaken for end-of-file by the caller.
        for chunk in self.generator:
            if chunk:
                return chunk
        return b""

    def read(self, size: int = -1) -> bytes:
        """Return at most ``size`` bytes; ``b""`` signals end of stream.

        :param size: Maximum number of bytes to return. ``None`` or a
            negative value reads everything that is left.
        """
        if size is None or size < 0:
            parts = [self._pending]
            parts.extend(self.generator)
            self._pending = b""
            return b"".join(parts)
        if size == 0:
            return b""
        data = self._pending or self._next_chunk()
        # Anything beyond ``size`` is kept for the next call.
        self._pending = data[size:]
        return data[:size]
def download_file_by_chunks(url: str, filename: str, chunk_size: int, timeout: float = 30.0):
    """Download a remote file chunk by chunk, saving it to disk while yielding it.

    The data is first written to ``<filename>.part`` and only moved to
    ``filename`` once the whole response has been consumed, so an interrupted
    or abandoned download never leaves a truncated file under the final name.

    :param url: URL of the remote file.
    :param filename: Path where the fully downloaded file is stored.
    :param chunk_size: Chunk size passed to ``response.iter_content``.
    :param timeout: Timeout in seconds passed to ``requests.get``.
    :return: Generator yielding the downloaded ``bytes`` chunks.
    :raises requests.HTTPError: If the server answers with an error status.
    """
    LOG.info("Downloading file %s from: %s", filename, url)
    partial_filename = filename + ".part"
    completed = False
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if not chunk:
                        # An empty chunk would look like end-of-file to readers.
                        continue
                    LOG.debug("Reading/saving chunk from remote file: %s", url)
                    # Write before yielding so the chunk is saved even if the
                    # consumer stops iterating right after receiving it.
                    f.write(chunk)
                    yield chunk
        os.replace(partial_filename, filename)
        completed = True
    finally:
        if not completed:
            # Error or early exit: drop the incomplete download.
            try:
                os.remove(partial_filename)
            except FileNotFoundError:
                pass
def read_file_by_chunks(filename: str, chunk_size: int):
    """Read the content of an existing file chunk by chunk.

    :param filename: Path of the file to read (opened in binary mode).
    :param chunk_size: Maximum number of bytes per chunk.
    :return: Generator yielding non-empty ``bytes`` chunks; it simply stops
        at end of file instead of yielding a trailing ``b""``.
    """
    LOG.info("Loading file %s from disk", filename)
    with open(filename, "rb") as f:
        while True:
            LOG.debug("Reading chunk from file: %s", filename)
            chunk = f.read(chunk_size)
            if not chunk:
                # End of file reached: stop without emitting an empty chunk.
                break
            yield chunk
def read_or_download_file_by_chunks(url: str, filename: str, chunk_size: int = 1024):
    """Return a chunk iterator over a remote file that is cached locally.

    When ``filename`` does not exist yet the file is downloaded from ``url``
    (and saved along the way); otherwise the local copy is read.

    :param url: URL of the remote file.
    :param filename: Path of the local cache file.
    :param chunk_size: Size of the chunks, forwarded to the reader/downloader.
    """
    if not os.path.exists(filename):
        # Nothing cached yet: fetch from the web server.
        return download_file_by_chunks(url, filename, chunk_size=chunk_size)
    return read_file_by_chunks(filename, chunk_size=chunk_size)
def iter_xml_elements(url: str, filename: str, tag: Optional[str] = None, *, clear: bool = False):
    """Iterate over the elements contained in a (possibly remote) XML file.

    The file is parsed incrementally, so elements are yielded as soon as
    their closing tag has been read.

    :param url: URL of the remote XML file.
    :param filename: Name used to cache the file locally.
    :param tag: (optional) Only iterate elements with the specified tag.
    :param clear: (optional) Call ``element.clear()`` on each yielded element
        once the consumer asks for the next one, so its content can be
        released while parsing large files. Only use it when elements are
        fully processed inside the loop body; with ``tag=None`` children are
        yielded (and cleared) before their parent.
    """
    bytes_generator = read_or_download_file_by_chunks(url, filename, chunk_size=4096)
    file_like_byte_stream = BytesStream(bytes_generator)
    try:
        for _, element in xml.etree.ElementTree.iterparse(file_like_byte_stream, events=("end",)):
            if tag is None or element.tag == tag:
                yield element
                if clear:
                    element.clear()
    finally:
        # Close the chunk generator so its resources are released promptly,
        # also when iteration stops early or parsing fails.
        bytes_generator.close()
def _child_text(element, child_tag: str):
    """Return the text of the first ``child_tag`` child, or None if it is absent."""
    child = element.find(child_tag)
    return None if child is None else child.text


def process_file(url: str, filename: str):
    """Iterate over elements of a large XML file hosted on a remote web server.

    Logs the title, artist and year of every ``CD`` element. A missing child
    element is logged as ``None`` instead of aborting the whole run.

    :param url: URL of the remote XML file.
    :param filename: Name used to cache the file locally.
    """
    for xml_element in iter_xml_elements(url, filename, tag="CD"):
        title = _child_text(xml_element, "TITLE")
        artist = _child_text(xml_element, "ARTIST")
        year = _child_text(xml_element, "YEAR")
        LOG.info("Parsed CD: %s - %s (%s)", title, artist, year)
if __name__ == "__main__":
    # Demo run: process the sample CD catalogue, cached as cd_catalog.xml.
    process_file(
        "https://www.w3schools.com/xml/cd_catalog.xml",
        "cd_catalog.xml",
    )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INFO :: Downloading file cd_catalog.xml from: https://www.w3schools.com/xml/cd_catalog.xml | |
DEBUG :: Starting new HTTPS connection (1): www.w3schools.com:443 | |
DEBUG :: https://www.w3schools.com:443 "GET /xml/cd_catalog.xml HTTP/1.1" 200 1016 | |
DEBUG :: Reading/saving chunk from remote file: https://www.w3schools.com/xml/cd_catalog.xml | |
INFO :: Parsed CD: Empire Burlesque - Bob Dylan (1985) | |
INFO :: Parsed CD: Hide your heart - Bonnie Tyler (1988) | |
INFO :: Parsed CD: Greatest Hits - Dolly Parton (1982) | |
INFO :: Parsed CD: Still got the blues - Gary Moore (1990) | |
INFO :: Parsed CD: Eros - Eros Ramazzotti (1997) | |
INFO :: Parsed CD: One night only - Bee Gees (1998) | |
INFO :: Parsed CD: Sylvias Mother - Dr.Hook (1973) | |
INFO :: Parsed CD: Maggie May - Rod Stewart (1990) | |
INFO :: Parsed CD: Romanza - Andrea Bocelli (1996) | |
INFO :: Parsed CD: When a man loves a woman - Percy Sledge (1987) | |
INFO :: Parsed CD: Black angel - Savage Rose (1995) | |
INFO :: Parsed CD: 1999 Grammy Nominees - Many (1999) | |
INFO :: Parsed CD: For the good times - Kenny Rogers (1995) | |
INFO :: Parsed CD: Big Willie style - Will Smith (1997) | |
INFO :: Parsed CD: Tupelo Honey - Van Morrison (1971) | |
INFO :: Parsed CD: Soulsville - Jorn Hoel (1996) | |
INFO :: Parsed CD: The very best of - Cat Stevens (1990) | |
INFO :: Parsed CD: Stop - Sam Brown (1988) | |
INFO :: Parsed CD: Bridge of Spies - T'Pau (1987) | |
INFO :: Parsed CD: Private Dancer - Tina Turner (1983) | |
INFO :: Parsed CD: Midt om natten - Kim Larsen (1983) | |
DEBUG :: Reading/saving chunk from remote file: https://www.w3schools.com/xml/cd_catalog.xml | |
INFO :: Parsed CD: Pavarotti Gala Concert - Luciano Pavarotti (1991) | |
INFO :: Parsed CD: The dock of the bay - Otis Redding (1968) | |
INFO :: Parsed CD: Picture book - Simply Red (1985) | |
INFO :: Parsed CD: Red - The Communards (1987) | |
INFO :: Parsed CD: Unchain my heart - Joe Cocker (1987) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
INFO :: Loading file cd_catalog.xml from disk | |
DEBUG :: Reading chunk from file: cd_catalog.xml | |
INFO :: Parsed CD: Empire Burlesque - Bob Dylan (1985) | |
INFO :: Parsed CD: Hide your heart - Bonnie Tyler (1988) | |
INFO :: Parsed CD: Greatest Hits - Dolly Parton (1982) | |
INFO :: Parsed CD: Still got the blues - Gary Moore (1990) | |
INFO :: Parsed CD: Eros - Eros Ramazzotti (1997) | |
INFO :: Parsed CD: One night only - Bee Gees (1998) | |
INFO :: Parsed CD: Sylvias Mother - Dr.Hook (1973) | |
INFO :: Parsed CD: Maggie May - Rod Stewart (1990) | |
INFO :: Parsed CD: Romanza - Andrea Bocelli (1996) | |
INFO :: Parsed CD: When a man loves a woman - Percy Sledge (1987) | |
INFO :: Parsed CD: Black angel - Savage Rose (1995) | |
INFO :: Parsed CD: 1999 Grammy Nominees - Many (1999) | |
INFO :: Parsed CD: For the good times - Kenny Rogers (1995) | |
INFO :: Parsed CD: Big Willie style - Will Smith (1997) | |
INFO :: Parsed CD: Tupelo Honey - Van Morrison (1971) | |
INFO :: Parsed CD: Soulsville - Jorn Hoel (1996) | |
INFO :: Parsed CD: The very best of - Cat Stevens (1990) | |
INFO :: Parsed CD: Stop - Sam Brown (1988) | |
INFO :: Parsed CD: Bridge of Spies - T'Pau (1987) | |
INFO :: Parsed CD: Private Dancer - Tina Turner (1983) | |
INFO :: Parsed CD: Midt om natten - Kim Larsen (1983) | |
DEBUG :: Reading chunk from file: cd_catalog.xml | |
INFO :: Parsed CD: Pavarotti Gala Concert - Luciano Pavarotti (1991) | |
INFO :: Parsed CD: The dock of the bay - Otis Redding (1968) | |
INFO :: Parsed CD: Picture book - Simply Red (1985) | |
INFO :: Parsed CD: Red - The Communards (1987) | |
INFO :: Parsed CD: Unchain my heart - Joe Cocker (1987) | |
DEBUG :: Reading chunk from file: cd_catalog.xml |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment