Skip to content

Instantly share code, notes, and snippets.

@hoshiyosan
Last active June 29, 2023 12:32
Show Gist options
  • Save hoshiyosan/9cc71270de895d181f8f35d7c663f9d1 to your computer and use it in GitHub Desktop.
Save hoshiyosan/9cc71270de895d181f8f35d7c663f9d1 to your computer and use it in GitHub Desktop.
Iterate over a large XML file hosted on a web server. The download is performed once; afterwards, the locally cached file is used.
import io
import logging
import os
from typing import Generator
import xml.etree.ElementTree
import requests
# Module-level logger shared by every function in this file.
LOG = logging.getLogger(__name__)

# Root logger configuration: emit everything from DEBUG upwards using a
# compact "LEVEL :: message" layout.
logging.basicConfig(
    format="%(levelname)s :: %(message)s",
    level=logging.DEBUG,
)
class BytesStream(io.RawIOBase):
    """Read-only, file-like adapter around an iterator of ``bytes`` chunks.

    The stream derives from :class:`io.RawIOBase` (a binary stream), so the
    inherited helpers such as ``readline`` or ``readall`` work on top of
    :meth:`read`.

    :param generator: Iterable producing the successive ``bytes`` chunks of
        the stream. It is consumed lazily, one chunk at a time.
    """

    def __init__(self, generator: Generator[bytes, None, None]):
        super().__init__()
        self.generator = iter(generator)
        # Bytes already pulled from the generator but not yet handed out
        # (happens when a chunk is larger than the requested size).
        self._pending = b""

    def readable(self) -> bool:
        """Report that the stream supports reading."""
        return True

    def read(self, size: int = -1) -> bytes:
        """Return at most ``size`` bytes from the stream.

        At most one chunk is pulled from the underlying generator per call
        (apart from empty chunks, which are skipped), so data is handed out
        as soon as it is available.

        :param size: Maximum number of bytes to return. A negative value or
            ``None`` drains the generator and returns everything left.
        :return: The bytes read, or ``b""`` once the stream is exhausted.
        """
        if size is None or size < 0:
            data = self._pending + b"".join(self.generator)
            self._pending = b""
            return data
        if size == 0:
            return b""
        # An empty chunk must not be mistaken for end-of-stream: keep pulling
        # until there is data or the generator is really exhausted.
        while not self._pending:
            try:
                self._pending = next(self.generator)
            except StopIteration:
                return b""
        data, self._pending = self._pending[:size], self._pending[size:]
        return data
def download_file_by_chunks(url: str, filename: str, chunk_size: int, timeout: float = 30.0):
    """Download a remote file chunk by chunk, yielding each chunk as it arrives.

    Chunks are written to a temporary ``<filename>.part`` file which is only
    renamed to ``filename`` once the download has completed. If the request
    fails, or the consumer stops iterating before the end, the partial file is
    removed so that a truncated download is never mistaken for a valid cache.

    :param url: URL of the remote file.
    :param filename: Path where the fully downloaded file is stored.
    :param chunk_size: Chunk size passed to ``response.iter_content``.
    :param timeout: Timeout in seconds passed to ``requests.get``.
    :return: Generator yielding the downloaded ``bytes`` chunks.
    """
    LOG.info("Downloading file %s from: %s", filename, url)
    partial_filename = filename + ".part"
    completed = False
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    LOG.debug("Reading/saving chunk from remote file: %s", url)
                    # Persist before yielding so the bytes are saved even if
                    # the consumer never resumes this generator.
                    f.write(chunk)
                    yield chunk
        # Reached only after the whole body was received and written.
        os.replace(partial_filename, filename)
        completed = True
    finally:
        if not completed:
            try:
                os.remove(partial_filename)
            except FileNotFoundError:
                # The request failed before the partial file was created.
                pass
def read_file_by_chunks(filename: str, chunk_size: int):
    """Read the content of an existing file chunk by chunk.

    :param filename: Path of the file to read.
    :param chunk_size: Size argument passed to each ``read`` call.
    :return: Generator yielding the non-empty ``bytes`` chunks of the file.
    """
    LOG.info("Loading file %s from disk", filename)
    with open(filename, "rb") as f:
        while True:
            LOG.debug("Reading chunk from file: %s", filename)
            chunk = f.read(chunk_size)
            if not chunk:
                # End of file: stop without yielding the empty read.
                break
            yield chunk
def read_or_download_file_by_chunks(url: str, filename: str, chunk_size: int = 1024):
    """Return a chunk generator over a remote file that is cached locally.

    The existence check happens when this function is called: if ``filename``
    is missing the returned generator downloads it from ``url``, otherwise the
    returned generator reads the local copy.
    """
    if not os.path.exists(filename):
        # Not cached yet: fetch it from the web server.
        return download_file_by_chunks(url, filename, chunk_size=chunk_size)
    # Already cached: serve it from disk.
    return read_file_by_chunks(filename, chunk_size=chunk_size)
def iter_xml_elements(url: str, filename: str, tag: str = None, *, clear: bool = False):
    """Iterate over the elements contained in an XML file.

    The file is parsed incrementally; each element is yielded when its closing
    tag has been parsed.

    :param url: URL of the remote XML file.
    :param filename: Name used to cache the file locally.
    :param tag: (optional) Only iterate over elements with the specified tag.
    :param clear: (optional) When true, call ``clear()`` on each yielded
        element once the consumer asks for the next one, releasing its
        children, text and attributes. Do not keep references to yielded
        elements in that mode. Note that with ``tag=None`` children are
        cleared before their parent is yielded.
    :return: Generator yielding the matching elements.
    """
    bytes_generator = read_or_download_file_by_chunks(url, filename, chunk_size=4096)
    try:
        file_like_byte_stream = BytesStream(bytes_generator)
        for _, element in xml.etree.ElementTree.iterparse(file_like_byte_stream, events=("end",)):
            if tag is None or element.tag == tag:
                yield element
                if clear:
                    element.clear()
    finally:
        # Release the underlying file / HTTP connection right away, even when
        # parsing fails or the consumer stops iterating early.
        bytes_generator.close()
def process_file(url: str, filename: str):
    """Log every ``CD`` element of a large XML file hosted on a remote web server.

    :param url: URL of the remote XML file.
    :param filename: Name used to cache the file locally.
    """
    for xml_element in iter_xml_elements(url, filename, tag="CD"):
        # findtext returns None when the child is missing instead of raising
        # AttributeError on ``None.text``.
        title = xml_element.findtext("TITLE")
        artist = xml_element.findtext("ARTIST")
        year = xml_element.findtext("YEAR")
        LOG.info("Parsed CD: %s - %s (%s)", title, artist, year)
        # The element is no longer needed: drop its children and text so that
        # memory usage does not grow with the size of the file.
        xml_element.clear()
# Script entry point: parse the sample CD catalog, downloading it on the
# first run and reusing the cached copy afterwards.
if __name__ == "__main__":
    process_file(
        url="https://www.w3schools.com/xml/cd_catalog.xml",
        filename="cd_catalog.xml",
    )
INFO :: Downloading file cd_catalog.xml from: https://www.w3schools.com/xml/cd_catalog.xml
DEBUG :: Starting new HTTPS connection (1): www.w3schools.com:443
DEBUG :: https://www.w3schools.com:443 "GET /xml/cd_catalog.xml HTTP/1.1" 200 1016
DEBUG :: Reading/saving chunk from remote file: https://www.w3schools.com/xml/cd_catalog.xml
INFO :: Parsed CD: Empire Burlesque - Bob Dylan (1985)
INFO :: Parsed CD: Hide your heart - Bonnie Tyler (1988)
INFO :: Parsed CD: Greatest Hits - Dolly Parton (1982)
INFO :: Parsed CD: Still got the blues - Gary Moore (1990)
INFO :: Parsed CD: Eros - Eros Ramazzotti (1997)
INFO :: Parsed CD: One night only - Bee Gees (1998)
INFO :: Parsed CD: Sylvias Mother - Dr.Hook (1973)
INFO :: Parsed CD: Maggie May - Rod Stewart (1990)
INFO :: Parsed CD: Romanza - Andrea Bocelli (1996)
INFO :: Parsed CD: When a man loves a woman - Percy Sledge (1987)
INFO :: Parsed CD: Black angel - Savage Rose (1995)
INFO :: Parsed CD: 1999 Grammy Nominees - Many (1999)
INFO :: Parsed CD: For the good times - Kenny Rogers (1995)
INFO :: Parsed CD: Big Willie style - Will Smith (1997)
INFO :: Parsed CD: Tupelo Honey - Van Morrison (1971)
INFO :: Parsed CD: Soulsville - Jorn Hoel (1996)
INFO :: Parsed CD: The very best of - Cat Stevens (1990)
INFO :: Parsed CD: Stop - Sam Brown (1988)
INFO :: Parsed CD: Bridge of Spies - T'Pau (1987)
INFO :: Parsed CD: Private Dancer - Tina Turner (1983)
INFO :: Parsed CD: Midt om natten - Kim Larsen (1983)
DEBUG :: Reading/saving chunk from remote file: https://www.w3schools.com/xml/cd_catalog.xml
INFO :: Parsed CD: Pavarotti Gala Concert - Luciano Pavarotti (1991)
INFO :: Parsed CD: The dock of the bay - Otis Redding (1968)
INFO :: Parsed CD: Picture book - Simply Red (1985)
INFO :: Parsed CD: Red - The Communards (1987)
INFO :: Parsed CD: Unchain my heart - Joe Cocker (1987)
INFO :: Loading file cd_catalog.xml from disk
DEBUG :: Reading chunk from file: cd_catalog.xml
INFO :: Parsed CD: Empire Burlesque - Bob Dylan (1985)
INFO :: Parsed CD: Hide your heart - Bonnie Tyler (1988)
INFO :: Parsed CD: Greatest Hits - Dolly Parton (1982)
INFO :: Parsed CD: Still got the blues - Gary Moore (1990)
INFO :: Parsed CD: Eros - Eros Ramazzotti (1997)
INFO :: Parsed CD: One night only - Bee Gees (1998)
INFO :: Parsed CD: Sylvias Mother - Dr.Hook (1973)
INFO :: Parsed CD: Maggie May - Rod Stewart (1990)
INFO :: Parsed CD: Romanza - Andrea Bocelli (1996)
INFO :: Parsed CD: When a man loves a woman - Percy Sledge (1987)
INFO :: Parsed CD: Black angel - Savage Rose (1995)
INFO :: Parsed CD: 1999 Grammy Nominees - Many (1999)
INFO :: Parsed CD: For the good times - Kenny Rogers (1995)
INFO :: Parsed CD: Big Willie style - Will Smith (1997)
INFO :: Parsed CD: Tupelo Honey - Van Morrison (1971)
INFO :: Parsed CD: Soulsville - Jorn Hoel (1996)
INFO :: Parsed CD: The very best of - Cat Stevens (1990)
INFO :: Parsed CD: Stop - Sam Brown (1988)
INFO :: Parsed CD: Bridge of Spies - T'Pau (1987)
INFO :: Parsed CD: Private Dancer - Tina Turner (1983)
INFO :: Parsed CD: Midt om natten - Kim Larsen (1983)
DEBUG :: Reading chunk from file: cd_catalog.xml
INFO :: Parsed CD: Pavarotti Gala Concert - Luciano Pavarotti (1991)
INFO :: Parsed CD: The dock of the bay - Otis Redding (1968)
INFO :: Parsed CD: Picture book - Simply Red (1985)
INFO :: Parsed CD: Red - The Communards (1987)
INFO :: Parsed CD: Unchain my heart - Joe Cocker (1987)
DEBUG :: Reading chunk from file: cd_catalog.xml
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment