Skip to content

Instantly share code, notes, and snippets.

@cpcloud
Created November 28, 2022 20:07
Show Gist options
  • Save cpcloud/c6c3697f562db9ab9de681f7cde9c511 to your computer and use it in GitHub Desktop.
Extract parquet metadata from URLs
"""
Given a URL pointing to a Parquet file, how do I get the schema from that file
as cheaply as possible?
Example URL (75 MB):
https://storage.googleapis.com/ibis-tutorial-data/wowah_data/wowah_data_raw.parquet
"""
PARQUET_MAGIC = b"PAR1"
import io
import struct
import pyarrow.parquet as pq
import requests
def range_header(start: int, end: int) -> dict[str, str]:
    """Build an HTTP ``Range`` header requesting bytes ``start``..``end``.

    Both endpoints are inclusive, per RFC 9110 byte-range semantics, so
    ``range_header(0, 1023)`` requests exactly the first 1024 bytes.
    """
    return {"Range": f"bytes={start}-{end}"}
def get_footer(url: str) -> tuple[int, int, bytes]:
    """Fetch the 8-byte tail of a remote Parquet file via HTTP range requests.

    The last 8 bytes of a Parquet file are a little-endian unsigned 32-bit
    footer (metadata) length followed by the magic bytes ``b"PAR1"``.

    Parameters
    ----------
    url
        URL of the Parquet file; the server must support HEAD and Range.

    Returns
    -------
    tuple[int, int, bytes]
        ``(footer_length, total_file_size_in_bytes, raw_8_byte_tail)``.

    Raises
    ------
    requests.HTTPError
        If either HTTP request fails.
    ValueError
        If the tail is not 8 bytes or the magic bytes are missing.
    """
    # 1. Learn the total file size without downloading the body.
    resp = requests.head(url)
    resp.raise_for_status()
    nbytes = int(resp.headers["content-length"])

    # 2. Fetch only the final 8 bytes (inclusive byte range).
    start = nbytes - 8
    end = nbytes - 1
    resp = requests.get(url, headers=range_header(start, end))
    # Bug fix: previously unchecked — an error response would have been
    # parsed as footer bytes.
    resp.raise_for_status()
    content = resp.content
    if len(content) != 8:
        raise ValueError(f"expected 8 tail bytes, got {len(content)}")

    # Bug fix: the Parquet spec mandates a little-endian *unsigned* 32-bit
    # footer length; the previous "i4s" format used native order/signedness.
    footer_length, magic = struct.unpack("<I4s", content)
    if magic != PARQUET_MAGIC:
        raise ValueError(f"not a Parquet file: trailing magic was {magic!r}")
    # Bug fix: annotation previously claimed tuple[int, int] for 3 values.
    return footer_length, nbytes, content
def main(url: str):
    """Return the schema of the Parquet file at *url* using range requests.

    Downloads only the footer (typically a few KB) rather than the whole
    file, then parses it with pyarrow.

    Parameters
    ----------
    url
        URL of the Parquet file.

    Returns
    -------
    pyarrow.parquet.ParquetSchema
        The schema stored in the file's footer metadata.
    """
    footer_length, nbytes, raw_footer_info = get_footer(url)

    # The footer metadata sits immediately before the trailing 8 bytes
    # (length + magic).  Inclusive range: length == end - start + 1.
    start = nbytes - 8 - footer_length
    end = nbytes - 9
    assert end - start + 1 == footer_length, "your math is wrong"

    resp = requests.get(url, headers=range_header(start, end))
    # Bug fix: previously unchecked — an error response would have been
    # handed to pyarrow as footer bytes.
    resp.raise_for_status()
    content = resp.content

    # Reassemble the tail of the file: footer metadata followed by the
    # 8-byte (length + magic) trailer, which is all read_metadata needs.
    blob = io.BytesIO()
    blob.write(content)
    blob.write(raw_footer_info)
    metadata = pq.read_metadata(blob)
    return metadata.schema
if __name__ == "__main__":
import argparse
p = argparse.ArgumentParser()
p.add_argument("url", type=str)
args = p.parse_args()
schema = main(args.url)
print(schema)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment