cpcloud · November 28, 2022 20:07
diff --git a/getschema.py b/getschema.py
 """
 Given a URL pointing to a Parquet file, how do I get the schema from that file
 as cheaply as possible?

 Example URL (75 MB):
    https://storage.googleapis.com/ibis-tutorial-data/wowah_data/wowah_data_raw.parquet
 """

 PARQUET_MAGIC = b"PAR1"

 import io
 import struct

 import pyarrow.parquet as pq
 import requests


 def range_header(start, end):
    """Build the HTTP ``Range`` header selecting bytes ``start`` to ``end``.

    Both offsets are inclusive, so the first 1024 bytes of a resource are
    requested as 0-1023, not 0-1024.
    """
    byte_range = f"bytes={start}-{end}"
    return {"Range": byte_range}


 def get_footer(url: str) -> tuple[int, int, bytes]:
    """Fetch and decode the 8-byte trailer of the Parquet file at ``url``.

    The trailer is the last 8 bytes of the file: a 4-byte little-endian
    footer length followed by the 4-byte magic ``PAR1``.

    Returns:
        ``(footer_length, nbytes, content)`` where ``footer_length`` is the
        decoded size of the footer, ``nbytes`` is the total file size
        reported by the server and ``content`` is the raw 8-byte trailer.

    Raises:
        requests.HTTPError: if either HTTP request returns an error status.
        ValueError: if the file is shorter than a trailer, the server does
            not return exactly 8 bytes, or the magic bytes do not match.
    """
    # 1. get the number of bytes in the file without downloading it.
    # requests.head() does not follow redirects by default; without this the
    # content-length could describe a redirect response instead of the file.
    resp = requests.head(url, allow_redirects=True)
    resp.raise_for_status()
    nbytes = int(resp.headers["content-length"])
    if nbytes < 8:
        raise ValueError(f"{nbytes}-byte file is too small to hold a Parquet trailer")

    # 2. fetch only the trailer; range offsets are inclusive on both ends.
    resp = requests.get(url, headers=range_header(nbytes - 8, nbytes - 1))
    resp.raise_for_status()
    content = resp.content
    if len(content) != 8:
        raise ValueError(
            f"expected an 8-byte trailer, got {len(content)} bytes "
            "(the server may not honor Range requests)"
        )

    # "<" pins little-endian, standard-size fields; the native-order "i4s"
    # decodes the length correctly only on little-endian hosts.
    footer_length, magic = struct.unpack("<I4s", content)
    if magic != PARQUET_MAGIC:
        raise ValueError(f"expected trailing magic {PARQUET_MAGIC!r}, got {magic!r}")
    return footer_length, nbytes, content


 def main(url: str):
    """Return the schema of the remote Parquet file at ``url``.

    Only the tail of the file is downloaded: the 8-byte trailer (via
    ``get_footer``) and the footer bytes that immediately precede it.

    Raises:
        requests.HTTPError: if the footer request returns an error status.
        ValueError: if the advertised footer length does not fit inside the
            file, or the server returns a different number of bytes than
            requested.
    """
    footer_length, nbytes, raw_footer_info = get_footer(url)

    # The footer occupies the footer_length bytes right before the trailer.
    start = nbytes - 8 - footer_length
    end = nbytes - 9
    if footer_length <= 0 or start < 0:
        raise ValueError(
            f"footer length {footer_length} does not fit in a {nbytes}-byte file"
        )

    resp = requests.get(url, headers=range_header(start, end))
    resp.raise_for_status()
    # footer bytes
    # contains the schema information among other things
    content = resp.content
    if len(content) != footer_length:
        raise ValueError(
            f"expected {footer_length} footer bytes, got {len(content)} "
            "(the server may not honor Range requests)"
        )

    # Hand pyarrow a file-like object holding just the tail of the file:
    # the footer followed by the original 8-byte trailer.
    blob = io.BytesIO(content + raw_footer_info)
    metadata = pq.read_metadata(blob)
    return metadata.schema


 if __name__ == "__main__":
    # Command-line entry point: takes one positional argument, the URL of
    # the Parquet file, and prints that file's schema.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("url", type=str)
    cli_args = parser.parse_args()
    print(main(cli_args.url))
	"""
	Given a URL pointing to a Parquet file, how do I get the schema from that file
	as cheaply as possible?

	Example URL (75 MB):
	https://storage.googleapis.com/ibis-tutorial-data/wowah_data/wowah_data_raw.parquet
	"""

	PARQUET_MAGIC = b"PAR1"

	import io
	import struct

	import pyarrow.parquet as pq
	import requests


	def range_header(start, end):
		"""Build the HTTP ``Range`` header selecting bytes ``start`` to ``end``.

		Both offsets are inclusive, so the first 1024 bytes of a resource are
		requested as 0-1023, not 0-1024.
		"""
		return {"Range": f"bytes={start}-{end}"}


	def get_footer(url: str) -> tuple[int, int, bytes]:
		"""Fetch and decode the 8-byte trailer of the Parquet file at ``url``.

		The trailer is the last 8 bytes of the file: a 4-byte little-endian
		footer length followed by the 4-byte magic ``PAR1``.

		Returns:
			``(footer_length, nbytes, content)`` where ``footer_length`` is the
			decoded size of the footer, ``nbytes`` is the total file size
			reported by the server and ``content`` is the raw 8-byte trailer.

		Raises:
			requests.HTTPError: if either HTTP request returns an error status.
			ValueError: if the file is shorter than a trailer, the server does
				not return exactly 8 bytes, or the magic bytes do not match.
		"""
		# 1. get the number of bytes in the file without downloading it.
		# requests.head() does not follow redirects by default; without this the
		# content-length could describe a redirect response instead of the file.
		resp = requests.head(url, allow_redirects=True)
		resp.raise_for_status()
		nbytes = int(resp.headers["content-length"])
		if nbytes < 8:
			raise ValueError(f"{nbytes}-byte file is too small to hold a Parquet trailer")

		# 2. fetch only the trailer; range offsets are inclusive on both ends.
		resp = requests.get(url, headers=range_header(nbytes - 8, nbytes - 1))
		resp.raise_for_status()
		content = resp.content
		if len(content) != 8:
			raise ValueError(
				f"expected an 8-byte trailer, got {len(content)} bytes "
				"(the server may not honor Range requests)"
			)

		# "<" pins little-endian, standard-size fields; the native-order "i4s"
		# decodes the length correctly only on little-endian hosts.
		footer_length, magic = struct.unpack("<I4s", content)
		if magic != PARQUET_MAGIC:
			raise ValueError(f"expected trailing magic {PARQUET_MAGIC!r}, got {magic!r}")
		return footer_length, nbytes, content


	def main(url: str):
		"""Return the schema of the remote Parquet file at ``url``.

		Only the tail of the file is downloaded: the 8-byte trailer (via
		``get_footer``) and the footer bytes that immediately precede it.

		Raises:
			requests.HTTPError: if the footer request returns an error status.
			ValueError: if the advertised footer length does not fit inside the
				file, or the server returns a different number of bytes than
				requested.
		"""
		footer_length, nbytes, raw_footer_info = get_footer(url)

		# The footer occupies the footer_length bytes right before the trailer.
		start = nbytes - 8 - footer_length
		end = nbytes - 9
		if footer_length <= 0 or start < 0:
			raise ValueError(
				f"footer length {footer_length} does not fit in a {nbytes}-byte file"
			)

		resp = requests.get(url, headers=range_header(start, end))
		resp.raise_for_status()
		# footer bytes
		# contains the schema information among other things
		content = resp.content
		if len(content) != footer_length:
			raise ValueError(
				f"expected {footer_length} footer bytes, got {len(content)} "
				"(the server may not honor Range requests)"
			)

		# Hand pyarrow a file-like object holding just the tail of the file:
		# the footer followed by the original 8-byte trailer.
		blob = io.BytesIO(content + raw_footer_info)
		metadata = pq.read_metadata(blob)
		return metadata.schema


	if __name__ == "__main__":
		# Command-line entry point: takes one positional argument, the URL of
		# the Parquet file, and prints that file's schema.
		import argparse

		p = argparse.ArgumentParser()
		p.add_argument("url", type=str)
		args = p.parse_args()
		schema = main(args.url)
		print(schema)