Created
January 28, 2022 17:02
-
-
Save grantmwilliams/1ceb490312c59e4fb6e4bc15b57e9707 to your computer and use it in GitHub Desktop.
Pyarrow uint32() -> int64() bug?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

import pyarrow as pa
import pyarrow.parquet as pq
# Maps each Arrow integer type under test to the label used in its output
# file name.
file_name_mapping = {
    pa.int32(): "int32",
    pa.uint32(): "uint32",
    pa.int64(): "int64",
    pa.uint64(): "uint64",
}

# The types to exercise, in the same order as the mapping above (dicts keep
# insertion order, so this yields int32, uint32, int64, uint64).
int_types = list(file_name_mapping)
def write_file(pa_type, file_name):
    """Write a three-row, two-column table to *file_name* as Parquet.

    The table has a string column ``idx`` and a column ``val`` whose Arrow
    type is *pa_type*.  The file is written with snappy compression.

    Args:
        pa_type: Arrow data type to use for the ``val`` column.
        file_name: Destination passed straight to ``pq.write_table``.
    """
    idx_field = pa.field("idx", pa.string())
    val_field = pa.field("val", pa_type)
    table_schema = pa.schema([idx_field, val_field])

    columns = {
        'idx': ["A", "B", "C"],
        'val': [4, 5, 6],
    }
    sample_table = pa.Table.from_pydict(columns, schema=table_schema)

    pq.write_table(sample_table, file_name, compression='snappy')
def read_file(pa_type, file_name):
    """Open *file_name* as Parquet and print how the ``val`` column is typed.

    Prints one line comparing *pa_type* (the type the caller asked for) with
    the type of ``val`` in the file's Arrow schema, then prints the Parquet
    schema entry at column index 1.

    Args:
        pa_type: The Arrow type the caller expects; only used in the report.
        file_name: Path of the Parquet file to inspect.
    """
    with open(file_name, "rb") as handle:
        parquet_file = pq.ParquetFile(handle)

        # Type of "val" as seen through the Arrow view of the schema.
        val_field = parquet_file.schema_arrow.field("val")
        col_type = val_field.type
        print(f"pa_type: {pa_type} -- schema_type: {col_type}")

        # Parquet-level schema for the column at index 1 (presumably "val",
        # the second column, for files produced by write_file).
        parquet_column = parquet_file.schema.column(1)
        print(parquet_column)
# Directory that receives one Parquet file per integer type.  It is created
# up front because pq.write_table does not create missing parent directories,
# so the first write would otherwise fail on a fresh checkout.
output_dir = "pyarrow_bug/data"
os.makedirs(output_dir, exist_ok=True)

# Round-trip every integer type: write a file with that type, read it back,
# and print what type the reader reports, with a separator line around each.
print("-" * 40)
for pa_type in int_types:
    file_name = f"{output_dir}/{file_name_mapping[pa_type]}_file.snappy.parquet"
    write_file(pa_type, file_name)
    read_file(pa_type, file_name)
    print("-" * 40)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The output from running the code looks like: