Created
November 15, 2024 16:51
-
-
Save anyweez/c2da30158856c0c5d4f7a52fd9cf4855 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyarrow | |
import pyarrow.parquet as pq | |
import sys | |
# The script takes exactly one command-line argument: the path of the parquet
# file to inspect. Anything else prints the usage line and exits with status 1.
if len(sys.argv) != 2:
    print("Usage: python nan-checklist.py <parquet_file_path>")
    sys.exit(1)

# Module-level path that print_column_report() reads.
filename = sys.argv[1]

# Banner followed by a short description of the check; the empty strings
# produce the blank separator lines.
print(
    "--------------------------------------",
    "------- Parquet validity check -------",
    "--------------------------------------",
    "",
    "Checking for columns that can't be parsed by pyarrow...",
    "",
    sep="\n",
)
def print_column_report():
    """Report which columns of the parquet file at `filename` pyarrow can load.

    Columns are tried one at a time, in schema order. Each attempt reads the
    candidate column together with every column that has already passed, then
    round-trips the result through pandas records and back into a pyarrow
    table using the file's own schema. A column whose attempt raises is
    reported (with the exception message) and excluded from later reads.

    Reads the module-level `filename`; prints its findings and returns None.
    """
    # The file's arrow schema supplies the list of column names to probe.
    source = pq.ParquetFile(filename)
    all_columns = source.schema_arrow.names

    # Columns that could be parsed and loaded into a pyarrow table.
    readable: list[str] = []
    # Columns whose attempt raised.
    unreadable: list[str] = []

    for candidate in all_columns:
        try:
            # Read every column known to be good plus the one under test.
            subset = pq.read_table(filename, columns=readable + [candidate])

            # Per the original author's note, this round-trip is comparable to
            # what Subsalt's software does internally and is where the error is
            # believed to originate. The failure observed on 2024/11/13 was a
            # NaN in an int64 column, which cannot hold NaN values.
            records = subset.to_pandas().to_dict('records')
            pyarrow.Table.from_pylist(records, schema=subset.schema)
        except Exception as err:
            # Deliberately broad: any failure while loading marks the column
            # as bad. The one seen in practice was:
            #   Could not convert nan with type float: tried to convert to int64
            # which was reproduced with a parquet file whose int64 column
            # contains a NaN.
            print(f" > Failed to read column {candidate}")
            print(f" > {err}")
            print()
            unreadable.append(candidate)
        else:
            readable.append(candidate)

    print(f"Columns that passed: {readable}")
    print(f"Columns that failed to read: {unreadable}")
# Script entry point: run the column-by-column check and print the report.
print_column_report()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment