Last active
April 22, 2020 14:33
-
-
Save jlinoff/29d7f5f0de4c2c99be256a7e8c9873fa to your computer and use it in GitHub Desktop.
Pyarrow based python script that checks a parquet file to see if it can be read
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
'''
This script accepts a list of parquet files and then runs through them
to see which column entries in the schema can be read. It is useful
for determining where pyarrow does not yet support a particular
schema type.
'''
import os | |
import sys | |
import pyarrow.parquet as pq | |
import pyarrow.lib | |
# ANSI escape sequences wrapped around the per-column verdicts:
# red for failures, green for successes, and the attribute reset.
# check() replaces all three with '' when color output is disabled.
_R, _G, _0 = '\x1b[31m', '\x1b[32m', '\x1b[0m'
def print_entry(entry: str, arg: str, i: int) -> bool:
    '''
    Check a single schema entry by reading only that column.

    Prints one report line: the entry number and the entry text,
    followed by "OK" (prefixed with _G) or "FAIL: <reason>" (prefixed
    with _R), and finally the _0 reset sequence.

    Args:
        entry: One schema line of the form "<column path>: <type>".
        arg: Path of the parquet file to read the column from.
        i: Entry number shown at the start of the report line.

    Returns:
        True if the column could be read, False if reading raised one
        of the handled pyarrow or OS errors.
    '''
    # Only the column path is needed for the read. partition() keeps
    # working when the remainder contains further colons, whereas
    # unpacking entry.split(':') raised ValueError in that case.
    # NOTE(review): this assumes the column path itself has no colon.
    path = entry.partition(':')[0].strip()
    print(f'{i:>5} {entry} ', end='')
    try:
        # The table is discarded; the read is attempted only to find out
        # whether pyarrow can decode this column.
        pq.read_table(arg, columns=[path])
    except (pyarrow.lib.ArrowNotImplementedError,  # pylint: disable=c-extension-no-member
            pyarrow.lib.ArrowInvalid,
            OSError) as exc:
        print(f'{_R}FAIL: {exc}', end='')
        pass_flag = False
    else:
        print(f'{_G}OK', end='')
        pass_flag = True
    # Terminate the line and reset any color that was switched on.
    print(f'{_0}')
    return pass_flag
def check(file_names: list):
    '''
    Report, for every parquet file given, which schema entries can be read.

    For each file this prints the file name, its metadata, one line per
    schema entry (via print_entry) and a pass/fail summary.
    '''
    global _R, _G, _0  # pylint: disable=global-statement

    # Color stays on while NC is unset (or is literally 'color'); any
    # other value of NC blanks the color escape sequences.
    if os.getenv('NC', 'color') != 'color':
        _R = _G = _0 = ''

    for arg in file_names:
        print(f'\x1b[1m{arg}\x1b[0m')
        parquet_file = pq.ParquetFile(arg)
        print(f'metadata: {parquet_file.metadata}')

        # Only schema lines that contain a colon are treated as column
        # entries; every other line of the schema text is skipped.
        schema_lines = str(parquet_file.schema).strip().split('\n')
        stripped_lines = (line.strip() for line in schema_lines)
        column_entries = [line for line in stripped_lines if ':' in line]

        num_passed = 0
        num_failed = 0
        for number, column_entry in enumerate(column_entries, start=1):
            if print_entry(column_entry, arg, number):
                num_passed += 1
            else:
                num_failed += 1

        total = num_passed + num_failed
        print(f'SUMMARY: total={total}, num_passed={num_passed}, num_failed={num_failed}')
if __name__ == '__main__':
    # Everything after the script name is treated as a parquet file path.
    parquet_paths = sys.argv[1:]
    check(parquet_paths)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment