# Source: GitHub Gist ksindi/ce131c859dea481eb94446f6f28be8cd by @ksindi
# (last active August 20, 2017 19:42).
import os
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
# Writes three batches of the same three-row pandas frame to NUM_TABLES Parquet
# files through simultaneously open writers, then closes the writers and
# deletes the files again.
NUM_FIELDS = 600  # exclusive upper bound of the generated column numbers
NUM_TABLES = 100  # number of Parquet writers/files held open at the same time

# Two typed leading columns, then string columns column4 .. column{NUM_FIELDS - 1}.
# NOTE(review): the range starts at 4, so 'column3' is NOT part of the schema
# even though two of the rows below carry it -- looks deliberate; confirm
# before "fixing" it.
fields = [
    pa.field('column1', pa.string()),
    pa.field('column2', pa.int64()),
] + [pa.field(f'column{i}', pa.string()) for i in range(4, NUM_FIELDS)]
schema = pa.schema(fields)

rows = [
    {'column1': 'val1', 'column2': 123},  # , 'column3': ''
    {'column1': 'val2', 'column2': 234, 'column3': ''},
    {'column1': 'val3', 'column2': 345, 'column3': ''},
]

paths = [f'table{i}.parquet' for i in range(NUM_TABLES)]
opened_paths = []  # paths we attempted to open; only these get removed
writers = []
try:
    for path in paths:
        # Record the path first: a failed open could presumably still leave
        # a partial file behind, and it should be cleaned up too.
        opened_paths.append(path)
        writers.append(pq.ParquetWriter(path, schema))

    for _ in range(3):
        for writer in writers:
            # The frame/table conversion is intentionally left inside the
            # loop, exactly as in the original script -- presumably the
            # repeated conversion is part of what is being exercised
            # (TODO confirm before hoisting).
            df = pd.DataFrame(rows)
            pa_table = pa.Table.from_pandas(df, schema=schema, preserve_index=False)
            writer.write_table(pa_table)
finally:
    # Always release the writers and delete the output files, even when
    # opening or writing failed part-way through.
    try:
        for writer in writers:
            writer.close()
    finally:
        for path in opened_paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                # The file was never created (open failed before touching disk).
                pass
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment