Skip to content

Instantly share code, notes, and snippets.

@willu47
Last active May 21, 2018 09:52
Show Gist options
  • Save willu47/a8e9893f8bc265ffc093c51d7298a964 to your computer and use it in GitHub Desktop.
Save willu47/a8e9893f8bc265ffc093c51d7298a964 to your computer and use it in GitHub Desktop.
Convert a directory containing numpy arrays serialised using pyarrow into a folder of CSV files
import pyarrow as pa
import os
from csv import writer
from sys import argv
import numpy as np
def get_data(filepath):
    """Load the object that was serialised with pyarrow at *filepath*.

    The file is memory-mapped, read from offset 0 into a single buffer and
    handed to ``pa.deserialize``; whatever that call returns is passed back
    to the caller unchanged.

    NOTE(review): ``pa.deserialize`` appears to be deprecated in newer
    pyarrow releases -- confirm against the pyarrow version in use.
    """
    with pa.memory_map(filepath, 'rb') as mapped:
        # Make sure the whole file is read, starting at the first byte.
        mapped.seek(0)
        return pa.deserialize(mapped.read_buffer())
def write_csv_iter(data, destination_path, file):
    """Write the non-zero entries of *data* to a CSV file, one entry per row.

    Each row holds the element's index along every axis followed by the
    element's value, so a 2-D array produces ``row, col, value`` rows.
    Elements are visited with ``np.nditer``.

    Args:
        data: numpy array to export.
        destination_path: path of the CSV file to create (overwritten if it
            already exists).
        file: name of the source file; used only in the "skipping" message.

    An array without elements (any axis of length 0) or without axes is
    skipped and no file is created. An array whose elements are all zero
    still produces an (empty) file.
    """
    # ``data.size`` is 0 whenever any axis has length 0; summing the shape
    # would treat e.g. (0, 5) as non-empty and np.nditer would then raise
    # on the zero-sized operand. 0-d arrays keep being skipped as before.
    if data.ndim == 0 or data.size == 0:
        print("Skipping {} as it is empty".format(file))
        return

    it = np.nditer(data, flags=['multi_index'])
    # newline='' leaves line endings to the csv module (avoids blank rows
    # on platforms that translate '\n').
    with open(destination_path, 'w', newline='') as writefile:
        writ = writer(writefile)
        while not it.finished:
            if it[0] != 0:
                # One column per axis index, then the value itself.
                writ.writerow(list(it.multi_index) + [it[0]])
            it.iternext()
def write_csv_nonzero(data, destination_path, file):
    """Write the non-zero entries of *data* to a CSV file, one entry per row.

    Each row holds the element's index along every axis followed by the
    element's value, so a 2-D array produces ``row, col, value`` rows.
    The non-zero positions are located with ``np.nonzero``.

    Args:
        data: numpy array to export.
        destination_path: path of the CSV file to create (overwritten if it
            already exists).
        file: name of the source file; used only in the "skipping" message.

    If *data* has no non-zero entries, a message is printed and no file is
    created.
    """
    # A single pass finds the positions; the values are then gathered in
    # one indexing operation instead of one lookup per element.
    indices = np.nonzero(data)
    values = data[indices]
    if values.size == 0:
        print("Skipping {} as it is empty".format(file))
        return

    # newline='' leaves line endings to the csv module (avoids blank rows
    # on platforms that translate '\n').
    with open(destination_path, 'w', newline='') as writefile:
        writ = writer(writefile)
        # zip the per-axis index arrays together with the values so each
        # row is (index_0, ..., index_n, value).
        writ.writerows(zip(*(indices + (values,))))
def convert_pyarrow_to_csv(path):
    """Convert every ``output_*`` file directly inside *path* to a CSV file.

    Results are written to ``<path>/converted/<name without extension>.csv``.
    The ``converted`` directory is created if needed. Files whose CSV
    already exists are left alone, so an interrupted run can be resumed.

    Args:
        path: directory containing the pyarrow-serialised arrays.
    """
    converted_dir = os.path.join(path, 'converted')
    try:
        # Create-and-tolerate instead of exists()+mkdir(), which can race
        # with another process creating the directory in between.
        os.mkdir(converted_dir)
    except FileExistsError:
        pass

    for filename in os.listdir(path):
        source_path = os.path.join(path, filename)
        # Only regular files named output_* are inputs; this also keeps
        # sub-directories from being handed to the loader.
        if not filename.startswith('output_') or not os.path.isfile(source_path):
            continue

        destination_path = os.path.join(
            converted_dir, os.path.splitext(filename)[0] + '.csv')
        if os.path.exists(destination_path):
            continue

        print("Converting {}".format(filename))
        data = get_data(source_path)
        print("Array {} has dimensions {}".format(filename, data.shape))
        # write_csv_iter takes the same arguments and can be used instead.
        write_csv_nonzero(data, destination_path, filename)
if __name__ == '__main__':
    # Validate the command line explicitly: an ``assert`` would be stripped
    # when Python runs with -O, letting a bad invocation through.
    if len(argv) != 2:
        raise SystemExit("Usage: python process.py <path_to_results_dir>")
    convert_pyarrow_to_csv(argv[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment