Last active
May 21, 2018 09:52
-
-
Save willu47/a8e9893f8bc265ffc093c51d7298a964 to your computer and use it in GitHub Desktop.
Convert a directory containing numpy arrays serialised using pyarrow into a folder of CSV files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
from csv import writer
from sys import argv

import numpy as np
import pyarrow as pa
def get_data(filepath):
    """Load the object that was serialised with pyarrow into ``filepath``.

    The file is memory-mapped, its whole content is read into a single
    buffer, and that buffer is handed to ``pa.deserialize``.

    Parameters
    ----------
    filepath : str
        Path of the serialised file to read.

    Returns
    -------
    object
        Whatever ``pa.deserialize`` reconstructs from the file. The caller
        in this script reads ``.shape`` from it, i.e. treats it as a numpy
        array.

    Notes
    -----
    NOTE(review): ``pa.deserialize`` is reported as deprecated in newer
    pyarrow releases, and deserialising files from an untrusted source may
    be unsafe -- confirm against the pyarrow version in use.
    """
    with pa.memory_map(filepath, 'rb') as f:
        # A freshly opened map is read from its start, so no explicit seek
        # is needed before pulling the full content as one buffer.
        buf = f.read_buffer()
        return pa.deserialize(buf)
def write_csv_iter(data, destination_path, file):
    """Write the non-zero elements of ``data`` to a CSV file, one per row.

    Every row consists of the element's index along each axis followed by
    the element's value (``row, col, value`` for a 2-D array). The array is
    walked element by element with ``np.nditer``.

    Parameters
    ----------
    data : numpy.ndarray
        Array to export.
    destination_path : str
        Path of the CSV file to create (overwritten if it exists).
    file : str
        Name of the source file; only used in the "skipping" message.

    Returns
    -------
    None
        If ``data`` has no elements nothing is written and a message is
        printed instead. An array whose elements are all zero still
        produces an (empty) CSV file.
    """
    # ``size`` is the real emptiness test: a shape such as (0, 5) has a
    # positive sum but no elements, and nditer refuses zero-sized operands.
    if data.size == 0:
        print("Skipping {} as it is empty".format(file))
        return

    it = np.nditer(data, flags=['multi_index'])
    # newline='' lets the csv module control line endings itself; without
    # it, extra blank rows appear on platforms that translate '\n'.
    with open(destination_path, 'w', newline='') as writefile:
        writ = writer(writefile)
        for value in it:
            if value != 0:
                # multi_index has one entry per axis, so this works for
                # arrays of any rank, not only 2-D ones.
                writ.writerow(it.multi_index + (value,))
def write_csv_nonzero(data, destination_path, file):
    """Write the non-zero elements of ``data`` to a CSV file, one per row.

    Every row consists of the element's index along each axis followed by
    the element's value (``row, col, value`` for a 2-D array). The non-zero
    positions are located in one step with ``np.nonzero``.

    Parameters
    ----------
    data : numpy.ndarray
        Array to export.
    destination_path : str
        Path of the CSV file to create (overwritten if it exists).
    file : str
        Name of the source file; only used in the "skipping" message.

    Returns
    -------
    None
        If ``data`` contains no non-zero element, no file is created and a
        message is printed instead.
    """
    indices = np.nonzero(data)
    # One gather for all values instead of indexing the array per element;
    # its length also tells us whether anything is non-zero, which avoids a
    # second pass over the data with count_nonzero.
    values = data[indices]
    if values.size == 0:
        print("Skipping {} as it is empty".format(file))
        return

    # newline='' lets the csv module control line endings itself; without
    # it, extra blank rows appear on platforms that translate '\n'.
    with open(destination_path, 'w', newline='') as writefile:
        writ = writer(writefile)
        # ``indices`` holds one index array per axis, so zipping them with
        # the values yields (i, j, ..., value) rows for any rank.
        writ.writerows(zip(*indices, values))
def convert_pyarrow_to_csv(path):
    """Convert every ``output_*`` file in ``path`` to CSV.

    Results are written to ``<path>/converted/<name>.csv``, where ``<name>``
    is the source file name without its extension. A source file is skipped
    when its CSV already exists, so the function can be re-run to pick up
    only new files. Each file is loaded with ``get_data`` and exported with
    ``write_csv_nonzero``.

    Parameters
    ----------
    path : str
        Directory containing the serialised arrays.

    Raises
    ------
    OSError
        If ``path`` does not exist or cannot be listed / written to.
    """
    converted_dir = os.path.join(path, 'converted')
    try:
        os.mkdir(converted_dir)
    except FileExistsError:
        # Already there from a previous run; checking first and creating
        # afterwards would leave a window for a race.
        pass

    # Sorted so that progress messages appear in a stable order.
    for file in sorted(os.listdir(path)):
        if not file.startswith('output_'):
            continue

        source_path = os.path.join(path, file)
        if not os.path.isfile(source_path):
            # A directory that happens to match the prefix cannot be mapped.
            continue

        destination_path = os.path.join(
            converted_dir, os.path.splitext(file)[0] + '.csv')
        if os.path.exists(destination_path):
            continue

        print("Converting {}".format(file))
        data = get_data(source_path)
        print("Array {} has dimensions {}".format(file, data.shape))
        write_csv_nonzero(data, destination_path, file)
if __name__ == '__main__':
    # Validate the command line explicitly: an ``assert`` is removed when
    # Python runs with -O, which would let a bad invocation through.
    if len(argv) != 2:
        raise SystemExit("Usage: python process.py <path_to_results_dir>")
    convert_pyarrow_to_csv(argv[1])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment