Skip to content

Instantly share code, notes, and snippets.

@WillKoehrsen
Last active August 7, 2018 17:23
Show Gist options
  • Save WillKoehrsen/39da5a06f20e8c6ee8d48ffb3ed6e6cd to your computer and use it in GitHub Desktop.
Save WillKoehrsen/39da5a06f20e8c6ee8d48ffb3ed6e6cd to your computer and use it in GitHub Desktop.
import os

import pandas as pd
from dask import delayed
# Build the list of all partition file paths.
# NOTE: original scrape had a syntax error here ("for file os.listdir(...)");
# the `in` keyword is restored.
paths = ['../input/partitions/%s' % file for file in os.listdir('../input/partitions/')]

start_index = 0
# Iterate through 8 paths (one batch) at a time.
# The original used range(9, len(paths) + 5, 8), which made the first batch
# hold 9 paths and, for many list lengths, silently skipped the trailing
# paths (e.g. with 20 paths only paths[0:17] were processed).
# range(8, len(paths) + 8, 8) yields true 8-path batches and always covers
# the final (possibly shorter) batch.
for i, end_index in enumerate(range(8, len(paths) + 8, 8)):
    # Subset to the paths in this batch; the last batch may have fewer than 8.
    if end_index > len(paths):
        subset_paths = paths[start_index:]
    else:
        subset_paths = paths[start_index:end_index]
    # Empty list of (lazy) feature matrices for this batch.
    fms = []
    # Iterate through the batch, building a lazy task graph.
    for path in subset_paths:
        # Make the entityset (deferred — nothing runs yet).
        # `entityset_from_partition` is defined elsewhere in the project.
        es = delayed(entityset_from_partition)(path)
        # Make the feature matrix and add it to the list (also deferred).
        # `featurenames` is assumed to be defined earlier in the project.
        fm = delayed(feature_matrix_from_entityset)(es, feature_names=featurenames)
        fms.append(fm)
    # Final operation: concatenate all of the batch's feature matrices.
    X = delayed(pd.concat)(fms, axis=0)
    # This line actually runs the computation for the whole batch.
    feature_matrix = X.compute()
    # Save the batch feature matrix to disk, one CSV per batch index.
    feature_matrix.to_csv('../input/fm/%s.csv' % i, index=True)
    # Start index becomes the previous ending index for the next batch.
    start_index = end_index
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment