Last active
August 7, 2018 17:23
-
-
Save WillKoehrsen/39da5a06f20e8c6ee8d48ffb3ed6e6cd to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from dask import delayed
import os

# Build feature matrices for every partition, one batch at a time, and write
# each batch's concatenated feature matrix to its own CSV file.
#
# Names expected to be defined elsewhere (not visible in this snippet):
#   pd                            - presumably pandas (only `pd.concat` is used)
#   entityset_from_partition      - called with a partition path
#   feature_matrix_from_entityset - called with the entityset and feature_names
#   featurenames                  - forwarded unchanged as `feature_names`

# Create list of all partitions. os.listdir returns entries in arbitrary
# order, so sort them to keep the batch -> output-file mapping reproducible.
paths = ['../input/partitions/%s' % name
         for name in sorted(os.listdir('../input/partitions/'))]

# Number of partitions processed (and held in memory) per batch
batch_size = 8

# Iterate through `batch_size` paths (one batch) at a time. Stepping the
# *start* index from 0 guarantees every batch has at most `batch_size` paths
# and that no trailing partitions are skipped, whatever len(paths) is.
for i, start_index in enumerate(range(0, len(paths), batch_size)):
    end_index = start_index + batch_size

    # Subset to the paths in this batch; slicing clamps at the end of the
    # list, so the final (possibly shorter) batch needs no special case.
    subset_paths = paths[start_index:end_index]

    # Empty list of (lazy) feature matrices
    fms = []

    # Iterate through the batch, only building the task graph here
    for path in subset_paths:
        # Make the entityset
        es = delayed(entityset_from_partition)(path)

        # Make the feature matrix and add to the list
        fm = delayed(feature_matrix_from_entityset)(es, feature_names=featurenames)
        fms.append(fm)

    # Final operation will be to concatenate together all of the feature matrices
    X = delayed(pd.concat)(fms, axis=0)

    # This line actually runs the computation
    feature_matrix = X.compute()

    # Save the batch feature matrix to disk, one file per batch index.
    # NOTE(review): assumes '../input/fm/' already exists - confirm.
    feature_matrix.to_csv('../input/fm/%s.csv' % i, index=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment