Skip to content

Instantly share code, notes, and snippets.

@WillKoehrsen
Last active August 7, 2018 17:23
Show Gist options
  • Save WillKoehrsen/39da5a06f20e8c6ee8d48ffb3ed6e6cd to your computer and use it in GitHub Desktop.
Save WillKoehrsen/39da5a06f20e8c6ee8d48ffb3ed6e6cd to your computer and use it in GitHub Desktop.
import os

import pandas as pd
from dask import delayed
# Build the list of all partition file paths.
# NOTE: original scrape had a syntax error here ("for file os.listdir(...)");
# the `in` keyword is restored.
paths = ['../input/partitions/%s' % file for file in os.listdir('../input/partitions/')]

start_index = 0
# Iterate through 8 paths (one batch) at a time.
# The original used range(9, len(paths) + 5, 8), which made the first batch
# hold 9 paths and, for many list lengths, silently skipped the trailing
# paths (e.g. with 20 paths only paths[0:17] were processed).
# range(8, len(paths) + 8, 8) yields true 8-path batches and always covers
# the final (possibly shorter) batch.
for i, end_index in enumerate(range(8, len(paths) + 8, 8)):
    # Subset to the paths in this batch; the last batch may have fewer than 8.
    if end_index > len(paths):
        subset_paths = paths[start_index:]
    else:
        subset_paths = paths[start_index:end_index]
    # Empty list of (lazy) feature matrices for this batch.
    fms = []
    # Iterate through the batch, building a lazy task graph.
    for path in subset_paths:
        # Make the entityset (deferred — nothing runs yet).
        # `entityset_from_partition` is defined elsewhere in the project.
        es = delayed(entityset_from_partition)(path)
        # Make the feature matrix and add it to the list (also deferred).
        # `featurenames` is assumed to be defined earlier in the project.
        fm = delayed(feature_matrix_from_entityset)(es, feature_names=featurenames)
        fms.append(fm)
    # Final operation: concatenate all of the batch's feature matrices.
    X = delayed(pd.concat)(fms, axis=0)
    # This line actually runs the computation for the whole batch.
    feature_matrix = X.compute()
    # Save the batch feature matrix to disk, one CSV per batch index.
    feature_matrix.to_csv('../input/fm/%s.csv' % i, index=True)
    # Start index becomes the previous ending index for the next batch.
    start_index = end_index
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment