Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save gvyshnya/d1e0313bb77fa2c98a588d8b8835942d to your computer and use it in GitHub Desktop.
Save gvyshnya/d1e0313bb77fa2c98a588d8b8835942d to your computer and use it in GitHub Desktop.
Parallel Audio Feature Extraction with Dask
# Produce one CSV of extracted features per species.
# NOTE(review): `delayed` is presumably dask.delayed, in which case the
# per-file calls below are only scheduled and run when .compute() is called.
for ebird in final_data:
    print("Starting to process a new species: ", ebird)

    # Rows of the training table that belong to the current species.
    species_rows = train_csv[train_csv['species'] == ebird]

    # The first distinct ebird_code among those rows names the output file.
    short_file_name = species_rows['ebird_code'].unique()[0]
    print("Short file name: ", short_file_name)

    # One deferred extraction task per audio file listed for this species.
    feature_tasks = [
        delayed(extract_feautres)(row['full_path'])
        for _, row in species_rows.iterrows()
    ]

    # Combine the per-file results into a single training set for the species.
    df = delayed(pd.concat)(feature_tasks).compute()

    # Output path is plain string concatenation (no separator is inserted):
    # TRANSFORMED_DATA_PATH + short file name + ".csv".
    output_path = "".join((c.TRANSFORMED_DATA_PATH, short_file_name, ".csv"))
    df.to_csv(output_path, index=False)
    print("Finished processing: ", ebird)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment