Skip to content

Instantly share code, notes, and snippets.

@gvyshnya
Created September 2, 2020 19:20
Show Gist options
  • Save gvyshnya/ed73be3faacfed0ae965766c327145e0 to your computer and use it in GitHub Desktop.
Save gvyshnya/ed73be3faacfed0ae965766c327145e0 to your computer and use it in GitHub Desktop.
# For every species in final_data: extract features from each of its audio
# files in parallel, merge the per-file results, and write them to one CSV
# named after the species' ebird code.
for ebird in final_data:
    print("Starting to process a new species: ", ebird)
    ebird_data = train_csv[train_csv['species'] == ebird]
    # The first distinct ebird_code among this species' rows names the output file.
    short_file_name = ebird_data['ebird_code'].unique()[0]
    print("Short file name: ", short_file_name)

    # use the number of parallel processes as per the configured
    pool = mp.Pool(c.NUMBER_OF_CPU_IN_POOL)
    try:
        # Queue one asynchronous extraction task per audio file. Only the
        # 'full_path' column is needed, so iterate it directly.
        pending = [
            pool.apply_async(extract_feautres, [audio_path])
            for audio_path in ebird_data['full_path']
        ]
        # Collect results in submission order; each get() waits at most
        # 600 seconds = 10 mins for its task before raising.
        result = [task.get(timeout=600) for task in pending]
    except BaseException:
        # A worker failed, timed out, or the run was interrupted: stop the
        # workers right away so no processes are leaked, then re-raise.
        pool.terminate()
        raise
    else:
        # clean up: every task has finished, let the workers exit normally
        pool.close()
    finally:
        # Valid after either close() or terminate(); waits for worker exit.
        pool.join()

    # combine chunks with transformed data into a single training set
    # (presumably each task returns a DataFrame -- confirm against extract_feautres)
    extracted_features = pd.concat(result)
    # save extracted features to CSV
    output_path = "".join([c.TRANSFORMED_DATA_PATH, short_file_name, ".csv"])
    extracted_features.to_csv(output_path, index=False)
    print("Finished processing: ", ebird)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment