Skip to content

Instantly share code, notes, and snippets.

@mindey
Last active October 20, 2017 22:26
Show Gist options
  • Save mindey/39011b069edd3bfdd5514141a6c1a091 to your computer and use it in GitHub Desktop.
Save mindey/39011b069edd3bfdd5514141a6c1a091 to your computer and use it in GitHub Desktop.
import pandas
from dask import dataframe
from dask.diagnostics import ProgressBar
def parallel_apply(df, func, progress=True, chunkrows=100, scheduler_address=None, *args, **kwargs):
    """Apply ``func`` to a pandas DataFrame through a partitioned dask DataFrame.

    The frame is converted with ``dask.dataframe.from_pandas``, ``func`` is
    applied with the dask DataFrame's ``apply`` method, and the computed
    result is returned.

    Args:
        df: The pandas DataFrame to process.
        func: Callable handed to the dask DataFrame's ``apply``.
        progress: When truthy, run the computation inside a dask
            ``ProgressBar`` context.
        chunkrows: Target number of rows per partition; the partition count
            is ``len(df) / chunkrows`` truncated, but never less than 1.
        scheduler_address: If truthy, a ``dask.distributed.Client`` is
            connected to this address for the duration of the call.
        *args: Extra positional arguments forwarded to ``apply``. Because
            they follow defaulted parameters, they can only be supplied after
            ``progress``, ``chunkrows`` and ``scheduler_address`` have been
            passed positionally.
        **kwargs: Extra keyword arguments forwarded to ``apply``
            (for example ``axis=1``).

    Returns:
        Whatever ``.compute()`` yields for the applied dask expression.
    """
    client = None
    if scheduler_address:
        # Imported lazily so dask.distributed is only needed when a
        # scheduler address is actually given.
        from dask.distributed import Client
        client = Client(scheduler_address)

    try:
        # int(len(df) / chunkrows) is 0 whenever the frame has fewer rows
        # than chunkrows; clamp so there is always at least one partition.
        partition_count = max(1, int(len(df) / chunkrows))
        sd = dataframe.from_pandas(df, npartitions=partition_count)
        pending = sd.apply(func, *args, **kwargs)

        if progress:
            # NOTE(review): confirm ProgressBar reports anything useful when
            # a distributed Client is the active scheduler.
            with ProgressBar():
                return pending.compute()
        return pending.compute()
    finally:
        # Release the connection opened above; previously every call with a
        # scheduler address left its Client open.
        if client is not None:
            client.close()
# Demo run for watching the progress bar: 'x' counts up, 'y' counts down,
# and 'z' is their row-wise product computed through parallel_apply.
test_df = pandas.DataFrame({
    'x': range(1000000),
    'y': range(999999, -1, -1),
})
test_df['z'] = parallel_apply(
    test_df,
    lambda row: row['x'] * row['y'],
    axis=1,
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment