Created
June 3, 2021 09:36
-
-
Save ma7555/ccba29a0ccf9f2bd5a73d3dd6fafae00 to your computer and use it in GitHub Desktop.
Parallel Pandas Apply
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import functools
import os
import time
from multiprocessing import Pool, cpu_count, freeze_support

import numpy as np
import pandas as pd
def timeit(method):
    """Decorate *method* so each call reports its wall-clock duration.

    When the call carries a ``log_time`` keyword argument (a mutable
    mapping), the elapsed time in whole milliseconds is stored in it under
    the key given by the optional ``log_name`` keyword, which defaults to the
    upper-cased function name. Otherwise the duration is printed in
    milliseconds.

    ``log_time`` and ``log_name`` are forwarded to *method* unchanged, so a
    function called with them must accept those keywords.

    Returns:
        A wrapper that preserves the metadata of *method* and returns
        whatever *method* returns.
    """
    @functools.wraps(method)
    def timed(*args, **kw):
        # perf_counter is monotonic, so the interval cannot go negative
        # if the system clock is adjusted mid-call.
        started = time.perf_counter()
        result = method(*args, **kw)
        elapsed_ms = (time.perf_counter() - started) * 1000
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            # The value is converted to milliseconds to match the "ms" label.
            print('{} {:.2f} ms'.format(method.__name__, elapsed_ms))
        return result
    return timed
@timeit
def parallelize_dataframe(df, func, n_cores=16):
    """Apply *func* to row-wise chunks of *df* in worker processes.

    The frame is cut into ``n_cores`` consecutive row slices whose sizes
    differ by at most one row (larger slices first). Each slice is passed to
    *func* in a ``multiprocessing.Pool`` of ``n_cores`` workers and the
    results are concatenated in chunk order.

    Args:
        df: DataFrame to process.
        func: Callable taking one DataFrame chunk and returning an object
            that ``pd.concat`` accepts. It is sent to worker processes, so
            it must be picklable (e.g. a module-level function).
        n_cores: Number of chunks and of worker processes; must be >= 1.

    Returns:
        The concatenation of the per-chunk results.

    Raises:
        ValueError: If ``n_cores`` is smaller than 1.
    """
    if n_cores < 1:
        raise ValueError('number sections must be larger than 0.')

    # Slice by position instead of calling np.array_split on the frame:
    # that call goes through DataFrame.swapaxes, which recent pandas
    # versions deprecate.
    base_len, extra = divmod(len(df), n_cores)
    df_split = []
    start = 0
    for chunk_no in range(n_cores):
        stop = start + base_len + (1 if chunk_no < extra else 0)
        df_split.append(df.iloc[start:stop])
        start = stop

    # The context manager tears the pool down even if func raises in a
    # worker, so no processes are left behind.
    with Pool(n_cores) as pool:
        results = pool.map(func, df_split)
    return pd.concat(results)
def to_array(df):
    """Add an ``array`` column holding each ``list`` entry as a NumPy array.

    The frame is modified in place and the same object is returned.
    """
    converted = df['list'].apply(np.array)
    df['array'] = converted
    return df
if __name__ == '__main__':
    # Must run first under the main guard so that a frozen Windows
    # executable can start worker processes; it has no effect otherwise.
    freeze_support()

    # Demo: 500 rows, each holding a two-element list, converted in parallel.
    df = pd.DataFrame(data={'list': [[0, 0]] * 500})
    df = parallelize_dataframe(df, to_array)
    print(df.head())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment