Skip to content

Instantly share code, notes, and snippets.

@naranjja
Last active March 15, 2019 07:28
Show Gist options
  • Save naranjja/910b1af89a97b2aa2ffbeed163445c9d to your computer and use it in GitHub Desktop.
Save naranjja/910b1af89a97b2aa2ffbeed163445c9d to your computer and use it in GitHub Desktop.
Parallelizing Pandas row appending in Python
import time
import pandas as pd
from multiprocessing import Pool
def worker(i, delay=0.5):
    """Build the record for one player, simulating a slow computation.

    The function takes everything it needs as arguments and returns a new
    dict rather than mutating shared state, which is what lets it be mapped
    over a range of indices by a process pool.

    Args:
        i: Zero-based index of the player; the record is built from ``i + 1``.
        delay: Seconds to sleep in order to simulate work. Defaults to 0.5,
            so single-argument callers such as ``pool.map(worker, ...)``
            behave exactly as before.

    Returns:
        A dict with the keys ``"name"`` (``"Player <i + 1>"``) and
        ``"points"`` (``(i + 1) ** 2.0``, always a float).
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # system clock adjustments the way a time.time() difference can.
    started = time.perf_counter()
    time.sleep(delay)  # simulate processing something
    record = {
        "name": "Player {}".format(i + 1),
        "points": (i + 1) ** 2.0,
    }
    print("Iteration {} took {} seconds".format(i + 1, time.perf_counter() - started))
    return record
def main():
    """Build the same 10-row DataFrame sequentially, then with a process pool.

    Both halves print per-iteration timings, the resulting DataFrame, the sum
    of the ``points`` column and the total elapsed time, so the two
    approaches can be compared side by side.

    All execution code must be protected (kept inside a function called from
    the ``__main__`` guard) when using multiprocessing: under the "spawn"
    start method every worker process re-imports this module, so unguarded
    top-level code would be executed again in each subprocess.
    """
    # --- blocking way ------------------------------------------------------
    print("The blocking way...")
    t0 = time.perf_counter()
    # In a blocking paradigm we can initialize an empty dataframe ...
    df = pd.DataFrame({"name": [], "points": []})
    for i in range(10):  # ... then, N times,
        t = time.perf_counter()
        time.sleep(0.5)  # simulate processing something
        # ... set the whole row at label i in one assignment.
        df.loc[i] = ["Player {}".format(i + 1), (i + 1) ** 2]
        print("Iteration {} took {} seconds".format(i + 1, time.perf_counter() - t))
    print(df)
    print("The total number of points is {}".format(df["points"].sum()))
    print("Total execution took {} seconds\n".format(time.perf_counter() - t0))

    # --- parallel way ------------------------------------------------------
    print("The parallel way...")
    t0 = time.perf_counter()
    # In a parallel paradigm we shouldn't mutate the same object from several
    # workers (it's messy and rows may overwrite each other). Instead each
    # worker returns its own row and the dataframe is assembled once, here.
    #
    # The ``with`` block guarantees the pool's worker processes are shut down
    # even if a worker raises; ``map`` blocks until every result is available,
    # so nothing is lost when the pool is torn down on exit.
    with Pool() as pool:
        # ``map`` passes each item of the iterable as the single argument.
        # To pass more than one argument, use pool.starmap(func, arg_tuples),
        # e.g. pool.starmap(worker, ((i, 0.5) for i in range(10))).
        records = pool.map(worker, range(10))
    # ``records`` is the list of worker return values (here: a list of dicts),
    # in the same order as the inputs; use it to create the dataframe.
    df = pd.DataFrame.from_records(records)
    print(df)
    print("The total number of points is {}".format(df["points"].sum()))
    print("Total execution took {} seconds".format(time.perf_counter() - t0))
# Run the demo only when this file is executed as a script. Worker processes
# may import this module, and the guard keeps them from calling main() again.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment