naranjja · March 15, 2019 07:28
diff --git a/parallelize.py b/parallelize.py
 import time
 import pandas as pd
 from multiprocessing import Pool

 def worker(i, delay=0.5):
    """Simulate one unit of work and build the record for player ``i + 1``.

    Args:
        i: Zero-based task index (the item handed over by ``pool.map``).
        delay: Seconds to sleep in order to simulate processing. Defaults
            to 0.5, the value that used to be hard-coded; pass 0 to skip
            the wait.

    Returns:
        A dict with ``"name"`` (``"Player <i + 1>"``) and ``"points"``
        (``(i + 1) ** 2.0``, i.e. a float).
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    time.sleep(delay)  # simulate processing something
    d = {
        "name": "Player {}".format(i + 1),
        "points": (i + 1) ** 2.0,
    }
    print("Iteration {} took {} seconds".format(i + 1, time.perf_counter() - started))
    return d

 def main():
    """Build the same ten-player table twice: sequentially, then with a pool.

    All execution code is kept inside this function (which is only called
    under the ``__main__`` guard): when using multiprocessing, each
    subprocess will execute everything again if it is not protected.

    Prints each DataFrame, the sum of its ``points`` column and how long
    each approach took. Returns nothing.
    """
    # blocking way
    print("The blocking way...")
    t0 = time.perf_counter()  # monotonic clock for measuring elapsed time

    # in a blocking paradigm, we can initialize an empty dataframe
    df = pd.DataFrame({"name": [], "points": []})

    for i in range(10):  # then N times,
        t = time.perf_counter()
        time.sleep(0.5)  # simulate processing something
        # we can set row, col = some values
        df.loc[i] = ["Player {}".format(i + 1), (i + 1) ** 2]
        print("Iteration {} took {} seconds".format(i + 1, time.perf_counter() - t))

    print(df)
    print("The total number of points is {}".format(df["points"].sum()))
    print("Total execution took {} seconds\n".format(time.perf_counter() - t0))

    # parallel way
    print("The parallel way...")
    t0 = time.perf_counter()

    # in a parallel paradigm, we shouldn't mutate the same objects from
    # several workers (it's messy); we might start overwriting things

    # The `with` block tears the worker processes down on exit instead of
    # leaving them running. map() blocks until every result has come back,
    # so leaving the block right after it loses no work.
    with Pool() as pool:
        records = pool.map(worker, range(10))  # map a function N times
    # mapping passes each item of the iterable as the single parameter
    # if you want to pass more than 1 parameter, you can use
    # pool.starmap(func, argument_tuples), for example:
    # pool.starmap(func, ((i, "hello") for i in range(10)))

    # records is the list of return values from the worker, in this case
    # a list of dicts, which we can use to create a dataframe
    df = pd.DataFrame.from_records(records)

    print(df)
    print("The total number of points is {}".format(df["points"].sum()))
    print("Total execution took {} seconds".format(time.perf_counter() - t0))

 # Entry-point guard: run the demo only when this file is executed as a
 # script, so that subprocesses started by multiprocessing do not execute
 # main() again.
 if __name__ == "__main__":
    main()
diff --git a/requirements.txt b/requirements.txt
 pandas==0.24.1
	import time
	import pandas as pd
	from multiprocessing import Pool

	def worker(i, delay=0.5):
		"""Simulate one unit of work and build the record for player ``i + 1``.

		Args:
			i: Zero-based task index (the item handed over by ``pool.map``).
			delay: Seconds to sleep in order to simulate processing.
				Defaults to 0.5, the value that used to be hard-coded;
				pass 0 to skip the wait.

		Returns:
			A dict with ``"name"`` (``"Player <i + 1>"``) and ``"points"``
			(``(i + 1) ** 2.0``, i.e. a float).
		"""
		# perf_counter is monotonic, so the measured interval cannot be
		# skewed by system clock adjustments the way time.time() can.
		started = time.perf_counter()
		time.sleep(delay)  # simulate processing something
		d = {
			"name": "Player {}".format(i + 1),
			"points": (i + 1) ** 2.0,
		}
		print("Iteration {} took {} seconds".format(i + 1, time.perf_counter() - started))
		return d

	def main():
		"""Build the same ten-player table twice: sequentially, then with a pool.

		All execution code is kept inside this function (which is only
		called under the ``__main__`` guard): when using multiprocessing,
		each subprocess will execute everything again if it is not protected.

		Prints each DataFrame, the sum of its ``points`` column and how
		long each approach took. Returns nothing.
		"""
		# blocking way
		print("The blocking way...")
		t0 = time.perf_counter()  # monotonic clock for measuring elapsed time

		# in a blocking paradigm, we can initialize an empty dataframe
		df = pd.DataFrame({"name": [], "points": []})

		for i in range(10):  # then N times,
			t = time.perf_counter()
			time.sleep(0.5)  # simulate processing something
			# we can set row, col = some values
			df.loc[i] = ["Player {}".format(i + 1), (i + 1) ** 2]
			print("Iteration {} took {} seconds".format(i + 1, time.perf_counter() - t))

		print(df)
		print("The total number of points is {}".format(df["points"].sum()))
		print("Total execution took {} seconds\n".format(time.perf_counter() - t0))

		# parallel way
		print("The parallel way...")
		t0 = time.perf_counter()

		# in a parallel paradigm, we shouldn't mutate the same objects from
		# several workers (it's messy); we might start overwriting things

		# The `with` block tears the worker processes down on exit instead
		# of leaving them running. map() blocks until every result has come
		# back, so leaving the block right after it loses no work.
		with Pool() as pool:
			records = pool.map(worker, range(10))  # map a function N times
		# mapping passes each item of the iterable as the single parameter
		# if you want to pass more than 1 parameter, you can use
		# pool.starmap(func, argument_tuples), for example:
		# pool.starmap(func, ((i, "hello") for i in range(10)))

		# records is the list of return values from the worker, in this
		# case a list of dicts, which we can use to create a dataframe
		df = pd.DataFrame.from_records(records)

		print(df)
		print("The total number of points is {}".format(df["points"].sum()))
		print("Total execution took {} seconds".format(time.perf_counter() - t0))

	# Entry-point guard: run the demo only when this file is executed as a
	# script, so that subprocesses started by multiprocessing do not execute
	# main() again.
	if __name__ == "__main__":
		main()