Created
June 14, 2018 10:43
-
-
Save kaustubhn/e58104730b0db59c7a08a810bf5fcf51 to your computer and use it in GitHub Desktop.
Execute a Python function in parallel - Template
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from joblib import Parallel, delayed
from toolz import partition, partition_all
def parallelize(func, iterator, n_jobs, extra=()):
    """Run *func* over every item of *iterator* using joblib workers.

    Each item is unpacked into positional arguments for *func*, followed by
    the values in *extra*, i.e. every job is ``func(*item, *extra)``.

    Args:
        func: Callable executed once per item.
        iterator: Iterable of argument groups (tuples, lists, ...). Every
            item must itself be iterable because it is star-unpacked.
        n_jobs: Passed straight through to ``joblib.Parallel(n_jobs=...)``.
        extra: Iterable of additional positional arguments appended to every
            call. Defaults to no extra arguments.

    Returns:
        Whatever the ``joblib.Parallel`` call returns for the submitted jobs.
    """
    # Materialise once so a one-shot iterable (e.g. a generator) can be
    # reused for every job.
    extra = tuple(extra)
    # Unpacking item and extra separately (rather than ``item + extra``)
    # means items do not have to be tuples; any iterable works.
    return Parallel(n_jobs=n_jobs)(
        delayed(func)(*item, *extra) for item in iterator
    )
def iter_documents(file, encoding=None):
    """Lazily yield the lines of a text file, one at a time.

    Args:
        file: Path of the file to read; handed to the builtin ``open``.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps ``open``'s own default, which is platform dependent.

    Yields:
        Each line of the file in order, including its trailing newline
        when the file has one.
    """
    # The file stays open only while the generator is being consumed; the
    # ``with`` block closes it once iteration finishes or the generator is
    # closed.
    with open(file, encoding=encoding) as file_:
        yield from file_
def do_work(batch_id=None, batch=None, out_dir=None):
    """Placeholder worker; replace the body with real data processing.

    The script's entry point runs this through ``parallelize`` with an
    enumerated batch plus the output directory, so the function must accept
    those three positional arguments. They all default to ``None`` so a bare
    ``do_work()`` call keeps working.

    Args:
        batch_id: Index of the batch, as produced by ``enumerate``.
        batch: The batch of input lines to process.
        out_dir: Directory where results are meant to be written.
    """
    # Function to do work any data processing function
    print("I am working")
if __name__ == "__main__":
    in_file = "<path-to-input-file>"
    out_dir = "<output-dir>"
    n_workers = 4  # Number of workers (processes)
    # Split the input lines into batches of up to 1000. partition_all keeps
    # the final, shorter batch; toolz.partition would silently drop it
    # whenever the line count is not a multiple of 1000.
    jobs = partition_all(1000, iter_documents(in_file))
    # Every job is called as do_work(batch_index, batch, out_dir).
    parallelize(do_work, enumerate(jobs), n_workers, [out_dir])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment