Skip to content

Instantly share code, notes, and snippets.

@prrao87
Created May 24, 2020 06:07
Show Gist options
  • Save prrao87/cba9ddfb137de11423a401e7bd2904f0 to your computer and use it in GitHub Desktop.
Save prrao87/cba9ddfb137de11423a401e7bd2904f0 to your computer and use it in GitHub Desktop.
from joblib import Parallel, delayed
def chunker(iterable, total_length, chunksize):
    """Split a sliceable sequence into consecutive slices of ``chunksize`` items.

    Args:
        iterable: Object that supports slice indexing (``iterable[a:b]``).
        total_length: Exclusive upper bound for the generated start offsets
            (normally ``len(iterable)``).
        chunksize: Step between start offsets and maximum length of a slice.

    Returns:
        A generator yielding ``iterable[start:start + chunksize]`` for every
        ``start`` in ``range(0, total_length, chunksize)``.
    """
    # The range is built here, at call time, so an invalid step (such as 0)
    # is reported immediately rather than on first iteration.
    offsets = range(0, total_length, chunksize)
    return (iterable[start:start + chunksize] for start in offsets)
def flatten(list_of_lists):
    """Concatenate the items of every sublist into one flat list, in order."""
    combined = []
    for sublist in list_of_lists:
        combined.extend(sublist)
    return combined
def process_chunk(texts):
    """Run one chunk of texts through ``nlp.pipe`` and post-process each result.

    ``nlp`` and ``lemmatize_pipe`` are module-level names defined outside this
    block (``nlp`` is presumably a loaded spaCy pipeline -- TODO confirm).

    Args:
        texts: Iterable of raw texts handed to ``nlp.pipe``.

    Returns:
        A list holding ``lemmatize_pipe(doc)`` for every document produced by
        ``nlp.pipe``, in the order the documents are produced.
    """
    docs = nlp.pipe(texts, batch_size=20)
    return [lemmatize_pipe(doc) for doc in docs]
def preprocess_parallel(texts, chunksize=100, n_jobs=7):
    """Preprocess ``texts`` in parallel worker processes, one chunk per task.

    The input is cut into slices of ``chunksize`` items, every slice is
    handed to ``process_chunk`` through joblib, and the per-chunk results are
    concatenated back into a single list.

    Args:
        texts: Sliceable, sized collection of raw texts (must support
            ``len()`` and ``texts[a:b]``).
        chunksize: Number of texts sent to a worker per task.
        n_jobs: Number of worker processes passed to ``joblib.Parallel``.

    Returns:
        A flat list with one preprocessed entry per input text, in input order.
    """
    executor = Parallel(n_jobs=n_jobs, backend='multiprocessing', prefer="processes")
    do = delayed(process_chunk)
    # The chunk bound must come from ``texts`` itself. It previously used the
    # length of an unrelated global DataFrame, which either raised NameError or
    # silently truncated/padded the chunks when the two lengths differed.
    tasks = (do(chunk) for chunk in chunker(texts, len(texts), chunksize=chunksize))
    result = executor(tasks)
    return flatten(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment