Process-based parallelism in Python for processing a 500k+ row dataset
from multiprocessing import cpu_count, get_context
from tqdm.auto import tqdm
from time import sleep


# A fake function to simulate some work
def process_data(x: str) -> str:
    sleep(0.001)
    return x + " processed"


def main():
    # The 500k+ row dataset we want to process
    sample_dataset = [f"Sentence {i}" for i in range(550_000)]

    # The output data
    output_data = []

    # Get the multiprocessing context
    ctx = get_context("spawn")

    # Process-based parallelism
    with ctx.Pool(cpu_count()) as pool:
        for data in tqdm(
            pool.imap_unordered(process_data, sample_dataset),
            total=len(sample_dataset),
            desc="Processing data",
        ):
            output_data.append(data)

    # Done!
    # e.g. "Sentence 3 processed" (imap_unordered yields results in completion
    # order, so output_data[0] is not necessarily "Sentence 0 processed")
    print(output_data[0])


if __name__ == "__main__":
    main()
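A note on the design choice: with 550,000 tiny tasks, the per-item round trip between the main process and the workers can dominate the runtime, because imap_unordered sends items one at a time by default. Both imap and imap_unordered accept a chunksize argument that batches items per worker dispatch, and imap additionally preserves the input order. The variant below is a minimal sketch of the same loop using pool.imap with a larger chunk size; the chunk size of 256 is an illustrative value, not part of the original gist, and should be tuned to the workload.

from multiprocessing import cpu_count, get_context
from time import sleep

from tqdm.auto import tqdm


# Same fake work function as above
def process_data(x: str) -> str:
    sleep(0.001)
    return x + " processed"


def main():
    sample_dataset = [f"Sentence {i}" for i in range(550_000)]
    ctx = get_context("spawn")

    with ctx.Pool(cpu_count()) as pool:
        # imap preserves input order; chunksize=256 (an assumed, illustrative value)
        # batches items per dispatch, cutting inter-process communication overhead
        # when there are many small tasks.
        output_data = list(
            tqdm(
                pool.imap(process_data, sample_dataset, chunksize=256),
                total=len(sample_dataset),
                desc="Processing data (ordered)",
            )
        )

    # With imap, results come back in input order
    print(output_data[0])  # "Sentence 0 processed"


if __name__ == "__main__":
    main()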