Simple version of processing a large CSV file with Pandas and multiprocessing
import time
import multiprocessing as mp

import numpy as np
import pandas as pd


def processing_chunk(chunk):
    # Placeholder for the real per-row work.
    for _, row in chunk.iterrows():
        pass
    time.sleep(5)


def main():
    pool_size = 4
    pool = mp.Pool(pool_size)
    chunk_size = 1000 * pool_size
    count = 0
    for file_chunk in pd.read_csv('data.csv', chunksize=chunk_size):
        line = count * chunk_size
        print(f"Processing {chunk_size} lines after line {line}")
        # Split the chunk evenly across the pool. This method works best
        # when every sub-chunk takes a similar amount of time to process.
        pool.map(processing_chunk, np.array_split(file_chunk, pool_size))
        count += 1
    pool.close()
    pool.join()


if __name__ == "__main__":
    main()
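As the comment above notes, splitting each chunk evenly and calling pool.map works best when every sub-chunk takes a similar amount of time. When per-chunk processing times vary, a sketch like the following (the processing_chunk body here is a hypothetical stand-in, assuming the same data.csv input) feeds each pandas chunk to the pool as its own task via imap_unordered, so a faster worker can pick up the next chunk without waiting for a slow one:

import multiprocessing as mp

import pandas as pd


def processing_chunk(chunk):
    # Hypothetical stand-in for real per-chunk work; returns the row count.
    return len(chunk)


def main():
    pool_size = 4
    chunk_size = 1000
    with mp.Pool(pool_size) as pool:
        # Each pandas chunk becomes one task. imap_unordered yields results
        # as workers finish, so one unusually slow chunk does not stall
        # the rest of the pool the way a barrier-style pool.map call would.
        chunks = pd.read_csv('data.csv', chunksize=chunk_size)
        for rows_done in pool.imap_unordered(processing_chunk, chunks):
            print(f"Finished a chunk of {rows_done} rows")


if __name__ == "__main__":
    main()

The trade-off is that results arrive out of order, so this variant suits work where chunks are independent and ordering of output does not matter.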