Read a big file by splitting it into chunks and seeking to each part in parallel
#!/usr/bin/python3
import multiprocessing as mp
import os
def process_each_chunk(file, chunk_start, chunk_size):
    # Open in binary mode so the byte offsets computed by specify_chunk() are valid here.
    # Lines are bytes; decode them if process() expects str.
    with open(file, 'rb') as f:
        f.seek(chunk_start)
        lines = f.read(chunk_size).splitlines()
        for line in lines:
            process(line)
def process(data):
    # TODO: implement stuff here
    print(data)
def specify_chunk(f, chunk_size=1024 * 1024):
    # Yield (start, size) pairs roughly chunk_size bytes long, each ending on a line boundary.
    file_end = os.path.getsize(f)
    with open(f, 'rb') as f:
        chunk_end = f.tell()
        while True:
            chunk_start = chunk_end
            f.seek(chunk_size, 1)   # jump ~chunk_size bytes forward
            f.readline()            # then advance to the next newline
            chunk_end = f.tell()
            yield chunk_start, chunk_end - chunk_start
            if chunk_end >= file_end:   # >= avoids yielding an empty trailing chunk
                break
if __name__ == '__main__':
    file_path = './data/test.csv'  # really big file
    pool = mp.Pool(5)
    jobs = []
    for chunk_start, chunk_size in specify_chunk(file_path):
        jobs.append(
            pool.apply_async(process_each_chunk,
                             (file_path, chunk_start, chunk_size))
        )

    # wait for all jobs to finish
    for job in jobs:
        job.get()

    pool.close()
    pool.join()
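
The workers above only print; if you want results back instead, apply_async already returns an AsyncResult, and .get() hands back whatever the worker function returns. Below is a minimal sketch of that variation, meant to replace the __main__ block and reuse specify_chunk from the gist; the count_lines_in_chunk helper and the line-count aggregation are illustrative assumptions, not part of the original.

# Hypothetical variation: each worker returns a per-chunk line count instead of printing.
def count_lines_in_chunk(file, chunk_start, chunk_size):
    with open(file, 'rb') as f:
        f.seek(chunk_start)
        return len(f.read(chunk_size).splitlines())

if __name__ == '__main__':
    file_path = './data/test.csv'
    with mp.Pool(5) as pool:  # Pool as a context manager closes the workers automatically
        jobs = [pool.apply_async(count_lines_in_chunk, (file_path, start, size))
                for start, size in specify_chunk(file_path)]
        total_lines = sum(job.get() for job in jobs)  # sum the per-chunk counts
    print(total_lines)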