Last active
July 28, 2018 22:34
-
-
Save Helw150/51c5f78f920e659866a295e283a8883f to your computer and use it in GitHub Desktop.
A Python script that processes large files across multiple processes and shows a rough progress bar
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Counts the number of times a word occurs in a very large text file""" | |
from __future__ import print_function | |
import os | |
import sys | |
import argparse | |
import textacy | |
import multiprocessing | |
from tqdm import tqdm | |
from collections import Counter | |
from itertools import zip_longest | |
def grouper(n, iterable, padvalue=None):
    """Yield successive ``n``-tuples drawn from ``iterable``.

    The final tuple is filled out with ``padvalue`` when the number of items
    is not a multiple of ``n``.
    """
    # Every column of the zip pulls from the *same* iterator, so each output
    # tuple consumes the next n items in order.
    shared_iter = iter(iterable)
    columns = (shared_iter,) * n
    return zip_longest(*columns, fillvalue=padvalue)
def process_chunk(chunk):
    """Count the words in one piece of text and estimate its size in bytes.

    Args:
        chunk: A string of text, or a falsy value such as ``None`` or ``""``.

    Returns:
        A ``(wordcount, size)`` tuple. ``wordcount`` is a ``Counter`` of the
        whitespace-separated tokens in ``chunk``; ``size`` is the length of
        ``chunk`` when encoded as UTF-8. A falsy ``chunk`` yields
        ``(Counter(), 0)``.
    """
    if not chunk:
        # Return an empty Counter rather than None so the result can be
        # added to / summed with other Counters without a TypeError.
        return (Counter(), 0)
    # The encoded length approximates the bytes this text occupied on disk
    # (exact for ASCII/UTF-8 input), unlike sys.getsizeof, whose value
    # depends on the interpreter's internal string layout.
    # errors="replace" keeps an unencodable code point from aborting the count.
    size = len(chunk.encode("utf-8", errors="replace"))
    wordcount = Counter(chunk.split())
    return (wordcount, size)
def main(arguments):
    """Count word occurrences in a text file using a pool of worker processes.

    Reads ``infile`` in groups of 1000 lines, hands each line to
    ``process_chunk`` in a pool of 8 processes, merges the per-line counts,
    and writes one ``"<word> <count>"`` line per distinct word to ``outfile``.
    A tqdm progress bar is advanced by the sizes ``process_chunk`` reports.

    Args:
        arguments: Command-line arguments (without the program name):
            the input file path followed by the output file path.

    Returns:
        None.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'infile', help="Input file", type=argparse.FileType('r'))
    parser.add_argument(
        'outfile', help="Output file", type=argparse.FileType('w'))
    args = parser.parse_args(arguments)

    try:
        filesize = os.path.getsize(args.infile.name)
    except (OSError, TypeError):
        # The input is not a regular file on disk (e.g. "-" for stdin);
        # tqdm then shows a running counter instead of a percentage.
        filesize = None

    # Start from an empty Counter so an empty input still produces
    # a valid (empty) result instead of None.
    total_word_counts = Counter()

    # The pool is terminated when the with-block exits; that is safe because
    # map() blocks until every result of the batch has been returned.
    with multiprocessing.Pool(8) as pool, tqdm(total=filesize) as pbar:
        for chunk in grouper(1000, args.infile):
            # The last group is padded with None up to 1000 entries; drop
            # the padding rather than shipping it to the workers.
            lines = [line for line in chunk if line is not None]
            results = pool.map(process_chunk, lines)
            for word_counts, _ in results:
                # Skip empty/None counts so they cannot break the merge.
                if word_counts:
                    total_word_counts.update(word_counts)
            pbar.update(sum(size for _, size in results))

    for item in total_word_counts.items():
        args.outfile.write("{} {}\n".format(*item))
# Script entry point: run main() on the command-line arguments (program name
# excluded) and use its return value as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment