Last active
July 13, 2017 06:07
-
-
Save yunhan0/9e658cc3bab29dba7dab11966f001159 to your computer and use it in GitHub Desktop.
Example of processing large data file without running out of memory.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# Yunhan Li
# Example of processing a large data file without running out of memory.
import os
import sys
import time
import timeit
def processFile(filename, output, bufferlimit=10000):
    """Write the length of every line of ``filename`` to ``output``.

    The input is streamed line by line, so only the current line plus a
    bounded batch of pending results is held in memory at any time. Each
    output line holds the character count of the matching input line,
    counting its trailing newline when one is present.

    Args:
        filename: Path of the text file to read.
        output: Path of the file to write; it is created or truncated.
        bufferlimit: Number of results to collect before they are written
            to ``output`` with a single ``writelines`` call.

    When finished, the elapsed time is printed to stdout.
    """
    # timeit.default_timer exists on both Python 2 and 3, unlike
    # time.clock(), which was removed in Python 3.8.
    started = timeit.default_timer()
    pending = []
    # The with statement closes both files even if the loop raises.
    with open(filename) as src, open(output, "w") as dst:
        # Iterating the file object reads it lazily, one line at a time.
        for line in src:
            pending.append(str(len(line)) + "\n")
            # Batch the writes instead of hitting the file on every line.
            if len(pending) >= bufferlimit:
                dst.writelines(pending)
                pending = []
        # Flush whatever is left over from the last, partial batch.
        dst.writelines(pending)
    print("Done. Using " + str(timeit.default_timer() - started) + "s")
if __name__ == "__main__":
    # sys.argv holds the script name, the input path and the output path
    # (the output file does not have to exist yet).
    if len(sys.argv) != 3:
        print("Number of arguments does not match.\nThis program takes 2 arguments: the input file path and the output file path.")
        sys.exit(1)
    your_file = sys.argv[1]
    output_file = sys.argv[2]
    # Stop on invalid input instead of falling through to processFile,
    # which would otherwise run (and fail) after the message was printed.
    if not your_file.endswith(".txt"):
        print("I stupidly only accept txt file.")
        sys.exit(1)
    if not os.path.exists(your_file):
        print("file does not exist.")
        sys.exit(1)
    processFile(your_file, output_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment