'''This script is a "simulation" of an external merge sort in a DBMS. In a DBMS
you read pages from disk and can only hold a particular number of them in
memory at once. This implementation lets you set the number of pages via the
MAX_BUFFERS variable, and the size of a page via the MAX_BUFFER_SPACE
variable.'''
import os
import sys
import tempfile

# How many lines we can keep in memory at once per buffer before we are forced
# to write out to disk
MAX_BUFFER_SPACE = 10000

# How many buffers we can have in memory at once
MAX_BUFFERS = 10
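# With the defaults above, each sorted run holds at most MAX_BUFFER_SPACE =
# 10,000 integers, and each merge pass reads from at most MAX_BUFFERS = 10 run
# files at a time, so one pass folds up to ten runs (on the order of 100,000
# values) into a single larger run.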


def get_lines(file_name):
    '''Generator that yields the lines of a file, converted to integers.'''
    with open(file_name) as f:
        for line in f:
            yield int(line)


def get_next(iterator):
    '''A wrapper to get the next value from an iterator without having to worry
    about catching exceptions. It simply returns None when the iterator has no
    more elements.'''
    try:
        return next(iterator)
    except StopIteration:
        # The iterator is exhausted
        return None


def get_temp_file():
    '''Creates a temporary file and returns the file wrapper and its path as a
    tuple. Pass dir= to tempfile.mkstemp to pin the files to a particular
    directory for easier debugging.'''
    temp_file_handle, temp_file_path = tempfile.mkstemp()
    temp_file = os.fdopen(temp_file_handle, "w")
    return (temp_file, temp_file_path)


def write_temp_file(buff):
    '''Writes a list of integers out to a temporary file, one per line, and
    returns the path of that file.'''
    temp_file, temp_file_path = get_temp_file()
    for line in buff:
        temp_file.write(str(line) + "\n")
    temp_file.close()
    return temp_file_path


def split_into_buffers(file_name):
    '''Splits a large file into multiple sorted files. While this isn't
    strictly necessary, and indeed even hurts performance, it keeps the
    algorithm simple: with the split done, we only need to think about how the
    n-way merge works.'''
    output_files = []
    output = []
    # Get our line iterator
    lines = get_lines(file_name)
    while True:
        # Get the next line and append it to the output buffer
        next_line = get_next(lines)
        if next_line is not None:
            output.append(next_line)
        # If we run out of buffer space, or run out of data, sort the buffer
        # and write it out to disk
        if len(output) >= MAX_BUFFER_SPACE or next_line is None:
            if len(output) > 0:
                output.sort()
                output_files.append(write_temp_file(output))
                output = []
            if next_line is None:
                break
    # Return the list of files we have written out
    return output_files
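

# The helper below is not part of the original script; it is a minimal sketch
# for producing a test input so the sort can be exercised end to end. It writes
# newline-separated random integers to a file. The function name, default file
# name, and count are illustrative choices, not anything the script requires.
def write_sample_input(file_name="sample_numbers.txt", count=250000):
    import random
    with open(file_name, "w") as f:
        for _ in range(count):
            f.write(str(random.randint(0, 10 ** 9)) + "\n")
    return file_name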


def merge_files(initial_file_names):
    '''Implements a naive n-way merge of pre-sorted files. The naive method
    looks at the current value from each file it has open, finds the smallest
    of these, writes it to the output file, and repeats until every input file
    has been used up completely.'''
    all_file_names = initial_file_names[:]
    # This algorithm repeats until we have no files left to merge
    while True:
        # If we have a single file we are finished
        if len(all_file_names) == 1:
            return all_file_names[0]
        # If we can fit at least 1 line from each file in memory we can just go
        # ahead and do a regular merge of all of them. If not, we need to take
        # a subset and merge those, then iterate.
        # This isn't strictly what a DBMS would do, as we don't have to load an
        # entire page into memory and can instead go line by line. We only open
        # as many files as we have pages, to approximate the behaviour.
        if len(all_file_names) > MAX_BUFFERS:
            file_names = all_file_names[:MAX_BUFFERS]
            remaining_file_names = all_file_names[MAX_BUFFERS:]
        else:
            file_names = all_file_names[:]
            remaining_file_names = []
        print("Merging", len(file_names), "of",
              len(file_names) + len(remaining_file_names), "files")
        # Since we know every input is already sorted, we can merge the files
        # directly, holding no more than one value per file in memory. We keep
        # going until all of the open files are exhausted.
        generators = [get_lines(name) for name in file_names]
        output_file, output_file_path = get_temp_file()
        current_values = []
        # Pre-populate with the very first value from each file
        for generator in generators:
            current_values.append(get_next(generator))
        while True:
            smallest_index = 0
            # Find the first value that isn't None (i.e. its file still has
            # values left in it)
            while current_values[smallest_index] is None:
                smallest_index += 1
                # If the entire list is None then break out
                if smallest_index >= len(current_values):
                    break
            # Check that we have at least one number left
            if smallest_index < len(current_values):
                # Find the smallest number remaining
                for i in range(smallest_index, len(current_values)):
                    if current_values[i] is None:
                        continue
                    if current_values[i] < current_values[smallest_index]:
                        smallest_index = i
                # Write it out
                output_file.write(str(current_values[smallest_index]) + "\n")
                # Replace the value we consumed with the next one from its file
                current_values[smallest_index] = get_next(generators[smallest_index])
            # Check whether every open file is now empty
            if smallest_index >= len(current_values):
                # All open files are exhausted, so we are done with this round.
                # Add the output file to the list and repeat everything until
                # we are left with just a single file.
                all_file_names = remaining_file_names[:]
                all_file_names.append(output_file_path)
                output_file.close()
                # Delete the files we just merged so temporary files don't pile
                # up and fill the disk
                for file_name in file_names:
                    os.remove(file_name)
                break
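

# For comparison only (not the approach used above): the same n-way merge of a
# single batch of pre-sorted files can be expressed with the standard library's
# heapq.merge, which likewise keeps only one value per input iterator in
# memory. merge_one_batch is a name introduced here; it does not perform the
# batching, re-merging, or cleanup that merge_files handles.
def merge_one_batch(file_names):
    import heapq
    output_file, output_file_path = get_temp_file()
    for value in heapq.merge(*[get_lines(name) for name in file_names]):
        output_file.write(str(value) + "\n")
    output_file.close()
    return output_file_path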


def main():
    if len(sys.argv) != 2:
        print("Expected a single argument of a file name")
        sys.exit(1)
    files = split_into_buffers(sys.argv[1])
    print("Split files")
    print("Sorted file:", merge_files(files))


if __name__ == "__main__":
    main()
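
# Example usage from a shell, assuming this script is saved as external_sort.py
# and numbers.txt is a newline-separated file of integers (both names are just
# illustrative):
#
#   python external_sort.py numbers.txt
#
# The script prints the path of the temporary file holding the fully sorted
# output.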