'''This script is a "simulation" of an external merge sort in a DBMS. In a DBMS
you read pages from disk and can only hold a particular number of them in
memory at once. This implementation lets you set the number of pages via the
MAX_BUFFERS variable, and the size of a page via the MAX_BUFFER_SPACE
variable.'''
import os
import sys
import tempfile

# How many lines we can keep in memory at once per buffer before we are forced
# to write out to disk
MAX_BUFFER_SPACE = 10000

# How many buffers we can have in memory at once
MAX_BUFFERS = 10
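# With the defaults above, each sorted run holds at most MAX_BUFFER_SPACE =
# 10,000 integers, and each merge pass reads from at most MAX_BUFFERS = 10 run
# files at a time, so one pass folds up to ten runs (on the order of 100,000
# values) into a single larger run.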


def get_lines(file_name):
    '''Generator that yields the lines of a file, converted to integers.'''
    with open(file_name) as f:
        for line in f:
            yield int(line)


def get_next(iterator):
    '''A wrapper to get the next value from an iterator without having to worry
    about catching exceptions. It simply returns None when the iterator has no
    more elements.'''
    try:
        return next(iterator)
    except StopIteration:
        # The iterator is exhausted
        return None


def get_temp_file():
    '''Creates a temporary file and returns the file wrapper and its path as a
    tuple. Pass dir= to tempfile.mkstemp to pin the files to a particular
    directory for easier debugging.'''
    temp_file_handle, temp_file_path = tempfile.mkstemp()
    temp_file = os.fdopen(temp_file_handle, "w")
    return (temp_file, temp_file_path)


def write_temp_file(buff):
    '''Writes a list of integers out to a temporary file, one per line, and
    returns the path of that file.'''
    temp_file, temp_file_path = get_temp_file()
    for line in buff:
        temp_file.write(str(line) + "\n")
    temp_file.close()
    return temp_file_path


def split_into_buffers(file_name):
    '''Splits a large file into multiple sorted files. While this isn't
    strictly necessary, and indeed even hurts performance, it keeps the
    algorithm simple: with the split done, we only need to think about how the
    n-way merge works.'''
    output_files = []
    output = []
    # Get our line iterator
    lines = get_lines(file_name)
    while True:
        # Get the next line and append it to the output buffer
        next_line = get_next(lines)
        if next_line is not None:
            output.append(next_line)
        # If we run out of buffer space, or run out of data, sort the buffer
        # and write it out to disk
        if len(output) >= MAX_BUFFER_SPACE or next_line is None:
            if len(output) > 0:
                output.sort()
                output_files.append(write_temp_file(output))
                output = []
            if next_line is None:
                break
    # Return the list of files we have written out
    return output_files
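

# The helper below is not part of the original script; it is a minimal sketch
# for producing a test input so the sort can be exercised end to end. It writes
# newline-separated random integers to a file. The function name, default file
# name, and count are illustrative choices, not anything the script requires.
def write_sample_input(file_name="sample_numbers.txt", count=250000):
    import random
    with open(file_name, "w") as f:
        for _ in range(count):
            f.write(str(random.randint(0, 10 ** 9)) + "\n")
    return file_name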


def merge_files(initial_file_names):
    '''Implements a naive n-way merge of pre-sorted files. The naive method
    looks at the current value from each file it has open, finds the smallest
    of these, writes it to the output file, and repeats until every input file
    has been used up completely.'''
    all_file_names = initial_file_names[:]
    # This algorithm repeats until we have no files left to merge
    while True:
        # If we have a single file we are finished
        if len(all_file_names) == 1:
            return all_file_names[0]
        # If we can fit at least 1 line from each file in memory we can just go
        # ahead and do a regular merge of all of them. If not, we need to take
        # a subset and merge those, then iterate.
        # This isn't strictly what a DBMS would do, as we don't have to load an
        # entire page into memory and can instead go line by line. We only open
        # as many files as we have pages, to approximate the behaviour.
        if len(all_file_names) > MAX_BUFFERS:
            file_names = all_file_names[:MAX_BUFFERS]
            remaining_file_names = all_file_names[MAX_BUFFERS:]
        else:
            file_names = all_file_names[:]
            remaining_file_names = []
        print("Merging", len(file_names), "of",
              len(file_names) + len(remaining_file_names), "files")
        # Since we know every input is already sorted, we can merge the files
        # directly, holding no more than one value per file in memory. We keep
        # going until all of the open files are exhausted.
        generators = [get_lines(name) for name in file_names]
        output_file, output_file_path = get_temp_file()
        current_values = []
        # Pre-populate with the very first value from each file
        for generator in generators:
            current_values.append(get_next(generator))
        while True:
            smallest_index = 0
            # Find the first value that isn't None (i.e. its file still has
            # values left in it)
            while current_values[smallest_index] is None:
                smallest_index += 1
                # If the entire list is None then break out
                if smallest_index >= len(current_values):
                    break
            # Check that we have at least one number left
            if smallest_index < len(current_values):
                # Find the smallest number remaining
                for i in range(smallest_index, len(current_values)):
                    if current_values[i] is None:
                        continue
                    if current_values[i] < current_values[smallest_index]:
                        smallest_index = i
                # Write it out
                output_file.write(str(current_values[smallest_index]) + "\n")
                # Replace the value we consumed with the next one from its file
                current_values[smallest_index] = get_next(generators[smallest_index])
            # Check whether every open file is now empty
            if smallest_index >= len(current_values):
                # All open files are exhausted, so we are done with this round.
                # Add the output file to the list and repeat everything until
                # we are left with just a single file.
                all_file_names = remaining_file_names[:]
                all_file_names.append(output_file_path)
                output_file.close()
                # Delete the files we just merged so temporary files don't pile
                # up and fill the disk
                for file_name in file_names:
                    os.remove(file_name)
                break
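

# For comparison only (not the approach used above): the same n-way merge of a
# single batch of pre-sorted files can be expressed with the standard library's
# heapq.merge, which likewise keeps only one value per input iterator in
# memory. merge_one_batch is a name introduced here; it does not perform the
# batching, re-merging, or cleanup that merge_files handles.
def merge_one_batch(file_names):
    import heapq
    output_file, output_file_path = get_temp_file()
    for value in heapq.merge(*[get_lines(name) for name in file_names]):
        output_file.write(str(value) + "\n")
    output_file.close()
    return output_file_path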


def main():
    if len(sys.argv) != 2:
        print("Expected a single argument of a file name")
        sys.exit(1)
    files = split_into_buffers(sys.argv[1])
    print("Split files")
    print("Sorted file:", merge_files(files))


if __name__ == "__main__":
    main()
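
# Example usage from a shell, assuming this script is saved as external_sort.py
# and numbers.txt is a newline-separated file of integers (both names are just
# illustrative):
#
#   python external_sort.py numbers.txt
#
# The script prints the path of the temporary file holding the fully sorted
# output.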