jwnelson · August 29, 2015 14:23
diff --git a/nagaram.py b/nagaram.py
 #!/usr/bin/python
 import math
 import sys
 import argparse
 import time

 # Nagaram.py
 #
 # OVERVIEW
 # Searches a dictionary of common English words for anagrams with at least
 # four letters and as many anagrams as there are letters in the word.
 #
 # USAGE
 # Running this script with no options will use default settings for the dictionary file
 # to use (/usr/share/dict/words) and debugging verbosity (1).
 # Run "$python nagaram.py -h" for option details.
 #
 # ALGORITHM SUMMARY
 # A hash value (of sorts) is computed for each word in the dictionary by
 # summing together the cube of the ASCII values of the characters in the word so that
 # only words with the exact same letters in them (anagrams) will have the
 # same hash value. A binary search algorithm is then used to place each word
 # into a 2D list with rows that hold all the words with the same hash value.
 # At the end, the anagram array will look like this...
 #
 #             0             1             2             3
 #    0 |<hashval 0> | <anagram 01> | <anagram 02> | <anagram 03> . . .
 #    - |------------|--------------|--------------|------------
 #    1 |<hashval 1> | <anagram 11> | <anagram 12> |<anagram 13> . . .
 #    - |------------|--------------|--------------|------------
 #    2 |<hashval 2> | <anagram 21> | <anagram 22> |<anagram 23> . . .
 #    - |------------|--------------|--------------|------------
 #             .             .             .             .
 #             .             .             .             .
 #             .             .             .             .
 #
 # The anagrams are read from the rows.

 # Evaluates the hash value of a word from its letters
 # Calculates the sum of cube of the ASCII values of the word's letters so that
 # every anagram has a distinct hash value.
 def  hash_eval(word):

    hashval = 0
    for n in range(len(word)):
        c = word[n]
        hashval += pow(ord(c), 3)
    
    dprint('"%s" hashval = %d' %(word, hashval), 3)
    return hashval

 # Uses a binary search (implemented in the function binary_insert()) to insert the word
 # into numerical order by the word's hash value. If an anagram of the word is found, that is
 # if other words with the same hash values are found, the word is appended to the end of the 
 # row for that hash value.
 # Takes a tuple containing a hash value and a word, and a 2D list (insertlist) containing hash values
 # in column 0 sortedi n numerical order, and anagrams in the rest of the columns. 
 # Returns insertlist with wordtuple inserted into the list by its hashvalue
 def sorted_insert(wordtuple, insertlist):
    dprint('Searching list for anagrams of \'%s\'' %wordtuple[1], 3)
    
    # Account for an empty list
    if not insertlist:
        dprint('The list is empty. Inserting %s at element 0' %wordtuple[1], 3)
        insertlist = [wordtuple]
        return insertlist
        
    else:     
        inlen = len(insertlist)
        lowvalindex = 0
        highvalindex = len(insertlist)-1
    
        # Find the column of the array 
        insertElement, append = binary_insert(wordtuple[0], lowvalindex, highvalindex, insertlist)
        if append == 1:     # An existing anagram has been found. Add it to the column
            dprint('Adding %s to element %d' %(wordtuple[1], insertElement), 3)
            insertlist[insertElement].append(wordtuple[1])
        elif append == 0:       # No anagram found - insert the word before the given index
            dprint('Inserting %s before element %d' %(wordtuple[1], insertElement), 3)
            insertlist.insert((insertElement), wordtuple)
            
        return insertlist

 # Binary search function to find the element in the anagram list that has the given hash value
 def binary_insert(hashval, lowvalindex, highvalindex, insertlist):  
        
    
    dprint('lowval index = %d\thighval index = %d' %(lowvalindex, highvalindex), 4)
    if DEBUG > 3:
        count = -1
        for element in insertlist:
            count+=1
            dprint('\t%d\t%s\t%d' %(count, element[1], element[0]), 4)
            
    # Calculate the "pivot" midpoint index in the list     
    pivot = (highvalindex - lowvalindex)/2 + lowvalindex 
    #pivot = int(math.ceil(len(insertlist)/2)) + lowvalindex - 1
    dprint('pivot index = %s' %pivot, 4)
    #pivot = insertlist[int(math.floor(lowval + highval/2))][0]
    
    # This is a "new" anagram. Place it before or after the current
    # pivot depending on the given hashval
    if highvalindex == lowvalindex:
        if hashval > insertlist[highvalindex][0]:
            return highvalindex + 1, 0
        if hashval < insertlist[highvalindex][0]:
            return highvalindex, 0
        
    # The hashval is in the first half
    if hashval < insertlist[pivot][0]:
        
        return binary_insert(hashval, lowvalindex, pivot, insertlist)
        

    # The hashval is in the second half
    if hashval > insertlist[pivot][0]:
        return binary_insert(hashval, pivot + 1, highvalindex, insertlist)
        
        
    # Found the hashval
    if hashval == insertlist[pivot][0]:
        return pivot, 1

 # Removes anagram groups with fewer anagrams than letters and then writes the
 # remaining anagrams to the output file "nagaram.out"
 def write_list(inlist):
    outf = open(OUTFILE, 'w')
    listlen = len(inlist)
    
    # Remove the anagrams for which there are fewer anagrams than there are
    # letters in the anagrams
    global anagramlist

    element = 0
    dprint('Removing anagrams with fewer anagrams than there are letters and writing the remainder to nagaram.out', 1)
    while element < listlen:
        anagram = inlist[element]
        wordlen = len(anagram[1])
    
        if len(anagram) - 1 < wordlen:
            dprint('Removing the following anagrams: ', 3)
            dprint(anagram[1:], 3)
            inlist.pop(element)
        else:
            anagrams = len(inlist[element])
            for m in range(1, anagrams - 1):
                if anagrams >= len(anagram) - 1:
                    outf.write(str(inlist[element][m]))
                    outf.write(', ')
            if anagrams >= len(anagram) - 1:
                outf.write(inlist[element][anagrams - 1])
                outf.write('\n')
                    
        element += 1
        listlen = len(inlist) - 1
        
    outf.close()
    return
    
 # Simple debugging printout/logfile handler that accounts for debugging verbosity
 # passed in as a parameter.    
 def dprint(message, debugflag = 0):
    timestamp = time.time() - starttime
    if DEBUG >= debugflag:
        print message
        global logf
        logf.write('\n')
        logf.write('%f\t%s' %(timestamp, message))

    return



 ### MAIN ###
 starttime = time.time()

 defaultdict = '/usr/share/dict/words'
 OUTFILE = 'nagaram.out'

 # Setup command line argument parsing using argparse
 parser = argparse.ArgumentParser(description = 'Find all the anagrams in a given dictionary with at least four letters and at least as many anagrams as there are letters')

 parser.add_argument('-d', '--debug', action = 'store', type = int, help = 'Enable different levels of program print statements for debugging. 0 = debugging is off. The depth and detail of the debugging statements printed to the terminal and written to the debug log increases with higher numbers. Defaults to 1 = minimal debugging printouts.',  required = False, default = 1) 
 parser.add_argument('-f', action = 'store', help = 'The file containing the dictionary to look for anagrams in. If not given, this defaults to /usr/share/dict/words.', required = False, default = defaultdict)

 args = parser.parse_args()
 DEBUG = args.debug
 infile = args.f

 # Open a logfile to write to
 logf = open('debuglog.txt', 'w')
    
 # Logging header
 dprint('=====================================================================', 0)
 dprint('Nagaram.py - Dictionary anagram parser.', 0)
 dprint('\nWritten by Jack Nelson \[email protected]\n(310) 863-9873\nJune 2015', 0)
 dprint('=====================================================================', 0)
 dprint('Parameters: debug = %d, dictionary = %s' %(DEBUG, infile),1)
    
 # Read the contents of the dictionary file into the list "dictionary"
 wordf = open(infile, 'r')
 outf = open('nagaram_test.txt', 'w')

 dprint('Writing the entire dictionary into a list', 1) 
 dictionary = list(wordf)
 wordf.close()

 # Scan the entire dictionary and throw out words with less than four characters
 anagramlist = []   # Create an empty list to hold the anagrams
 dictlen = len(dictionary)

 # Step through the remaining words in the dictionary and search for anagrams
 dprint('Searching for anagrams and writing them to the list', 1)  
 for w in range(dictlen):
    
    
    word = dictionary[w]
    word = word.strip('\n')
    
    
    # Check if the word is possessive case (ends with an "'s") and therefore
    # generates duplicates. 
    if '\'s' in word:
        continue
    word = word.split('\'')[0]
      
    dprint(('\nReading in word \''+ word+ '\''), 4)
        
    if word.istitle() == True | word.isupper():
        
        dprint('Removing \'%s\' because it is a title or an acronym' %word, 4)
        continue
    
    if len(word) >= 4:
    
        # Get the hash value of the word
        hashval = hash_eval(word)
        
        
        # Sort and insert the word into the anagram list using the binary sorted insert function
        anagramlist = sorted_insert([hashval, word], anagramlist)

 write_list(anagramlist)
        

 dprint('Wrote list of anagrams to %s' %OUTFILE, 1)
 dprint('nagaram.py is done.', 0)
 print('Output is in the file "nagaram.out"')

 finishtime = time.time()
 elapsedtime = finishtime - starttime
 print "Time elapsed: %ds\n" %elapsedtime
 logf.write('\n Total time elapsed: %fs' %elapsedtime)
 logf.close()
	#!/usr/bin/python
	import math
	import sys
	import argparse
	import time

	# Nagaram.py
	#
	# OVERVIEW
	# Searches a dictionary of common English words for anagrams with at least
	# four letters and as many anagrams as there are letters in the word.
	#
	# USAGE
	# Running this script with no options will use default settings for the dictionary file
	# to use (/usr/share/dict/words) and debugging verbosity (1).
	# Run "$python nagaram.py -h" for option details.
	#
	# ALGORITHM SUMMARY
	# A hash value (of sorts) is computed for each word in the dictionary by
	# summing together the cube of the ASCII values of the characters in the word so that
	# only words with the exact same letters in them (anagrams) will have the
	# same hash value. A binary search algorithm is then used to place each word
	# into a 2D list with rows that hold all the words with the same hash value.
	# At the end, the anagram array will look like this...
	#
	# 0 1 2 3
	# 0 \|<hashval 0> \| <anagram 01> \| <anagram 02> \| <anagram 03> . . .
	# - \|------------\|--------------\|--------------\|------------
	# 1 \|<hashval 1> \| <anagram 11> \| <anagram 12> \|<anagram 13> . . .
	# - \|------------\|--------------\|--------------\|------------
	# 2 \|<hashval 2> \| <anagram 21> \| <anagram 22> \|<anagram 23> . . .
	# - \|------------\|--------------\|--------------\|------------
	# . . . .
	# . . . .
	# . . . .
	#
	# The anagrams are read from the rows.

	# Evaluates the hash value of a word from its letters
	# Calculates the sum of cube of the ASCII values of the word's letters so that
	# every anagram has a distinct hash value.
	def hash_eval(word):

	hashval = 0
	for n in range(len(word)):
	c = word[n]
	hashval += pow(ord(c), 3)

	dprint('"%s" hashval = %d' %(word, hashval), 3)
	return hashval

	# Uses a binary search (implemented in the function binary_insert()) to insert the word
	# into numerical order by the word's hash value. If an anagram of the word is found, that is
	# if other words with the same hash values are found, the word is appended to the end of the
	# row for that hash value.
	# Takes a tuple containing a hash value and a word, and a 2D list (insertlist) containing hash values
	# in column 0 sortedi n numerical order, and anagrams in the rest of the columns.
	# Returns insertlist with wordtuple inserted into the list by its hashvalue
	def sorted_insert(wordtuple, insertlist):
	dprint('Searching list for anagrams of \'%s\'' %wordtuple[1], 3)

	# Account for an empty list
	if not insertlist:
	dprint('The list is empty. Inserting %s at element 0' %wordtuple[1], 3)
	insertlist = [wordtuple]
	return insertlist

	else:
	inlen = len(insertlist)
	lowvalindex = 0
	highvalindex = len(insertlist)-1

	# Find the column of the array
	insertElement, append = binary_insert(wordtuple[0], lowvalindex, highvalindex, insertlist)
	if append == 1: # An existing anagram has been found. Add it to the column
	dprint('Adding %s to element %d' %(wordtuple[1], insertElement), 3)
	insertlist[insertElement].append(wordtuple[1])
	elif append == 0: # No anagram found - insert the word before the given index
	dprint('Inserting %s before element %d' %(wordtuple[1], insertElement), 3)
	insertlist.insert((insertElement), wordtuple)

	return insertlist

	# Binary search function to find the element in the anagram list that has the given hash value
	def binary_insert(hashval, lowvalindex, highvalindex, insertlist):


	dprint('lowval index = %d\thighval index = %d' %(lowvalindex, highvalindex), 4)
	if DEBUG > 3:
	count = -1
	for element in insertlist:
	count+=1
	dprint('\t%d\t%s\t%d' %(count, element[1], element[0]), 4)

	# Calculate the "pivot" midpoint index in the list
	pivot = (highvalindex - lowvalindex)/2 + lowvalindex
	#pivot = int(math.ceil(len(insertlist)/2)) + lowvalindex - 1
	dprint('pivot index = %s' %pivot, 4)
	#pivot = insertlist[int(math.floor(lowval + highval/2))][0]

	# This is a "new" anagram. Place it before or after the current
	# pivot depending on the given hashval
	if highvalindex == lowvalindex:
	if hashval > insertlist[highvalindex][0]:
	return highvalindex + 1, 0
	if hashval < insertlist[highvalindex][0]:
	return highvalindex, 0

	# The hashval is in the first half
	if hashval < insertlist[pivot][0]:

	return binary_insert(hashval, lowvalindex, pivot, insertlist)


	# The hashval is in the second half
	if hashval > insertlist[pivot][0]:
	return binary_insert(hashval, pivot + 1, highvalindex, insertlist)


	# Found the hashval
	if hashval == insertlist[pivot][0]:
	return pivot, 1

	# Removes anagram groups with fewer anagrams than letters and then writes the
	# remaining anagrams to the output file "nagaram.out"
	def write_list(inlist):
	outf = open(OUTFILE, 'w')
	listlen = len(inlist)

	# Remove the anagrams for which there are fewer anagrams than there are
	# letters in the anagrams
	global anagramlist

	element = 0
	dprint('Removing anagrams with fewer anagrams than there are letters and writing the remainder to nagaram.out', 1)
	while element < listlen:
	anagram = inlist[element]
	wordlen = len(anagram[1])

	if len(anagram) - 1 < wordlen:
	dprint('Removing the following anagrams: ', 3)
	dprint(anagram[1:], 3)
	inlist.pop(element)
	else:
	anagrams = len(inlist[element])
	for m in range(1, anagrams - 1):
	if anagrams >= len(anagram) - 1:
	outf.write(str(inlist[element][m]))
	outf.write(', ')
	if anagrams >= len(anagram) - 1:
	outf.write(inlist[element][anagrams - 1])
	outf.write('\n')

	element += 1
	listlen = len(inlist) - 1

	outf.close()
	return

	# Simple debugging printout/logfile handler that accounts for debugging verbosity
	# passed in as a parameter.
	def dprint(message, debugflag = 0):
	timestamp = time.time() - starttime
	if DEBUG >= debugflag:
	print message
	global logf
	logf.write('\n')
	logf.write('%f\t%s' %(timestamp, message))

	return



	### MAIN ###
	starttime = time.time()

	defaultdict = '/usr/share/dict/words'
	OUTFILE = 'nagaram.out'

	# Setup command line argument parsing using argparse
	parser = argparse.ArgumentParser(description = 'Find all the anagrams in a given dictionary with at least four letters and at least as many anagrams as there are letters')

	parser.add_argument('-d', '--debug', action = 'store', type = int, help = 'Enable different levels of program print statements for debugging. 0 = debugging is off. The depth and detail of the debugging statements printed to the terminal and written to the debug log increases with higher numbers. Defaults to 1 = minimal debugging printouts.', required = False, default = 1)
	parser.add_argument('-f', action = 'store', help = 'The file containing the dictionary to look for anagrams in. If not given, this defaults to /usr/share/dict/words.', required = False, default = defaultdict)

	args = parser.parse_args()
	DEBUG = args.debug
	infile = args.f

	# Open a logfile to write to
	logf = open('debuglog.txt', 'w')

	# Logging header
	dprint('=====================================================================', 0)
	dprint('Nagaram.py - Dictionary anagram parser.', 0)
	dprint('\nWritten by Jack Nelson \[email protected]\n(310) 863-9873\nJune 2015', 0)
	dprint('=====================================================================', 0)
	dprint('Parameters: debug = %d, dictionary = %s' %(DEBUG, infile),1)

	# Read the contents of the dictionary file into the list "dictionary"
	wordf = open(infile, 'r')
	outf = open('nagaram_test.txt', 'w')

	dprint('Writing the entire dictionary into a list', 1)
	dictionary = list(wordf)
	wordf.close()

	# Scan the entire dictionary and throw out words with less than four characters
	anagramlist = [] # Create an empty list to hold the anagrams
	dictlen = len(dictionary)

	# Step through the remaining words in the dictionary and search for anagrams
	dprint('Searching for anagrams and writing them to the list', 1)
	for w in range(dictlen):


	word = dictionary[w]
	word = word.strip('\n')


	# Check if the word is possessive case (ends with an "'s") and therefore
	# generates duplicates.
	if '\'s' in word:
	continue
	word = word.split('\'')[0]

	dprint(('\nReading in word \''+ word+ '\''), 4)

	if word.istitle() == True \| word.isupper():

	dprint('Removing \'%s\' because it is a title or an acronym' %word, 4)
	continue

	if len(word) >= 4:

	# Get the hash value of the word
	hashval = hash_eval(word)


	# Sort and insert the word into the anagram list using the binary sorted insert function
	anagramlist = sorted_insert([hashval, word], anagramlist)

	write_list(anagramlist)


	dprint('Wrote list of anagrams to %s' %OUTFILE, 1)
	dprint('nagaram.py is done.', 0)
	print('Output is in the file "nagaram.out"')

	finishtime = time.time()
	elapsedtime = finishtime - starttime
	print "Time elapsed: %ds\n" %elapsedtime
	logf.write('\n Total time elapsed: %fs' %elapsedtime)
	logf.close()