Last active
August 29, 2015 14:23
-
-
Save jwnelson/490d178b77cbd5dbf4cd to your computer and use it in GitHub Desktop.
Searches for anagrams of words in a dictionary with at least four letters and at least as many anagrams as letters.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import math | |
import sys | |
import argparse | |
import time | |
# Nagaram.py | |
# | |
# OVERVIEW | |
# Searches a dictionary of common English words for anagrams with at least | |
# four letters and as many anagrams as there are letters in the word. | |
# | |
# USAGE | |
# Running this script with no options will use default settings for the dictionary file | |
# to use (/usr/share/dict/words) and debugging verbosity (1). | |
# Run "$python nagaram.py -h" for option details. | |
# | |
# ALGORITHM SUMMARY | |
# A hash value (of sorts) is computed for each word in the dictionary by | |
# summing together the cube of the ASCII values of the characters in the word so that | |
# only words with the exact same letters in them (anagrams) will have the | |
# same hash value. A binary search algorithm is then used to place each word | |
# into a 2D list with rows that hold all the words with the same hash value. | |
# At the end, the anagram array will look like this... | |
# | |
# 0 1 2 3 | |
# 0 |<hashval 0> | <anagram 01> | <anagram 02> | <anagram 03> . . . | |
# - |------------|--------------|--------------|------------ | |
# 1 |<hashval 1> | <anagram 11> | <anagram 12> |<anagram 13> . . . | |
# - |------------|--------------|--------------|------------ | |
# 2 |<hashval 2> | <anagram 21> | <anagram 22> |<anagram 23> . . . | |
# - |------------|--------------|--------------|------------ | |
# . . . . | |
# . . . . | |
# . . . . | |
# | |
# The anagrams are read from the rows. | |
# Evaluates the hash value of a word from its letters | |
# Calculates the sum of cube of the ASCII values of the word's letters so that | |
# every anagram has a distinct hash value. | |
def hash_eval(word): | |
hashval = 0 | |
for n in range(len(word)): | |
c = word[n] | |
hashval += pow(ord(c), 3) | |
dprint('"%s" hashval = %d' %(word, hashval), 3) | |
return hashval | |
# Uses a binary search (implemented in the function binary_insert()) to insert the word | |
# into numerical order by the word's hash value. If an anagram of the word is found, that is | |
# if other words with the same hash values are found, the word is appended to the end of the | |
# row for that hash value. | |
# Takes a tuple containing a hash value and a word, and a 2D list (insertlist) containing hash values | |
# in column 0 sortedi n numerical order, and anagrams in the rest of the columns. | |
# Returns insertlist with wordtuple inserted into the list by its hashvalue | |
def sorted_insert(wordtuple, insertlist): | |
dprint('Searching list for anagrams of \'%s\'' %wordtuple[1], 3) | |
# Account for an empty list | |
if not insertlist: | |
dprint('The list is empty. Inserting %s at element 0' %wordtuple[1], 3) | |
insertlist = [wordtuple] | |
return insertlist | |
else: | |
inlen = len(insertlist) | |
lowvalindex = 0 | |
highvalindex = len(insertlist)-1 | |
# Find the column of the array | |
insertElement, append = binary_insert(wordtuple[0], lowvalindex, highvalindex, insertlist) | |
if append == 1: # An existing anagram has been found. Add it to the column | |
dprint('Adding %s to element %d' %(wordtuple[1], insertElement), 3) | |
insertlist[insertElement].append(wordtuple[1]) | |
elif append == 0: # No anagram found - insert the word before the given index | |
dprint('Inserting %s before element %d' %(wordtuple[1], insertElement), 3) | |
insertlist.insert((insertElement), wordtuple) | |
return insertlist | |
# Binary search function to find the element in the anagram list that has the given hash value | |
def binary_insert(hashval, lowvalindex, highvalindex, insertlist): | |
dprint('lowval index = %d\thighval index = %d' %(lowvalindex, highvalindex), 4) | |
if DEBUG > 3: | |
count = -1 | |
for element in insertlist: | |
count+=1 | |
dprint('\t%d\t%s\t%d' %(count, element[1], element[0]), 4) | |
# Calculate the "pivot" midpoint index in the list | |
pivot = (highvalindex - lowvalindex)/2 + lowvalindex | |
#pivot = int(math.ceil(len(insertlist)/2)) + lowvalindex - 1 | |
dprint('pivot index = %s' %pivot, 4) | |
#pivot = insertlist[int(math.floor(lowval + highval/2))][0] | |
# This is a "new" anagram. Place it before or after the current | |
# pivot depending on the given hashval | |
if highvalindex == lowvalindex: | |
if hashval > insertlist[highvalindex][0]: | |
return highvalindex + 1, 0 | |
if hashval < insertlist[highvalindex][0]: | |
return highvalindex, 0 | |
# The hashval is in the first half | |
if hashval < insertlist[pivot][0]: | |
return binary_insert(hashval, lowvalindex, pivot, insertlist) | |
# The hashval is in the second half | |
if hashval > insertlist[pivot][0]: | |
return binary_insert(hashval, pivot + 1, highvalindex, insertlist) | |
# Found the hashval | |
if hashval == insertlist[pivot][0]: | |
return pivot, 1 | |
# Removes anagram groups with fewer anagrams than letters and then writes the | |
# remaining anagrams to the output file "nagaram.out" | |
def write_list(inlist): | |
outf = open(OUTFILE, 'w') | |
listlen = len(inlist) | |
# Remove the anagrams for which there are fewer anagrams than there are | |
# letters in the anagrams | |
global anagramlist | |
element = 0 | |
dprint('Removing anagrams with fewer anagrams than there are letters and writing the remainder to nagaram.out', 1) | |
while element < listlen: | |
anagram = inlist[element] | |
wordlen = len(anagram[1]) | |
if len(anagram) - 1 < wordlen: | |
dprint('Removing the following anagrams: ', 3) | |
dprint(anagram[1:], 3) | |
inlist.pop(element) | |
else: | |
anagrams = len(inlist[element]) | |
for m in range(1, anagrams - 1): | |
if anagrams >= len(anagram) - 1: | |
outf.write(str(inlist[element][m])) | |
outf.write(', ') | |
if anagrams >= len(anagram) - 1: | |
outf.write(inlist[element][anagrams - 1]) | |
outf.write('\n') | |
element += 1 | |
listlen = len(inlist) - 1 | |
outf.close() | |
return | |
# Simple debugging printout/logfile handler that accounts for debugging verbosity | |
# passed in as a parameter. | |
def dprint(message, debugflag = 0): | |
timestamp = time.time() - starttime | |
if DEBUG >= debugflag: | |
print message | |
global logf | |
logf.write('\n') | |
logf.write('%f\t%s' %(timestamp, message)) | |
return | |
### MAIN ### | |
starttime = time.time() | |
defaultdict = '/usr/share/dict/words' | |
OUTFILE = 'nagaram.out' | |
# Setup command line argument parsing using argparse | |
parser = argparse.ArgumentParser(description = 'Find all the anagrams in a given dictionary with at least four letters and at least as many anagrams as there are letters') | |
parser.add_argument('-d', '--debug', action = 'store', type = int, help = 'Enable different levels of program print statements for debugging. 0 = debugging is off. The depth and detail of the debugging statements printed to the terminal and written to the debug log increases with higher numbers. Defaults to 1 = minimal debugging printouts.', required = False, default = 1) | |
parser.add_argument('-f', action = 'store', help = 'The file containing the dictionary to look for anagrams in. If not given, this defaults to /usr/share/dict/words.', required = False, default = defaultdict) | |
args = parser.parse_args() | |
DEBUG = args.debug | |
infile = args.f | |
# Open a logfile to write to | |
logf = open('debuglog.txt', 'w') | |
# Logging header | |
dprint('=====================================================================', 0) | |
dprint('Nagaram.py - Dictionary anagram parser.', 0) | |
dprint('\nWritten by Jack Nelson \[email protected]\n(310) 863-9873\nJune 2015', 0) | |
dprint('=====================================================================', 0) | |
dprint('Parameters: debug = %d, dictionary = %s' %(DEBUG, infile),1) | |
# Read the contents of the dictionary file into the list "dictionary" | |
wordf = open(infile, 'r') | |
outf = open('nagaram_test.txt', 'w') | |
dprint('Writing the entire dictionary into a list', 1) | |
dictionary = list(wordf) | |
wordf.close() | |
# Scan the entire dictionary and throw out words with less than four characters | |
anagramlist = [] # Create an empty list to hold the anagrams | |
dictlen = len(dictionary) | |
# Step through the remaining words in the dictionary and search for anagrams | |
dprint('Searching for anagrams and writing them to the list', 1) | |
for w in range(dictlen): | |
word = dictionary[w] | |
word = word.strip('\n') | |
# Check if the word is possessive case (ends with an "'s") and therefore | |
# generates duplicates. | |
if '\'s' in word: | |
continue | |
word = word.split('\'')[0] | |
dprint(('\nReading in word \''+ word+ '\''), 4) | |
if word.istitle() == True | word.isupper(): | |
dprint('Removing \'%s\' because it is a title or an acronym' %word, 4) | |
continue | |
if len(word) >= 4: | |
# Get the hash value of the word | |
hashval = hash_eval(word) | |
# Sort and insert the word into the anagram list using the binary sorted insert function | |
anagramlist = sorted_insert([hashval, word], anagramlist) | |
write_list(anagramlist) | |
dprint('Wrote list of anagrams to %s' %OUTFILE, 1) | |
dprint('nagaram.py is done.', 0) | |
print('Output is in the file "nagaram.out"') | |
finishtime = time.time() | |
elapsedtime = finishtime - starttime | |
print "Time elapsed: %ds\n" %elapsedtime | |
logf.write('\n Total time elapsed: %fs' %elapsedtime) | |
logf.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment