Skip to content

Instantly share code, notes, and snippets.

@jwnelson
Last active August 29, 2015 14:23
Show Gist options
  • Save jwnelson/490d178b77cbd5dbf4cd to your computer and use it in GitHub Desktop.
Save jwnelson/490d178b77cbd5dbf4cd to your computer and use it in GitHub Desktop.
Searches for anagrams of words in a dictionary with at least four letters and at least as many anagrams as letters.
#!/usr/bin/python
import math
import sys
import argparse
import time
# Nagaram.py
#
# OVERVIEW
# Searches a dictionary of common English words for anagrams with at least
# four letters and as many anagrams as there are letters in the word.
#
# USAGE
# Running this script with no options will use default settings for the dictionary file
# to use (/usr/share/dict/words) and debugging verbosity (1).
# Run "$python nagaram.py -h" for option details.
#
# ALGORITHM SUMMARY
# A hash value (of sorts) is computed for each word in the dictionary by
# summing together the cube of the ASCII values of the characters in the word so that
# only words with the exact same letters in them (anagrams) will have the
# same hash value. A binary search algorithm is then used to place each word
# into a 2D list with rows that hold all the words with the same hash value.
# At the end, the anagram array will look like this...
#
# 0 1 2 3
# 0 |<hashval 0> | <anagram 01> | <anagram 02> | <anagram 03> . . .
# - |------------|--------------|--------------|------------
# 1 |<hashval 1> | <anagram 11> | <anagram 12> |<anagram 13> . . .
# - |------------|--------------|--------------|------------
# 2 |<hashval 2> | <anagram 21> | <anagram 22> |<anagram 23> . . .
# - |------------|--------------|--------------|------------
# . . . .
# . . . .
# . . . .
#
# The anagrams are read from the rows.
# Evaluates the hash value of a word from its letters
# Calculates the sum of cube of the ASCII values of the word's letters so that
# every anagram has a distinct hash value.
def hash_eval(word):
hashval = 0
for n in range(len(word)):
c = word[n]
hashval += pow(ord(c), 3)
dprint('"%s" hashval = %d' %(word, hashval), 3)
return hashval
# Uses a binary search (implemented in the function binary_insert()) to insert the word
# into numerical order by the word's hash value. If an anagram of the word is found, that is
# if other words with the same hash values are found, the word is appended to the end of the
# row for that hash value.
# Takes a tuple containing a hash value and a word, and a 2D list (insertlist) containing hash values
# in column 0 sortedi n numerical order, and anagrams in the rest of the columns.
# Returns insertlist with wordtuple inserted into the list by its hashvalue
def sorted_insert(wordtuple, insertlist):
dprint('Searching list for anagrams of \'%s\'' %wordtuple[1], 3)
# Account for an empty list
if not insertlist:
dprint('The list is empty. Inserting %s at element 0' %wordtuple[1], 3)
insertlist = [wordtuple]
return insertlist
else:
inlen = len(insertlist)
lowvalindex = 0
highvalindex = len(insertlist)-1
# Find the column of the array
insertElement, append = binary_insert(wordtuple[0], lowvalindex, highvalindex, insertlist)
if append == 1: # An existing anagram has been found. Add it to the column
dprint('Adding %s to element %d' %(wordtuple[1], insertElement), 3)
insertlist[insertElement].append(wordtuple[1])
elif append == 0: # No anagram found - insert the word before the given index
dprint('Inserting %s before element %d' %(wordtuple[1], insertElement), 3)
insertlist.insert((insertElement), wordtuple)
return insertlist
# Binary search function to find the element in the anagram list that has the given hash value
def binary_insert(hashval, lowvalindex, highvalindex, insertlist):
dprint('lowval index = %d\thighval index = %d' %(lowvalindex, highvalindex), 4)
if DEBUG > 3:
count = -1
for element in insertlist:
count+=1
dprint('\t%d\t%s\t%d' %(count, element[1], element[0]), 4)
# Calculate the "pivot" midpoint index in the list
pivot = (highvalindex - lowvalindex)/2 + lowvalindex
#pivot = int(math.ceil(len(insertlist)/2)) + lowvalindex - 1
dprint('pivot index = %s' %pivot, 4)
#pivot = insertlist[int(math.floor(lowval + highval/2))][0]
# This is a "new" anagram. Place it before or after the current
# pivot depending on the given hashval
if highvalindex == lowvalindex:
if hashval > insertlist[highvalindex][0]:
return highvalindex + 1, 0
if hashval < insertlist[highvalindex][0]:
return highvalindex, 0
# The hashval is in the first half
if hashval < insertlist[pivot][0]:
return binary_insert(hashval, lowvalindex, pivot, insertlist)
# The hashval is in the second half
if hashval > insertlist[pivot][0]:
return binary_insert(hashval, pivot + 1, highvalindex, insertlist)
# Found the hashval
if hashval == insertlist[pivot][0]:
return pivot, 1
# Removes anagram groups with fewer anagrams than letters and then writes the
# remaining anagrams to the output file "nagaram.out"
def write_list(inlist):
outf = open(OUTFILE, 'w')
listlen = len(inlist)
# Remove the anagrams for which there are fewer anagrams than there are
# letters in the anagrams
global anagramlist
element = 0
dprint('Removing anagrams with fewer anagrams than there are letters and writing the remainder to nagaram.out', 1)
while element < listlen:
anagram = inlist[element]
wordlen = len(anagram[1])
if len(anagram) - 1 < wordlen:
dprint('Removing the following anagrams: ', 3)
dprint(anagram[1:], 3)
inlist.pop(element)
else:
anagrams = len(inlist[element])
for m in range(1, anagrams - 1):
if anagrams >= len(anagram) - 1:
outf.write(str(inlist[element][m]))
outf.write(', ')
if anagrams >= len(anagram) - 1:
outf.write(inlist[element][anagrams - 1])
outf.write('\n')
element += 1
listlen = len(inlist) - 1
outf.close()
return
# Simple debugging printout/logfile handler that accounts for debugging verbosity
# passed in as a parameter.
def dprint(message, debugflag = 0):
timestamp = time.time() - starttime
if DEBUG >= debugflag:
print message
global logf
logf.write('\n')
logf.write('%f\t%s' %(timestamp, message))
return
### MAIN ###
starttime = time.time()
defaultdict = '/usr/share/dict/words'
OUTFILE = 'nagaram.out'
# Setup command line argument parsing using argparse
parser = argparse.ArgumentParser(description = 'Find all the anagrams in a given dictionary with at least four letters and at least as many anagrams as there are letters')
parser.add_argument('-d', '--debug', action = 'store', type = int, help = 'Enable different levels of program print statements for debugging. 0 = debugging is off. The depth and detail of the debugging statements printed to the terminal and written to the debug log increases with higher numbers. Defaults to 1 = minimal debugging printouts.', required = False, default = 1)
parser.add_argument('-f', action = 'store', help = 'The file containing the dictionary to look for anagrams in. If not given, this defaults to /usr/share/dict/words.', required = False, default = defaultdict)
args = parser.parse_args()
DEBUG = args.debug
infile = args.f
# Open a logfile to write to
logf = open('debuglog.txt', 'w')
# Logging header
dprint('=====================================================================', 0)
dprint('Nagaram.py - Dictionary anagram parser.', 0)
dprint('\nWritten by Jack Nelson \[email protected]\n(310) 863-9873\nJune 2015', 0)
dprint('=====================================================================', 0)
dprint('Parameters: debug = %d, dictionary = %s' %(DEBUG, infile),1)
# Read the contents of the dictionary file into the list "dictionary"
wordf = open(infile, 'r')
outf = open('nagaram_test.txt', 'w')
dprint('Writing the entire dictionary into a list', 1)
dictionary = list(wordf)
wordf.close()
# Scan the entire dictionary and throw out words with less than four characters
anagramlist = [] # Create an empty list to hold the anagrams
dictlen = len(dictionary)
# Step through the remaining words in the dictionary and search for anagrams
dprint('Searching for anagrams and writing them to the list', 1)
for w in range(dictlen):
word = dictionary[w]
word = word.strip('\n')
# Check if the word is possessive case (ends with an "'s") and therefore
# generates duplicates.
if '\'s' in word:
continue
word = word.split('\'')[0]
dprint(('\nReading in word \''+ word+ '\''), 4)
if word.istitle() == True | word.isupper():
dprint('Removing \'%s\' because it is a title or an acronym' %word, 4)
continue
if len(word) >= 4:
# Get the hash value of the word
hashval = hash_eval(word)
# Sort and insert the word into the anagram list using the binary sorted insert function
anagramlist = sorted_insert([hashval, word], anagramlist)
write_list(anagramlist)
dprint('Wrote list of anagrams to %s' %OUTFILE, 1)
dprint('nagaram.py is done.', 0)
print('Output is in the file "nagaram.out"')
finishtime = time.time()
elapsedtime = finishtime - starttime
print "Time elapsed: %ds\n" %elapsedtime
logf.write('\n Total time elapsed: %fs' %elapsedtime)
logf.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment