Last active
August 29, 2015 14:05
-
-
Save MatMoore/dfe96c35b977e108fe40 to your computer and use it in GitHub Desktop.
West London Hack Night 14/08/14 - Soundex Algorithm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import soundex | |
import random | |
import pickle | |
# Maps a Soundex key to the list of dictionary words that share that key.
soundalikes = {}


def load_dict():
    """Fill the module-level ``soundalikes`` table from the system word list.

    Reads '/usr/share/dict/words', splits every line on whitespace and
    appends each word to the list stored under its Soundex key, creating
    the list the first time a key is seen.
    """
    with open('/usr/share/dict/words') as wordlist:
        for entry in wordlist:
            for token in entry.split():
                bucket = soundalikes.setdefault(soundex.soundex(token), [])
                bucket.append(token)
def process_text(line, min_length=5):
    """Yield the words of *line*, swapping longer ones for a random soundalike.

    The line is split on whitespace.  A word shorter than *min_length*
    characters is yielded unchanged.  Any other word is looked up in the
    module-level ``soundalikes`` table by its Soundex key and a randomly
    chosen entry from the matching list is yielded; a word whose key is
    not in the table is yielded unchanged.

    *min_length* defaults to 5, the threshold that was previously
    hard-coded, so existing callers see the same behaviour.
    """
    for word in line.split():
        if len(word) < min_length:
            yield word
            continue
        key = soundex.soundex(word)
        # Fall back to a one-element list so unknown keys pass through.
        yield random.choice(soundalikes.get(key, [word]))
if __name__ == '__main__':
    # 'dict' (in the current directory) caches the soundalikes table as a
    # pickle so the word list only has to be indexed once.
    # NOTE(review): pickle.load executes whatever the file contains; this
    # is only safe while 'dict' is a file this script wrote itself.
    try:
        # Pickles are byte streams: open in binary mode so the cache works
        # on Python 3 and is not newline-translated on Windows.
        with open('dict', 'rb') as infile:
            soundalikes = pickle.load(infile)
    except IOError:
        # No readable cache yet: build the table and save it for next time.
        load_dict()
        with open('dict', 'wb') as outfile:
            pickle.dump(soundalikes, outfile)
    # Rewrite standard input line by line; words are re-joined with single
    # spaces.  A single parenthesised argument prints the same on Python 2
    # and Python 3.
    for line in sys.stdin:
        print(' '.join(process_text(line)))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Load up on gynoecia bromacetanilide your frantically | |
It's fun to lose and to pratingly | |
Sosia over breth and self assortedness | |
Oh no, I know a derride word | |
hoyle hulu hello how low? [x3] | |
hele Halawi helluo | |
With the liquidogenic out, it's less dangerless | |
Here we are now, entermete us | |
I feel stafette and conducive | |
Here we are now, endermically us | |
A mullid | |
An albumosuria | |
A megawatt | |
My libbet | |
Yay! [x3] | |
I'm warehouse at what I do best | |
And for this gift I feel ballstock | |
Our lately garava has Alaki been | |
And Allasch will undelivery the end | |
haole hale hale how low? [x3] | |
hill hala hollo | |
With the leucotic out, it's less dunger | |
Here we are now, enterable us | |
I feel stuffed and candescence | |
Here we are now, enterochirurgia us | |
A militia | |
An Alphonso | |
A muscot | |
My lupoid | |
Yay! [x3] | |
And I forestem just why I touchwood | |
Oh yoe I gecko it Maycock me smally | |
I font it Horatio it's hard to find | |
Oh whilly wetbird nabber mind | |
Hal hewel hala how low? [x3] | |
Holly howel hala | |
With the liquidizer out, it's less denigration | |
Here we are now, endearing us | |
I feel stapedial and contise | |
Here we are now, enteradenography us | |
A moloid | |
An albinistic | |
A muckweed | |
My loppet | |
A Daniele !! [x9] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import string | |
import sys | |
# Anything that is not a lowercase ASCII letter.
nonword = re.compile(r'[^a-z]+')
# A digit followed by one or more copies of itself, each copy optionally
# preceded by a single 'h' or 'w'.
repetitions = re.compile(r'([0-9])([hw]?\1)+')
# Letters that carry no digit code and are dropped after the first letter.
cruft = re.compile(r'[hwaeiouy]+')
# string.maketrans only exists on Python 2; str.maketrans only on Python 3.
try:
    _maketrans = str.maketrans
except AttributeError:
    _maketrans = string.maketrans
# Translation table from consonants to their Soundex digit.
numbers = _maketrans('bfpvcgjkqsxzdtlmnr', '111122222222334556')


def soundex(word):
    """Return the four-character Soundex code for *word*.

    The word is lowercased and every character outside a-z is removed.
    If nothing is left, the empty string is returned.  Otherwise the code
    is the first remaining letter (in lowercase) followed by the digits of
    the later consonants, padded with '0' or truncated to exactly four
    characters.  Runs of the same digit, including runs interrupted only
    by 'h' or 'w', count once.
    """
    letters = nonword.sub('', word.lower())
    if not letters:
        return letters
    numbered = letters.translate(numbers)
    # Collapse repeats before dropping the first character, so a second
    # letter with the same code as the first is absorbed as well.
    norepetitions = repetitions.sub(r'\1', numbered)
    nocruft = cruft.sub('', norepetitions[1:])
    return (letters[0] + nocruft).ljust(4, '0')[:4]
if __name__ == '__main__':
    # Print the list of Soundex codes for the command-line arguments.  A
    # single parenthesised argument prints the same on Python 2 and 3.
    print([soundex(w) for w in sys.argv[1:]])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment