Created
April 26, 2017 14:12
-
-
Save jan-g/51d46f470c639399ab3f42789f5dbae6 to your computer and use it in GitHub Desktop.
Anagrams
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import collections | |
import argparse | |
import itertools | |
def load_ngraphs(fn, n):
    """Load relative n-graph frequencies from a word-list corpus.

    Each non-blank line of the file is treated as one word. The word is
    stripped of surrounding whitespace and bracketed with ``$`` markers
    (``apple`` -> ``$apple$``), so that sequences at the start and end of
    a word are counted separately from those in the middle. Every window
    of length ``n`` over the padded word is tallied, e.g. for ``n == 2``:
    ``$a ap pp pl le e$``.

    Args:
        fn: Path of the corpus file, one word per line.
        n: Length of the n-graphs to count.

    Returns:
        A dict mapping each n-graph seen to its share of all n-graphs
        counted. The dict is empty when nothing was counted.
    """
    counts = collections.Counter()
    with open(fn) as f:
        for line in f:
            word = line.strip()
            if not word:
                # A blank line is not a word: padding it would count a
                # bogus '$$' n-graph and dilute every real frequency.
                continue
            padded = '$' + word + '$'
            counts.update(padded[i:i + n] for i in range(len(padded) - n + 1))
    total = sum(counts.values())
    return {ngraph: count / total for ngraph, count in counts.items()}
def score(freqs, word, n):
    """Work out the probabilistic score for one candidate word.

    The word is bracketed with ``$`` markers and split into overlapping
    windows of length ``n``; the score is the product of the frequency of
    each window as recorded in ``freqs``.

    Args:
        freqs: Mapping of n-graph -> relative frequency.
        word: Candidate word to score (without ``$`` padding).
        n: Length of the n-graphs to look up.

    Returns:
        The product of the window frequencies. A window missing from
        ``freqs`` contributes 0, so the whole score becomes 0. If the
        padded word is shorter than ``n`` there are no windows and the
        result is 1.
    """
    padded = '$' + word + '$'
    result = 1
    for start in range(len(padded) - n + 1):
        window = padded[start:start + n]
        result *= freqs.get(window, 0)
    return result
def hunt(freqs, word, n):
    """Score every distinct rearrangement of the letters of ``word``.

    All permutations of ``word`` are generated and joined back into
    strings; duplicates (which arise when letters repeat) are collapsed
    by collecting them in a set before scoring.

    Args:
        freqs: Mapping of n-graph -> relative frequency, passed to ``score``.
        word: The letters to rearrange.
        n: Length of the n-graphs, passed to ``score``.

    Returns:
        A dict mapping each distinct rearrangement to its score.
    """
    candidates = {''.join(letters) for letters in itertools.permutations(word)}
    scored = {}
    for candidate in candidates:
        scored[candidate] = score(freqs, candidate, n)
    return scored
def main():
    """Command-line entry point: rank the rearrangements of a word.

    Parses the arguments (``--dict`` corpus path, ``-n`` n-graph length
    and the positional word), loads the n-graph frequencies, scores every
    rearrangement of the word and prints up to 100 of the best ones.
    Each printed line holds the rank (from 0), the score divided by the
    highest score found, and the rearrangement itself. Rearrangements
    whose score is 0 are not printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--dict', default='source.txt', help='word list source')
    parser.add_argument('-n', default=2, type=int, help='length of n-gram to use')
    parser.add_argument('anag', help='word to look for an anagram for')
    args = parser.parse_args()

    freqs = load_ngraphs(args.dict, args.n)
    results = hunt(freqs, args.anag, args.n)

    # Normalise against the best raw score so the top entry prints as 1.0.
    best = max(results.values())
    ranked = []
    for candidate, raw in results.items():
        if raw > 0:
            ranked.append((raw / best, candidate))
    ranked.sort(reverse=True)

    for rank, (relative, candidate) in enumerate(ranked[:100]):
        print(rank, relative, candidate)
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment