Created
June 22, 2020 14:58
-
-
Save rtpg/8dabf0ceb0fc807ae80510a7989ef709 to your computer and use it in GitHub Desktop.
Find the biggest set of anagrams!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
An implementation of the Fermat's library anagram detection | |
https://twitter.com/fermatslibrary/status/1275066521450975234 | |
Takes a list of line seperated words and uses prime factors to | |
encode letters and identify the equivalency classes. | |
Prints out the top 10 anagram classes for the provided word list | |
""" | |
from collections import defaultdict, Counter | |
import sys | |
primes = [ | |
2, | |
3, | |
5, | |
7, | |
11, | |
13, | |
17, | |
19, | |
23, | |
29, | |
31, | |
37, | |
41, | |
43, | |
47, | |
53, | |
59, | |
61, | |
67, | |
71, | |
73, | |
79, | |
83, | |
89, | |
97, | |
101, | |
] | |
letters = "abcdefghijklmnopqrstuvwxyz" | |
letters_to_numbers = { | |
letters[i]: primes[i] for i in range(26) | |
} | |
numbers_to_letters = { | |
primes[i]: letters[i] for i in range(26) | |
} | |
def score(word): | |
result = 1 | |
for c in word: | |
c = c.lower() # lowercase | |
# ignore unrecognized symbols | |
if c not in letters_to_numbers: | |
continue | |
result *= letters_to_numbers[c] | |
return result | |
# there were ~100k words in the dictionary | |
# counting the occurences | |
c = Counter() | |
# actually storing the results | |
full_dict = defaultdict(list) | |
def disqualified(line): | |
# remove lines that are boring | |
# 's? really? | |
if line.strip()[-2:] == "'s": | |
return True | |
# proper nouns, no thank you | |
if line[0].lower() != line[0]: | |
return True | |
# no n' and t' (or s') (or l'), for french | |
if line.strip()[:2] in {"n'", "t'", "l'", "s'"}: | |
return True | |
return False | |
# a way to generate a word list through aspell (here for french dictionary) | |
# aspell -d fr dump master | aspell -l fr expand > fr.dict | |
with open(sys.argv[1], 'r') as word_list: | |
# keep count to know the speed | |
for line in word_list: | |
if disqualified(line): | |
continue | |
s = score(line) | |
c[s] += 1 | |
full_dict[s].append(line) | |
c.most_common(10) | |
def factors(n): | |
# actually give letters, not the numbers | |
results = defaultdict(int) | |
current_factor = 2 | |
while n > 1: | |
if n % current_factor == 0: | |
n /= current_factor | |
results[current_factor] += 1 | |
else: | |
current_factor += 1 | |
# map to the letters | |
return { | |
numbers_to_letters[k]: v for k, v in results.items() | |
} | |
# show 10 most common with the letters involved | |
for factor_class, result_count in c.most_common(10): | |
components = factors(factor_class) | |
components_pretty = "" | |
for letter, count in components.items(): | |
components_pretty += letter.upper() * count | |
print(components_pretty, " has ", result_count, " results") | |
for w in full_dict[factor_class]: | |
print(f" - {w.strip()}") | |
print("================") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment