Created
March 18, 2021 03:40
-
-
Save sjolsen/b50d2a2044c5b958f72c0529a0ff9893 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
from typing import FrozenSet, Iterable, Iterator, List, NamedTuple, NewType, Sequence, Set | |
from frozendict import frozendict | |
from multiset import FrozenMultiset | |
# Text munging | |
def normalize(s: str) -> str:
    """Lowercase *s* and drop every non-alphanumeric character."""
    kept = [ch.lower() for ch in s if ch.isalnum()]
    return ''.join(kept)
# Core algorithms | |
# A multiset of names still available to spell words from.
NamePool = NewType('NamePool', FrozenMultiset)


class WordConstruction(NamedTuple):
    """A word paired with the multiset of names consumed to spell it."""
    word: str        # the word as spelled from name prefixes (original casing)
    names: NamePool  # names used up by this construction
def derive_constructions(word: str, names: NamePool) -> Iterator[WordConstruction]:
    """Yield every distinct way to spell *word* by concatenating prefixes
    of the names in *names*, each name contributing at most one prefix.

    *word* is expected to be normalized (lowercase alphanumeric); prefixes
    are matched against it through normalize().
    """
    if not word:
        # Base case: the empty word needs no names at all.
        yield WordConstruction('', NamePool(FrozenMultiset()))
        return
    seen: Set[WordConstruction] = set()
    for name in names.distinct_elements():
        for i in range(len(name)):
            prefix = name[0:1 + i]
            normalized = normalize(prefix)
            if not word.startswith(normalized):
                # Growing the prefix only appends normalized characters,
                # so no longer prefix of this name can match either.
                break
            # Fix: consume exactly as many characters of `word` as the
            # *normalized* prefix matched.  The original sliced by
            # len(prefix), which drops extra characters whenever a name
            # contains characters stripped by normalize() (e.g. the
            # apostrophe in "O'Brien").
            suffix = word[len(normalized):]
            sub_names = names - NamePool(FrozenMultiset([name]))
            for sub_construction in derive_constructions(suffix, sub_names):
                next_item = WordConstruction(
                    word=prefix + sub_construction.word,
                    names=NamePool(FrozenMultiset([name])) + sub_construction.names)
                # Different name orderings can rebuild the same
                # construction; emit each one once.
                if next_item not in seen:
                    yield next_item
                    seen.add(next_item)
# Immutable mapping from word -> frozenset of its WordConstructions.
ConstructionMap = NewType('ConstructionMap', frozendict)


def build_construction_map(words: Iterable[str], names: NamePool) -> ConstructionMap:
    """Map each spellable word to the frozen set of its constructions.

    Words with no construction from *names* are omitted entirely.
    """
    spellable = {}
    for candidate in words:
        options = frozenset(derive_constructions(candidate, names))
        if options:
            spellable[candidate] = options
    return ConstructionMap(frozendict(spellable))
# A multiset of words forming one complete anagram of the name pool.
WordList = NewType('WordList', FrozenMultiset)


def generate_wordlists(cmap: ConstructionMap, names: NamePool) -> Iterator[WordList]:
    """Yield every distinct multiset of words that exactly exhausts *names*."""
    if not names:
        # All names consumed: the empty word list completes a solution.
        yield WordList(FrozenMultiset())
        return
    emitted: Set[WordList] = set()
    for word, constructions in sorted(cmap.items()):
        for construction in constructions:
            # Skip constructions needing names no longer in the pool.
            if not (construction.names <= names):
                continue
            remaining = names - construction.names
            for tail in generate_wordlists(cmap, remaining):
                candidate = WordList(FrozenMultiset([construction.word])) + tail
                # Multisets make word order irrelevant; dedupe here.
                if candidate not in emitted:
                    yield candidate
                    emitted.add(candidate)
# Main logic | |
def load_dictionary() -> Set[str]:
    """Read the system word list (one word per line) into a set."""
    with open('/usr/share/dict/words', 'rt') as f:
        return {line.strip() for line in f}
# Space-separated words manually excluded from results (names, proper
# nouns, and unwanted dictionary entries); expanded with possessive and
# plural suffixes in normalized_dictionary().
HARDCODED_DENYLIST = 'acs ada adam adams adas ah ahmad ahmads ams asama asamas assad assam ca cad cads cas christa christas christmas christs dacha dachas davis daviss ha haas haass hale haled hales halest halss hm ma masada masadas mashs saar saars sac sachs sarah sarahs sasha scad scads sh shad shads shasta stu stus'
def normalized_dictionary(names: Set[str]) -> FrozenSet[str]:
    """Return the normalized system dictionary minus denied words.

    Denied words are the hardcoded denylist, single letters, the input
    *names* themselves, and possessive/plural variants of all of those.
    """
    words = load_dictionary()
    denied = set(HARDCODED_DENYLIST.split(' '))
    # Single letters
    denied.update(w for w in words if len(w) == 1)
    # The input names themselves
    denied.update(names)
    # Possessives and plurals of everything denied so far
    denied |= {base + ending
               for base in denied
               for ending in ("'s", 's', 'es', 'ses')}
    return frozenset(map(normalize, words)) - frozenset(map(normalize, denied))
def main(argv: Sequence[str]):
    """Entry point: argv[1:] are the names to anagram into word lists."""
    names = NamePool(FrozenMultiset(argv[1:]))
    dictionary = normalized_dictionary(set(names))
    cmap = build_construction_map(dictionary, names)
    used: Set[str] = set()
    for wordlist in generate_wordlists(cmap, names):
        print(' '.join(wordlist))
        used.update(normalize(w) for w in wordlist)
    print('=== Unique words ===')
    print(' '.join(sorted(used)))


if __name__ == '__main__':
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment