Skip to content

Instantly share code, notes, and snippets.

@sjolsen
Created March 18, 2021 03:40
Show Gist options
  • Save sjolsen/b50d2a2044c5b958f72c0529a0ff9893 to your computer and use it in GitHub Desktop.
Save sjolsen/b50d2a2044c5b958f72c0529a0ff9893 to your computer and use it in GitHub Desktop.
import sys
from typing import FrozenSet, Iterable, Iterator, List, NamedTuple, NewType, Sequence, Set
from frozendict import frozendict
from multiset import FrozenMultiset
# Text munging
def normalize(s: str) -> str:
return ''.join(c.lower() for c in s if c.isalnum())
# Core algorithms
NamePool = NewType('NamePool', FrozenMultiset)
class WordConstruction(NamedTuple):
word: str
names: NamePool
def derive_constructions(word: str, names: NamePool) -> Iterator[WordConstruction]:
if not word:
yield WordConstruction('', NamePool(FrozenMultiset()))
return
seen: Set[WordConstruction] = set()
for name in names.distinct_elements():
for i in range(len(name)):
prefix = name[0:1 + i]
if not word.startswith(normalize(prefix)):
break
suffix = word[len(prefix):]
sub_names = names - NamePool(FrozenMultiset([name]))
for sub_construction in derive_constructions(suffix, sub_names):
next_item = WordConstruction(
word=prefix + sub_construction.word,
names=NamePool(FrozenMultiset([name])) + sub_construction.names)
if next_item not in seen:
yield next_item
seen.add(next_item)
ConstructionMap = NewType('ConstructionMap', frozendict)
def build_construction_map(words: Iterable[str], names: NamePool) -> ConstructionMap:
result = {}
for word in words:
constructions = frozenset(derive_constructions(word, names))
if constructions:
result[word] = constructions
return ConstructionMap(frozendict(result))
WordList = NewType('WordList', FrozenMultiset)
def generate_wordlists(cmap: ConstructionMap, names: NamePool) -> Iterator[WordList]:
if not names:
yield WordList(FrozenMultiset())
return
seen: Set[WordList] = set()
for word, constructions in sorted(cmap.items()):
for construction in constructions:
if not (construction.names <= names):
continue
sub_names = names - construction.names
for sub_wordlist in generate_wordlists(cmap, sub_names):
next_item = WordList(FrozenMultiset([construction.word])) + sub_wordlist
if next_item not in seen:
yield next_item
seen.add(next_item)
# Main logic
def load_dictionary() -> Set[str]:
result = set()
with open('/usr/share/dict/words', 'rt') as f:
for line in f:
result.add(line.strip())
return result
HARDCODED_DENYLIST = 'acs ada adam adams adas ah ahmad ahmads ams asama asamas assad assam ca cad cads cas christa christas christmas christs dacha dachas davis daviss ha haas haass hale haled hales halest halss hm ma masada masadas mashs saar saars sac sachs sarah sarahs sasha scad scads sh shad shads shasta stu stus'
def normalized_dictionary(names: Set[str]) -> FrozenSet[str]:
words = load_dictionary()
bad_words = set(HARDCODED_DENYLIST.split(' '))
for candidate in words:
# Letters
if len(candidate) == 1:
bad_words.add(candidate)
# Names
bad_words |= names
# Possessives and plurals
bad_words |= set(base + suffix for base in bad_words for suffix in ("'s", 's', 'es', 'ses'))
return frozenset(map(normalize, words)) - frozenset(map(normalize, bad_words))
def main(argv: Sequence[str]):
names = NamePool(FrozenMultiset(argv[1:]))
cmap = build_construction_map(normalized_dictionary(set(names)), names)
seen: Set[str] = set()
for wordlist in generate_wordlists(cmap, names):
print(' '.join(wordlist))
seen |= set(normalize(w) for w in wordlist)
print('=== Unique words ===')
print(' '.join(sorted(seen)))
if __name__ == '__main__':
sys.exit(main(sys.argv))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment