Created
October 3, 2018 01:18
-
-
Save cincodenada/e4d685451eb1bbd124f6eccf08bdd6ec to your computer and use it in GitHub Desktop.
A quick script to find words that are subsets of other words. Designed to find country names that contain other countries, but should be generally applicable.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fileinput
import re
from collections import Counter, defaultdict
class Word:
    """A word together with the letter counts used for subset comparisons.

    Only the ASCII letters A-Z are considered: the word is upper-cased and
    every other character (spaces, punctuation, digits, accented letters) is
    discarded before counting.

    Attributes:
        word: The original text, exactly as passed in.
        allcaps: The upper-cased word with everything except A-Z removed.
        counts: Mapping of lowercase letter -> number of occurrences.
    """

    def __init__(self, word):
        self.word = word
        self.allcaps = re.sub(r"[^A-Z]", "", word.upper())
        self.counts = Counter(self.allcaps.lower())
        # The signature is computed eagerly: it is needed for every word and
        # is cheap, so lazy hasattr-based caching buys nothing.
        self._countified = "".join(
            "{}{}".format(letter, count)
            for letter, count in sorted(self.counts.items())
        )

    def countified(self):
        """Return the letter signature, e.g. ``"a1e1g1i2n1r1"`` for Nigeria.

        The signature lists each distinct letter in alphabetical order
        followed by its count, so two words share a signature exactly when
        they use the same letters the same number of times.
        """
        return self._countified

    def contains(self, other):
        """Tell whether the letters of *other* are a sub-multiset of ours.

        Returns:
            None if *other* is this very object (a word is not reported as
            containing itself), True if every letter of *other* occurs at
            least as often in this word, False otherwise. A word with no
            letters is contained in every other word.
        """
        if other is self:
            return None
        return all(
            self.counts.get(letter, 0) >= needed
            for letter, needed in other.counts.items()
        )

    def __str__(self):
        return self.word
def _read_words(lines):
    """Group the input lines into Word objects keyed by letter signature.

    Words that are anagrams of each other share a signature and end up in
    the same list. Lines containing no A-Z letters (blank lines, for
    instance) are skipped: an empty word would otherwise count as
    "contained" in every other word and pollute the whole report.
    """
    groups = defaultdict(list)
    for line in lines:
        # Strip the line terminator only; "\r" is included so CRLF input
        # does not leave a carriage return inside the printed word.
        word = Word(line.rstrip("\r\n"))
        if not word.allcaps:
            continue
        groups[word.countified()].append(word)
    return groups


def _find_subwords(groups):
    """Compare every signature group against every other one.

    Only the first word of each group is compared, since all members of a
    group have identical letter counts.

    Returns:
        A pair ``(subwords, included_count)`` where ``subwords`` maps a
        container's signature to the list of Words it contains, and
        ``included_count`` maps a contained word's text to the number of
        containers it appears in.
    """
    subwords = defaultdict(list)
    included_count = defaultdict(int)
    representatives = [group[0] for group in groups.values()]
    for container in representatives:
        for candidate in representatives:
            # contains() yields None for the word itself, which is falsy.
            if container.contains(candidate):
                subwords[container.countified()].append(candidate)
                included_count[candidate.word] += 1
    return subwords, included_count


def _print_report(groups, subwords, included_count):
    """Print the BBCode spoiler blocks followed by the inclusion tally."""
    # reversed(sorted(...)) is kept on purpose: sorted(..., reverse=True)
    # would order containers with equal counts differently.
    containers = reversed(sorted(subwords, key=lambda k: len(subwords[k])))
    for key in containers:
        print("[spoiler={} ({})]".format(
            ",".join(w.word for w in groups[key]),
            len(subwords[key]),
        ))
        print("\n".join(w.word for w in subwords[key]))
        # No trailing newline after the closing tag, so the next line of
        # output follows it directly (presumably intended for the forum
        # markup this is pasted into -- confirm before changing).
        print("[/spoiler]", end="")

    # Least-included words first.
    for name in sorted(included_count, key=lambda k: included_count[k]):
        print("{} ({})".format(name, included_count[name]))


def main():
    """Read words (one per line) from the files in argv or stdin and report.

    For each word, lists the other words whose letters are a subset of its
    own, then prints how many containers each contained word appears in.
    """
    # The context manager closes the input stream once reading is done.
    with fileinput.input() as lines:
        groups = _read_words(lines)
    subwords, included_count = _find_subwords(groups)
    _print_report(groups, subwords, included_count)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment