Skip to content

Instantly share code, notes, and snippets.

@simoncozens
Last active April 26, 2023 07:03
Show Gist options
  • Save simoncozens/95b1a6a8f4065f099ae6ae30923afee4 to your computer and use it in GitHub Desktop.
Save simoncozens/95b1a6a8f4065f099ae6ae30923afee4 to your computer and use it in GitHub Desktop.
import sys
import os
DEFAULT_NGRAM_SIZE = 5  # n-gram length used when a caller does not pass ``size``


def all_ngrams(word, size=None):
    """Yield every contiguous substring of ``word`` that is ``size`` long.

    Args:
        word: The string to slice into n-grams.
        size: Length of each n-gram. ``None`` (the default) means
            ``DEFAULT_NGRAM_SIZE``, looked up at call time.

    Yields:
        Each window ``word[i:i + size]`` from left to right. A word no longer
        than ``size`` yields exactly one item: the whole word.
    """
    if size is None:
        size = DEFAULT_NGRAM_SIZE
    # A word of length n has n - size + 1 windows (the "+ 1" covers the final
    # one); ``max`` keeps a single short window for words shorter than ``size``.
    window_count = max(1, len(word) - size + 1)
    for start in range(window_count):
        yield word[start : start + size]
class SubstringSet(set):
    """A set of words that only accepts words contributing a new n-gram.

    Every n-gram (as produced by ``all_ngrams``) of each accepted word is
    recorded in ``ngram_set``. A later word is stored only if at least one of
    its n-grams has not been recorded yet.

    Only ``add`` applies this filter; the other mutating methods inherited
    from ``set`` are not overridden here.
    """

    def __init__(self, init=()):
        """Create the set and offer each word of ``init`` to ``add``.

        Args:
            init: Iterable of initial words, filtered in iteration order
                through ``add``. Defaults to no words.
        """
        super().__init__()
        # n-grams of every word accepted so far.
        self.ngram_set = set()
        for word in init:
            self.add(word)

    def add(self, word):
        """Store ``word`` if it brings at least one unseen n-gram.

        Unlike ``set.add``, this reports the outcome.

        Args:
            word: The candidate word.

        Returns:
            ``True`` if the word was stored, ``False`` if it was already
            present or all of its n-grams were already recorded.
        """
        if word in self:
            return False
        # Materialise once so the same n-grams serve both the check and the
        # bookkeeping below.
        ngrams = set(all_ngrams(word))
        if ngrams <= self.ngram_set:
            return False
        super().add(word)
        self.ngram_set |= ngrams
        return True
# Read every entry of the directory named by the first command-line argument,
# feed each whitespace-separated word to a SubstringSet, and print the words
# it kept, one per line (in the set's iteration order).
words = SubstringSet([])
directory = os.fsencode(sys.argv[1])
for entry in os.listdir(directory):
    filename = os.fsdecode(os.path.join(directory, entry))
    # Progress goes to stderr so stdout carries only the resulting word list.
    print("Processing "+filename, file=sys.stderr)
    # ``with`` closes each file as soon as it has been read instead of
    # leaving the handle open until garbage collection.
    with open(filename) as rows:
        for row in rows:
            for word in row.split():
                words.add(word)
print("\n".join(words))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment