Last active
April 26, 2023 07:03
-
-
Save simoncozens/95b1a6a8f4065f099ae6ae30923afee4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import os | |
# Default length, in characters, of the n-grams produced by all_ngrams().
DEFAULT_NGRAM_SIZE = 5


def all_ngrams(word, size=None):
    """Yield every contiguous substring of ``word`` that is ``size`` long.

    Args:
        word: The string to slice into n-grams.
        size: Length of each n-gram. ``None`` (the default) means
            ``DEFAULT_NGRAM_SIZE``.

    Yields:
        Each window ``word[i : i + size]`` from left to right. A word that
        is shorter than ``size`` (including the empty string) yields itself
        exactly once, so every word produces at least one n-gram.
    """
    if size is None:
        size = DEFAULT_NGRAM_SIZE
    # A word of length n contains n - size + 1 full windows; the previous
    # bound of n - size silently dropped the last one.  max(1, ...) keeps a
    # single (short) slice for words that are shorter than ``size``.
    for i in range(max(1, len(word) - size + 1)):
        yield word[i : i + size]
class SubstringSet(set):
    """A set of words that only accepts words contributing a new n-gram.

    Alongside the words themselves, the instance tracks ``ngram_set``: every
    n-gram (as produced by ``all_ngrams``) of every accepted word.  A word
    whose n-grams are all already present is rejected, so the set stays a
    compact sample that still covers every n-gram seen so far.
    """

    def __init__(self, init=()):
        """Create the set, offering each word in ``init`` to ``add``.

        Args:
            init: Optional iterable of seed words.  They go through the same
                n-gram filter as later additions, in iteration order.
        """
        super().__init__()
        self.ngram_set = set()
        # Route seeds through add() so ngram_set stays consistent with the
        # stored words (previously ``init`` was silently ignored).
        for word in init:
            self.add(word)

    def add(self, word):
        """Add ``word`` if it brings at least one n-gram not yet seen.

        Args:
            word: The candidate word.

        Returns:
            ``True`` if the word was stored, ``False`` if it was already
            present or all of its n-grams were already covered.
        """
        # Cheap fast path: an exact duplicate needs no n-gram work.
        if word in self:
            return False
        # Materialise once; the n-grams are needed for both test and update.
        ngrams = set(all_ngrams(word))
        if ngrams <= self.ngram_set:
            return False
        super().add(word)
        self.ngram_set |= ngrams
        return True
# Script body: read every file in the directory named by the first
# command-line argument, feed each whitespace-separated token to the
# n-gram-filtering set, and print the surviving words one per line.
words = SubstringSet([])
directory = os.fsencode(sys.argv[1])
for file in os.listdir(directory):
    filename = os.fsdecode(os.path.join(directory, file))
    # Sub-directories and other non-regular entries cannot be read as text;
    # skip them instead of crashing part-way through the run.
    if not os.path.isfile(filename):
        continue
    # Progress goes to stderr so stdout carries only the word list.
    print("Processing "+filename, file=sys.stderr)
    # The context manager closes each file promptly instead of leaking the
    # handle until garbage collection.
    with open(filename) as rows:
        for row in rows:
            for word in row.split():
                words.add(word)
print("\n".join(words))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment