Last active
June 25, 2024 19:58
-
-
Save michaelchadwick/855fe8e119a7def4a92c2c5c59f01b12 to your computer and use it in GitHub Desktop.
Filter a corpus of words to remove certain ones
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Filter corpus of words to remove certain ones
Current filters:
* Vulgarity (offensive words)
* Esoteria (uncommon words)
* Anagrams (re-orderings of existing words)
* By default, this is only used for pangrams (9-letter words)
Future filters:
* Plurals?
"""
import sys
import json

# Module-level state shared between load_files() and main().
output_filename = None  # derived from the CLI filename in load_files()
input_json_str = None   # NOTE(review): unused — load_files() reads into a local instead
output_json_str = None  # NOTE(review): never assigned anywhere in this file
words = {}              # category key (word length as str) -> list of words, from input JSON
def word_count(words: dict) -> int:
    """Return the number of distinct words across all categories.

    `words` maps category keys to lists of words; a word appearing in
    more than one category is counted once.
    """
    # A set already deduplicates; the old sorted() call added nothing to len().
    return len({word for word_list in words.values() for word in word_list})
def load_files():
    """Read the input JSON filename from the CLI and load it into `words`.

    Side effects: sets module globals `output_filename` (input name with
    ".json" replaced by ".filtered.json") and `words` (parsed JSON dict).
    Returns True on success, False on any error (missing argument,
    missing file, malformed JSON).
    """
    global output_filename
    global words

    if len(sys.argv) < 2:
        print("Error: No filename provided.")
        return False

    cli_filename = sys.argv[1]
    # NOTE(review): assumes the filename ends in ".json" (strips 5 chars) — confirm.
    output_filename = f"{cli_filename[:-5]}.filtered.json"

    try:
        # `with` guarantees the handle is closed; the old code leaked it.
        with open(cli_filename, 'r') as input_file:
            words = json.load(input_file)
    except FileNotFoundError:
        print(f"Error: input file not found: {cli_filename}")
        return False
    except json.JSONDecodeError as err:
        # Previously a malformed file crashed the script; fail gracefully instead.
        print(f"Error: input file is not valid JSON: {cli_filename} ({err})")
        return False

    print(f"checking _{word_count(words)}_ words from input: {cli_filename}")
    return True
# remove any words that are potentially offensive
def filter_vulgarity(words: dict):
    """Return a copy of `words` with offensive words removed.

    Offensive words are read from `vulgarity.txt` (one word per line) in
    the current working directory.  If that file is missing or empty,
    the input dict is returned unchanged.
    """
    vulgarity_file_path = 'vulgarity.txt'
    filtered_words = {}

    if not words:
        print("Error: No words to check for vulgarity")
        return filtered_words

    try:
        # BUG FIX: the old code kept the raw file text as one string, so
        # `word in vulgarity_words` was substring matching ("class" matched
        # "ass").  Build a set of whole lowercase words instead.
        with open(vulgarity_file_path, 'r') as vulgarity_file:
            vulgarity_words = {
                line.strip().lower() for line in vulgarity_file if line.strip()
            }
    except FileNotFoundError:
        print(f"Error: could not open {vulgarity_file_path}")
        vulgarity_words = set()

    if not vulgarity_words:
        # Nothing to filter against: pass the input through untouched.
        return words

    print(f"- removing vulgarity")
    for cat in words:
        for i, word in enumerate(words[cat]):
            print(f"\r-- checking words[{cat}][{i}]", end='', flush=True)
            if word.lower() not in vulgarity_words:
                filtered_words.setdefault(cat, []).append(word)
    print(f"\r", end='', flush=True)
    print(f"--- filtered _{word_count(words)}_ -> _{word_count(filtered_words)}_ vulgar word(s)")
    # print(filtered_words)
    return filtered_words
# remove any words that are too esoteric
def filter_esoteria(words: dict):
    """Keep only words that appear in a common-usage corpus (NLTK Brown)."""
    filtered_words = {}

    if not words:
        print("Error: No words to check for esoteria")
        return filtered_words

    import nltk
    # corpus types: https://www.nltk.org/nltk_data/
    from nltk.corpus import brown

    try:
        # Get the list of common words from the Brown corpus
        common_words = {w.lower() for w in brown.words()}
    except LookupError:
        # Corpus not present locally yet: download it once, then retry.
        nltk.download('brown')
        common_words = {w.lower() for w in brown.words()}

    def is_common(word: str, corpus='brown'):
        # Only the Brown corpus is supported; any other name returns None.
        if corpus == 'brown':
            return word.lower() in common_words

    print(f"- removing esoteria")
    for cat, cat_words in words.items():
        for idx, candidate in enumerate(cat_words):
            print(f"\r-- checking words[{cat}][{idx}]", end='', flush=True)
            if is_common(candidate):
                filtered_words.setdefault(cat, []).append(candidate)
    print(f"\r", end='', flush=True)
    print(f"--- filtered _{word_count(words)}_ -> _{word_count(filtered_words)}_ common word(s)")
    # print(filtered_words)
    return filtered_words
# remove any words that are anagrams of other words in input data
def filter_anagrams(words: dict, ignored_cats: list = None):
    """Drop words that are anagrams of another word in the same category.

    `ignored_cats` lists category keys (as ints) to pass through
    unchanged; per the module docstring, anagram filtering is normally
    applied only to the pangram (9-letter) category.

    BUG FIX: previously, when `ignored_cats` was None/empty every
    category was skipped entirely and silently dropped from the result;
    now every category is anagram-filtered in that case.
    """
    filtered_words = {}
    if not words:
        print("Error: No words to check for anagrams")
        return filtered_words

    print(f"- removing anagrams")
    for cat in words:
        if ignored_cats and int(cat) in ignored_cats:
            # Category exempt from anagram filtering: copy verbatim.
            filtered_words[cat] = words[cat]
            continue
        # Count each case-insensitive letter signature once — O(n) per
        # category instead of the old O(n^2) pairwise sort-and-compare.
        signature_counts = {}
        for word in words[cat]:
            sig = ''.join(sorted(word.lower()))
            signature_counts[sig] = signature_counts.get(sig, 0) + 1
        for i, word in enumerate(words[cat]):
            print(f"\r-- words[{cat}][{i}]", end='', flush=True)
            # Keep a word only when no other word shares its signature.
            if signature_counts[''.join(sorted(word.lower()))] == 1:
                filtered_words.setdefault(cat, []).append(word)
    print(f"\r", end='', flush=True)
    print(f"--- filtered _{word_count(words)}_ -> _{word_count(filtered_words)}_ unique word(s)")
    # print(filtered_words)
    return filtered_words
def main():
    """Load the input corpus, run all filters, and write the filtered JSON."""
    global output_filename

    if not load_files():
        sys.exit(1)

    # Filter order: vulgarity -> esoteria -> anagrams.  Categories 3-8 are
    # exempt from anagram filtering (only pangrams are de-anagrammed).
    filtered_words = filter_anagrams(
        filter_esoteria(filter_vulgarity(words)),
        [3, 4, 5, 6, 7, 8]
    )

    print("----------------------------------------------------------")
    print(f"FINAL WORD COUNT: {word_count(filtered_words)}")
    print("----------------------------------------------------------")
    print('')

    try:
        # `with` closes the file even on a failed write; the old code leaked
        # the handle and had a dead `else` (open() never returns a falsy
        # handle — it raises on failure).
        with open(output_filename, 'w') as output_file:
            output_file.write(json.dumps(filtered_words))
        print(f"-> output json written: {output_filename}")
    except OSError:
        print("Error: Could not write to output file.")
        sys.exit(1)
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment