Created
October 12, 2020 06:08
-
-
Save TheCrether/b2f4d09ae90b4b8c137b1318b2dcbe90 to your computer and use it in GitHub Desktop.
REPLY 2020 Wells-Read
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from time import sleep | |
# Tokenise wells.txt: break the text on spaces and newlines, then peel any run
# of non-word characters (underscores included) off both ends of every token.
wells = []
with open("wells.txt", "r") as f:
    raw_tokens = f.read().replace("\n", " ").split(" ")

leading_junk = re.compile(r"^[\W_]+")
trailing_junk = re.compile(r"[\W_]+$")
for raw in raw_tokens:
    trimmed = trailing_junk.sub("", leading_junk.sub("", raw))
    # tokens that consisted only of punctuation/whitespace are now empty; skip them
    if trimmed.strip():
        wells.append(trimmed)
# Load the dictionary from words.txt (one entry per line), lowercased.
# The entries are kept in a set because the rest of this script only performs
# membership tests against them; a set answers each test in constant time
# instead of scanning the whole word list for every token.
with open("words.txt", "r") as f:
    words = {entry.lower() for entry in f.read().split("\n")}
# First pass: keep every non-empty token from the text whose lowercase form
# does not appear in the dictionary.
wrong1 = [
    token
    for token in (raw.strip() for raw in wells)
    if token and token.lower() not in words
]
# Second pass: a flagged entry may really be several words glued together by
# punctuation (e.g. "like--a"). Break each entry apart on runs of non-word
# characters and keep only the pieces that are still missing from the dictionary.
nonword = re.compile(r"[\W]+")
wrong2 = [
    piece
    for candidate in wrong1
    for piece in nonword.split(candidate)
    if piece.lower() not in words
]
# Write the remaining unknown tokens to wrong.txt, one per line (each token is
# followed by a newline). The output is assembled with a single join rather
# than by growing a string one concatenation at a time.
with open("wrong.txt", "w") as f:
    f.write("".join(word + "\n" for word in wrong2))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_diff(word1: str, word2: str) -> list[str]:
    """Return the characters of ``word1`` that do not match ``word2``.

    The two words are compared position by position. For every index where
    the characters differ, the character taken from ``word1`` is collected.

    Args:
        word1: The word whose differing characters are reported.
        word2: The word to compare against.

    Returns:
        The differing characters of ``word1`` in positional order. If
        ``word1`` is longer than ``word2``, the surplus characters of
        ``word1`` have no counterpart and are all reported as differences.
        Extra characters at the end of ``word2`` are ignored.
    """
    # zip stops at the shorter word, so no index can run past the end of word2
    diff = [left for left, right in zip(word1, word2) if left != right]
    # characters of word1 beyond the end of word2 cannot match anything
    diff.extend(word1[len(word2):])
    return diff
# Read the dictionary in two parallel forms: `words` keeps each entry exactly
# as it appears in words.txt, `lower` holds its lowercase form at the same index.
with open("words.txt", "r") as f:
    words = f.read().split("\n")
lower = [entry.lower() for entry in words]
# Read the flagged tokens back in, one per line of wrong.txt.
with open("wrong.txt", "r") as f:
    wrong = f.read().split("\n")

# For every flagged token, look at the dictionary entries of the same length
# and print those that differ from it in exactly one position, either as
# written or after lowercasing. Each hit prints four lines: both diffs, the
# flagged token, and the dictionary entry in its original spelling.
for word in wrong:
    for entry, entry_lower in zip(words, lower):
        if len(word) != len(entry_lower):
            continue
        diff1 = get_diff(word, entry)
        diff2 = get_diff(word.lower(), entry_lower)
        if 1 in (len(diff1), len(diff2)):
            print(diff1)
            print(diff2)
            print(word)
            print(entry)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment