Last active
May 13, 2022 08:20
-
-
Save beatobongco/508a484d205801a6eca1cfb07bf50e21 to your computer and use it in GitHub Desktop.
Replace non-English sentences
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import math | |
import enchant | |
d = enchant.Dict("en_US") | |
def replace_noneng_sents(text: str, replace_with_spaces: bool = True,
                         split_regex: str = r"[\.\。\!\?\?]",
                         clean_regex: str = r"[^a-zA-Z\']",
                         threshold: float = 0.5,
                         debug: bool = False,
                         *,
                         check_word=None) -> str:
    """Blank out (or drop) the sentences of *text* that are mostly non-English.

    The text is cut into sentences at every match of ``split_regex``. Each
    sentence is split on whitespace; every word is stripped of the characters
    matched by ``clean_regex``, lower-cased and passed to ``check_word``. A
    sentence is kept when the number of accepted words is at least
    ``ceil(word_count * threshold)``; a sentence without any words is
    therefore always kept.

    Kept sentences are emitted together with the delimiter that actually
    terminated them, so the original punctuation is preserved.

    Args:
        text: The text to filter.
        replace_with_spaces: If true, a rejected sentence and its delimiter
            are replaced by the same number of spaces, so the result has
            exactly the same length as ``text``. If false, they are removed.
        split_regex: Pattern whose matches delimit sentences.
        clean_regex: Pattern whose matches are deleted from a word before it
            is checked.
        threshold: Fraction of words that must be accepted for a sentence to
            be kept.
        debug: If true, print every rejected sentence (stripped).
        check_word: Optional callable taking a cleaned, lower-cased word and
            returning a truthy value when the word counts as English.
            Defaults to the ``check`` method of the module-level
            dictionary ``d``.

    Returns:
        The filtered text.
    """
    if check_word is None:
        # Looked up lazily so the module-level dictionary is only required
        # when the caller does not supply a checker of their own.
        check_word = d.check
    cleaner = re.compile(clean_regex)

    # (start, end) of every delimiter, plus an empty sentinel at the end of
    # the text so the trailing, undelimited sentence goes through the same path.
    delimiter_spans = [m.span() for m in re.finditer(split_regex, text)]
    delimiter_spans.append((len(text), len(text)))

    pieces = []
    cursor = 0
    for delim_start, delim_end in delimiter_spans:
        sent = text[cursor:delim_start]
        words = sent.split()
        cleaned_words = (cleaner.sub("", word) for word in words)
        engwords = sum(
            1 for word in cleaned_words if word and check_word(word.lower())
        )
        if engwords >= math.ceil(len(words) * threshold):
            # Keep the sentence with the punctuation that really followed it.
            pieces.append(text[cursor:delim_end])
        else:
            if debug:
                print("Removed:", sent.strip())
            if replace_with_spaces:
                # Pad sentence *and* delimiter so character offsets into the
                # result still line up with offsets into the input.
                pieces.append(" " * (delim_end - cursor))
        cursor = delim_end
    return "".join(pieces)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sample output
Debug statements:
Output text: