Created
July 3, 2021 03:58
-
-
Save mishari/fd8bd67c4db28ebd0c4c8ebb10c2d51c to your computer and use it in GitHub Desktop.
text processor for Thai language
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pythainlp.tokenize import sent_tokenize, word_tokenize | |
from pythainlp.util import normalize | |
from pythainlp.util import num_to_thaiword | |
from pythainlp.spell import NorvigSpellChecker | |
import json_lines | |
import json | |
import re | |
import string | |
import sys | |
import fileinput | |
from multiprocessing import Pool | |
# Validation parameters ported from
# https://github.com/common-voice/sentence-collector/blob/main/server/lib/validation/languages/th.js
MIN_LENGTH = 6
MAX_LENGTH = 100

# Each rule rejects a sentence when its regex matches anywhere in the text.
# All patterns are raw strings so regex escapes reach the re module intact:
# in the original, the non-raw '(.)\1{6}' turned \1 into the octal escape
# chr(1), silently disabling the repeated-character rule, and '\\#' collapsed
# to the regex '\#' (just '#'), dropping the literal backslash that the
# upstream JS rule rejects. The re module resolves \uXXXX/\UXXXXXXXX escapes
# inside raw patterns itself.
INVALIDATION = [{
    "regex": r'[0-9๐-๙]',
    "error": 'Sentence should not contain numbers',
}, {
    "regex": r'[<>+*\\#@^[\]()/\u0E2F\u0E46\u0E4F\u0E5A\u0E5B]',
    "error": 'Sentence should not contain symbols, including Paiyannoi and Maiyamok',
}, {
    "regex": r'[A-Za-z]',
    "error": 'Sentence should not contain latin alphabet characters',
}, {
    "regex": r'[ก-ฮ]\.[ก-ฮ]+\.',
    "error": 'Sentence should not contain abbreviations',
}, {
    "regex": r'(^|\s)[\u0E30\u0E32\u0E33\u0E45\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E]',
    "error": 'Word should not start with unexpected characters, like follow vowel and tone mark',
}, {
    "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44](\s|$)',
    "error": 'Word should not end with leading vowels',
}, {
    "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44]{2}',
    "error": 'Sentence should not contain repeating lead vowels',
}, {
    "regex": r'[\u0E32\u0E33\u0E45]{2}',
    "error": 'Sentence should not contain repeating follow vowels',
}, {
    "regex": r'\u0E30{2}',
    "error": 'Sentence should not contain repeating Sara A',
}, {
    "regex": r'\u0E3A{2}|\u0E4C{2}|\u0E4D{2}|\u0E4E{2}',
    "error": 'Sentence should not contain repeating Phinthu / Thanthakhat / Nikhahit / Yamakkan',
}, {
    "regex": r'[\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47]{2}',
    "error": 'Sentence should not contain repeating above vowels',
}, {
    "regex": r'[\u0E38\u0E39]{2}',
    "error": 'Sentence should not contain repeating below vowels',
}, {
    "regex": r'[\u0E48\u0E49\u0E4A\u0E4B]{2}',
    "error": 'Sentence should not contain repeating tone marks',
}, {
    "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44\u0E30\u0E32\u0E33\u0E45][\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E]',
    "error": 'Sentence should not contain invalid symbols after lead/follow vowels',
}, {
    "regex": r'[\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E][\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39]',
    "error": 'Sentence should not contain invalid symbols before above/below vowels',
}, {
    "regex": r'[\u0E33\u0E45][\u0E30]',
    "error": 'Sentence should not contain Sara A after Sara Am or Lakkhangyao',
}, {
    "regex": r'[\u0E30][\u0E32\u0E33\u0E45]',
    "error": 'Sentence should not contain Sara Aa, Sara Am or Lakkhangyao after Sara A',
}, {
    "regex": r'[\u200b\u200c\u2063\u0E01-\u0E4E]{71}',
    "error": 'Sentence should not contain more than 70 consonants and vowels running without a space',
}, {
    "regex": r"""[\u200b\u200c\u2063\u0E01-\u0E4E.,\-"'“”‘’\u0060?!:;]{81}""",
    "error": 'Sentence should not contain more than 80 characters running without a space',
}, {
    "regex": r'[\u200b\u200c\u2063ก-ฮ]{31}',
    "error": 'Sentence should not contain more than 30 consonants running without a space',
}, {
    # Raw string so \1 is a regex backreference, not chr(1).
    "regex": r'(.)\1{6}',
    "error": 'Sentence should not contain more than 7 of the same character in a row',
}, {
    # The JS original matched emoji via UTF-16 surrogate pairs
    # (\ud83c[\ud000-\udfff] etc.), which can never match a decoded Python 3
    # string; replaced with the equivalent astral code-point range
    # U+1F000-U+1FBFF. [\u2580-\u27bf] was dropped as a subset of
    # [\u2000-\u3300].
    "regex": r'(\u00a9|\u00ae|[\u2000-\u3300]|[\ue000-\uf8ff]|[\U0001F000-\U0001FBFF])',
    "error": 'Sentence should not contain emojis or other special Unicode symbols',
}]
def strip_punctuation(old_string):
    """Return *old_string* with all ASCII punctuation characters removed.

    Uses str.translate for a single C-level pass instead of the original
    character-by-character += loop, which is quadratic on long inputs.
    Note: only removes string.punctuation (ASCII); Thai punctuation such as
    Paiyannoi is handled later by the INVALIDATION rules.
    """
    return old_string.translate(str.maketrans('', '', string.punctuation))
def strip_whitespace(a_string):
    """Collapse every run of whitespace to a single space and trim the ends."""
    # split() with no arguments already discards leading/trailing whitespace,
    # so joining its pieces with single spaces normalizes the whole string.
    return " ".join(a_string.split())
def is_length_valid(s):
    """Return True when len(s) lies within [MIN_LENGTH, MAX_LENGTH] inclusive."""
    return MIN_LENGTH <= len(s) <= MAX_LENGTH
def remove_wrong_length(sentences):
    """Return the set of sentences whose length passes is_length_valid."""
    # Set comprehension: deduplicates exactly like the original set(new_s).
    return {candidate for candidate in sentences if is_length_valid(candidate)}
def is_sentence_valid(s):
    """Validate one sentence for the corpus.

    Checks, in order: length bounds, every INVALIDATION regex rule, and
    spell-checker coverage. Returns False on the first failed check.

    Cleanup: the original kept a `valid = True` local that was never set to
    False (all failure paths returned early) and a redundant `rules` alias.
    """
    if not is_length_valid(s):
        return False
    # Any single matching invalidation rule rejects the sentence.
    if any(re.search(rule["regex"], s) for rule in INVALIDATION):
        return False
    return sentence_spelling_correct(s)
def check_sentences(sentences):
    """Return the list of valid sentences from an iterable of candidates.

    Bug fix: the original's parameter was named ``sentence`` while the loop
    iterated ``sentences`` — which silently resolved to the module-level
    global of that name instead of the argument, ignoring the input entirely.
    """
    return [s for s in sentences if is_sentence_valid(s)]
def split_sentences(text):
    """Tokenize *text* and return only the sentences that pass validation.

    A sentence that fails validation gets a second chance: it is re-split on
    whitespace and any valid fragments are kept.
    """
    output = []
    for candidate in (strip_whitespace(part) for part in sent_tokenize(text)):
        if is_sentence_valid(candidate):
            output.append(candidate)
            continue
        output.extend(
            fragment
            for fragment in sent_tokenize(candidate, engine="whitespace")
            if is_sentence_valid(fragment)
        )
    return output
def number_to_word(text):
    """Spell out purely-numeric space-separated tokens as Thai number words.

    Non-numeric tokens pass through unchanged; tokens are rejoined with
    single spaces.
    """
    converted = [
        num_to_thaiword(int(token)) if token.isdigit() else token
        for token in text.split(" ")
    ]
    return ' '.join(converted)
def sentence_spelling_correct(sentence):
    """Return True when every tokenized word of *sentence* is known to the
    spell checker (space tokens are ignored).

    Performance fix: the original constructed a fresh NorvigSpellChecker on
    every call, rebuilding its word-frequency model each time. The checker is
    now built lazily once and cached as a function attribute.
    """
    if sentence_spelling_correct._checker is None:
        sentence_spelling_correct._checker = NorvigSpellChecker()
    checker = sentence_spelling_correct._checker
    words = [w for w in word_tokenize(sentence) if w != ' ']
    return len(checker.known(words)) == len(words)

# Lazy singleton slot for the expensive-to-build spell checker.
sentence_spelling_correct._checker = None
def normalize_mai_yamok(text):
    # NOTE(review): unfinished stub — it splits the text but never uses the
    # result and implicitly returns None. Presumably it was meant to normalize
    # Mai Yamok (repetition mark, U+0E46) usage; confirm intent before using.
    sentences = text.split(" ")
# Global accumulator of unique valid sentences; filled by the __main__ block
# below and finally written to output.txt.
sentences = set()
# Earlier ingestion path kept for reference: read a json-lines corpus from
# argv[1] and collect sentences from its 'headline' and 'article' fields.
# with open(sys.argv[1], 'rb') as f:
# for item in json_lines.reader(f):
# headline = number_to_word(normalize(strip_punctuation(
# item['headline'])))
# article = number_to_word(normalize(strip_punctuation(
# item['article'])))
# sentences.update(split_sentences(headline))
# sentences.update(split_sentences(article))
def process_text(text):
    """Full cleaning pipeline for one raw text.

    Strips ASCII punctuation, normalizes the Thai text, spells out numeric
    tokens, then splits into validated sentences.
    """
    cleaned = strip_punctuation(text)
    cleaned = normalize(cleaned)
    cleaned = number_to_word(cleaned)
    return split_sentences(cleaned)
if __name__ == '__main__':
    # Fan the input lines out across worker processes; result order does not
    # matter because everything is accumulated into a set.
    # Fix: the Pool is now a context manager so workers are terminated and
    # joined on exit (the original leaked the pool).
    with Pool() as pool:
        for batch in pool.imap_unordered(process_text, fileinput.input()):
            sentences.update(batch)
    # Explicit UTF-8 so Thai output does not depend on the locale's default
    # encoding (the original could crash or mojibake on non-UTF-8 locales).
    with open("output.txt", "w", encoding="utf-8") as f:
        f.writelines(s + '\n' for s in sentences)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment