Skip to content

Instantly share code, notes, and snippets.

@mishari
Created July 3, 2021 03:58
Show Gist options
  • Save mishari/fd8bd67c4db28ebd0c4c8ebb10c2d51c to your computer and use it in GitHub Desktop.
Save mishari/fd8bd67c4db28ebd0c4c8ebb10c2d51c to your computer and use it in GitHub Desktop.
text processor for Thai language
from pythainlp.tokenize import sent_tokenize, word_tokenize
from pythainlp.util import normalize
from pythainlp.util import num_to_thaiword
from pythainlp.spell import NorvigSpellChecker
import json_lines
import json
import re
import string
import sys
import fileinput
from multiprocessing import Pool
# Parameters from https://github.com/common-voice/sentence-collector/blob/main/server/lib/validation/languages/th.js
MIN_LENGTH = 6
MAX_LENGTH = 100

# Each rule pairs a regex with the reason a sentence is rejected when the
# regex matches anywhere in it (the rules are applied with ``re.search``).
#
# Patterns that contain backslashes are raw strings so that regex escapes
# (``\s``, ``\.``, ``\1`` ...) reach the ``re`` module untouched; ``re``
# itself understands ``\uXXXX`` / ``\UXXXXXXXX`` in ``str`` patterns.
INVALIDATION = [
    {
        "regex": '[0-9๐-๙]',
        "error": 'Sentence should not contain numbers',
    },
    {
        "regex": r'[<>+*#@^\[\]()/\u0E2F\u0E46\u0E4F\u0E5A\u0E5B]',
        "error": 'Sentence should not contain symbols, including Paiyannoi and Maiyamok',
    },
    {
        "regex": '[A-Za-z]',
        "error": 'Sentence should not contain latin alphabet characters',
    },
    {
        "regex": r'[ก-ฮ]\.[ก-ฮ]+\.',
        "error": 'Sentence should not contain abbreviations',
    },
    {
        "regex": r'(^|\s)[\u0E30\u0E32\u0E33\u0E45\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E]',
        "error": 'Word should not start with unexpected characters, like follow vowel and tone mark',
    },
    {
        "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44](\s|$)',
        "error": 'Word should not end with leading vowels',
    },
    {
        "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44]{2}',
        "error": 'Sentence should not contain repeating lead vowels',
    },
    {
        "regex": r'[\u0E32\u0E33\u0E45]{2}',
        "error": 'Sentence should not contain repeating follow vowels',
    },
    {
        "regex": r'\u0E30{2}',
        "error": 'Sentence should not contain repeating Sara A',
    },
    {
        "regex": r'\u0E3A{2}|\u0E4C{2}|\u0E4D{2}|\u0E4E{2}',
        "error": 'Sentence should not contain repeating Phinthu / Thanthakhat / Nikhahit / Yamakkan',
    },
    {
        "regex": r'[\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47]{2}',
        "error": 'Sentence should not contain repeating above vowels',
    },
    {
        "regex": r'[\u0E38\u0E39]{2}',
        "error": 'Sentence should not contain repeating below vowels',
    },
    {
        "regex": r'[\u0E48\u0E49\u0E4A\u0E4B]{2}',
        "error": 'Sentence should not contain repeating tone marks',
    },
    {
        "regex": r'[\u0E40\u0E41\u0E42\u0E43\u0E44\u0E30\u0E32\u0E33\u0E45][\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E]',
        "error": 'Sentence should not contain invalid symbols after lead/follow vowels',
    },
    {
        "regex": r'[\u0E48\u0E49\u0E4A\u0E4B\u0E3A\u0E4C\u0E4D\u0E4E][\u0E31\u0E34\u0E35\u0E36\u0E37\u0E4D\u0E47\u0E38\u0E39]',
        "error": 'Sentence should not contain invalid symbols before above/below vowels',
    },
    {
        "regex": r'[\u0E33\u0E45][\u0E30]',
        "error": 'Sentence should not contain Sara A after Sara Am or Lakkhangyao',
    },
    {
        "regex": r'[\u0E30][\u0E32\u0E33\u0E45]',
        "error": 'Sentence should not contain Sara Aa, Sara Am or Lakkhangyao after Sara A',
    },
    {
        "regex": r'[\u200b\u200c\u2063\u0E01-\u0E4E]{71}',
        "error": 'Sentence should not contain more than 70 consonants and vowels running without a space',
    },
    {
        "regex": r"""[\u200b\u200c\u2063\u0E01-\u0E4E.,\-"'“”‘’\u0060?!:;]{81}""",
        "error": 'Sentence should not contain more than 80 characters running without a space',
    },
    {
        "regex": r'[\u200b\u200c\u2063ก-ฮ]{31}',
        "error": 'Sentence should not contain more than 30 consonants running without a space',
    },
    {
        # Must be a raw string: in a normal string literal ``\1`` is the
        # control character U+0001, not a backreference to group 1.
        "regex": r'(.)\1{6}',
        "error": 'Sentence should not contain more than 7 of the same character in a row',
    },
    {
        # Python strings are sequences of code points, so the UTF-16
        # surrogate pairs D83C/D83D/D83E + low surrogate are expressed
        # directly as the range they encode: U+1F000 - U+1FBFF.
        "regex": r'(\u00a9|\u00ae|[\u2000-\u3300]|[\u2580-\u27bf]|[\U0001F000-\U0001FBFF]|[\ue000-\uf8ff])',
        "error": 'Sentence should not contain emojis or other special Unicode symbols',
    },
]
def strip_punctuation(old_string):
    """Return *old_string* with every ASCII punctuation character removed.

    Only the characters listed in ``string.punctuation`` are dropped; any
    other character (Thai marks, whitespace, typographic quotes, ...) is
    kept unchanged.
    """
    # ``join`` builds the result in one pass instead of repeated ``+=``.
    return "".join(c for c in old_string if c not in string.punctuation)
def strip_whitespace(a_string):
    """Trim *a_string* and collapse each whitespace run to a single space."""
    trimmed = a_string.strip()
    pieces = trimmed.split()
    return " ".join(pieces)
def is_length_valid(s):
    """Return True when ``len(s)`` lies within MIN_LENGTH..MAX_LENGTH inclusive."""
    return MIN_LENGTH <= len(s) <= MAX_LENGTH
def remove_wrong_length(sentences):
    """Return the set of items in *sentences* that pass ``is_length_valid``."""
    return {candidate for candidate in sentences if is_length_valid(candidate)}
def is_sentence_valid(s):
    """Return True when *s* passes every validation step.

    The steps run in order and stop at the first failure:
    length check, then every regex in ``INVALIDATION`` (a match anywhere
    in *s* rejects it), then the spell check.
    """
    if not is_length_valid(s):
        return False
    breaks_rule = any(re.search(rule["regex"], s) for rule in INVALIDATION)
    # Short-circuit keeps the spell check from running on rejected input.
    return not breaks_rule and bool(sentence_spelling_correct(s))
def check_sentences(sentence):
    """Return the items of *sentence* that pass ``is_sentence_valid``.

    Despite its singular name (kept so existing callers keep working),
    *sentence* is an iterable of sentence strings. The result is a list
    that preserves the input order.
    """
    # Iterate the argument itself; the previous body iterated the
    # module-level ``sentences`` set and ignored what was passed in.
    return [s for s in sentence if is_sentence_valid(s)]
def split_sentences(text):
    """Split *text* into sentences and return the valid ones as a list.

    Each sentence from ``sent_tokenize`` is whitespace-normalised. A
    sentence that validates is kept whole; otherwise it is re-split with
    the "whitespace" engine and only the valid pieces are kept.
    """
    candidates = [strip_whitespace(raw) for raw in sent_tokenize(text)]
    accepted = []
    for candidate in candidates:
        if is_sentence_valid(candidate):
            accepted.append(candidate)
            continue
        for piece in sent_tokenize(candidate, engine="whitespace"):
            if is_sentence_valid(piece):
                accepted.append(piece)
    return accepted
def number_to_word(text):
    """Replace each all-digit, space-separated token of *text* with Thai words.

    *text* is split on single spaces; tokens made only of decimal digits
    are converted with ``num_to_thaiword(int(token))`` and every other
    token is kept unchanged. The tokens are re-joined with single spaces,
    so the original spacing is preserved.
    """
    converted = []
    for token in text.split(" "):
        # ``isdecimal`` rather than ``isdigit``: the latter is also true for
        # characters such as "²" that ``int()`` rejects with ValueError.
        if token.isdecimal():
            converted.append(num_to_thaiword(int(token)))
        else:
            converted.append(token)
    return ' '.join(converted)
# Lazily created spell checker shared by every call in this process.
_SPELL_CHECKER = None


def _get_spell_checker():
    """Return the shared NorvigSpellChecker, creating it on first use."""
    global _SPELL_CHECKER
    if _SPELL_CHECKER is None:
        _SPELL_CHECKER = NorvigSpellChecker()
    return _SPELL_CHECKER


def sentence_spelling_correct(sentence):
    """Return True when every word of *sentence* is known to the spell checker.

    The sentence is tokenised with ``word_tokenize``; single-space tokens
    are ignored. The sentence passes when ``checker.known(words)`` returns
    as many entries as there are words.
    """
    # Reuse one checker instead of constructing a new one for every sentence.
    checker = _get_spell_checker()
    words = [w for w in word_tokenize(sentence) if w != ' ']
    return len(checker.known(words)) == len(words)
def normalize_mai_yamok(text):
    """Split *text* on single spaces and discard the result; returns None.

    NOTE(review): this looks like an unfinished stub - nothing in the
    visible file calls it and it performs no normalisation yet.
    """
    _tokens = text.split(" ")  # computed but not used
    return None
# Module-level accumulator of unique accepted sentences. The __main__ block
# adds every worker result to it and then writes its contents to output.txt.
sentences = set()
# Disabled alternative input path, kept for reference: it read JSON-lines
# records from the file named by sys.argv[1] and processed the 'headline'
# and 'article' fields of each record.
# with open(sys.argv[1], 'rb') as f:
#     for item in json_lines.reader(f):
#         headline = number_to_word(normalize(strip_punctuation(
#             item['headline'])))
#         article = number_to_word(normalize(strip_punctuation(
#             item['article'])))
#         sentences.update(split_sentences(headline))
#         sentences.update(split_sentences(article))
def process_text(text):
    """Run the full cleaning pipeline on *text* and return its valid sentences.

    Steps, in order: strip ASCII punctuation, ``normalize``, spell out
    digit tokens, then split into validated sentences.
    """
    without_punctuation = strip_punctuation(text)
    normalized = normalize(without_punctuation)
    spelled_out = number_to_word(normalized)
    return split_sentences(spelled_out)
if __name__ == '__main__':
    # Read lines from the files named on the command line (or stdin), clean
    # them in parallel, and collect the unique valid sentences.
    # The context manager shuts the worker pool down once all results have
    # been consumed, instead of leaving the processes to interpreter exit.
    with Pool() as pool:
        for i in pool.imap_unordered(process_text, fileinput.input()):
            sentences.update(i)

    # Explicit UTF-8 so writing Thai text does not depend on the platform's
    # default locale encoding.
    with open("output.txt", "w", encoding="utf-8") as f:
        f.writelines(s + '\n' for s in sentences)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment