Last active
November 20, 2022 12:30
-
-
Save behitek/72ccf3ddf7d179b281fdae6c0b84942b to your computer and use it in GitHub Desktop.
Chuẩn hóa cách gõ dấu câu về kiểu gõ cũ (Python + Java version)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import regex as re | |
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ" | |
# ASCII counterpart of every character in ``uniChars``, position by position.
# Group sizes follow the vowel families in ``uniChars``:
# a/â/ă = 17, e/ê = 11, đ = 1, i = 5, o/ô/ơ = 17, u/ư = 11, y = 5,
# first lowercase, then uppercase, then the six trailing bare letters.
# The previous literal had only three uppercase "I"s, which shifted the
# uppercase I/O section out of alignment.
unsignChars = (
    "a" * 17 + "e" * 11 + "d" + "i" * 5 + "o" * 17 + "u" * 11 + "y" * 5
    + "A" * 17 + "E" * 11 + "D" + "I" * 5 + "O" * 17 + "U" * 11 + "Y" * 5
    + "AADOOU"
)
def loaddicchar():
    """Build the table that maps decomposed Vietnamese vowels to NFC form.

    For each of the 12 Vietnamese base vowels (both cases) combined with
    each of the 5 tone marks, the returned dict maps every common spelling
    of that letter to its single precomposed code point:

    * base letter + combining tone mark (the "Windows-1258" style),
    * the same with the base letter itself decomposed,
    * the canonical NFD sequence,
    * the precomposed character itself (identity entry).

    The table is generated rather than typed in, because the decomposed and
    precomposed spellings are visually identical and a hand-written literal
    is silently destroyed by any tool that normalises the source file.

    :return: dict mapping each spelling (str) to the precomposed char (str)
    """
    import unicodedata

    # a, â, ă, e, ê, i, o, ô, ơ, u, ư, y -- written as escapes so that the
    # quality diacritic is guaranteed to be precomposed.
    bases = 'a\u00e2\u0103e\u00eaio\u00f4\u01a1u\u01b0y'
    # grave, acute, hook above, tilde, dot below
    tones = '\u0300\u0301\u0309\u0303\u0323'

    dic = {}
    for base in bases + bases.upper():
        for tone in tones:
            composed = unicodedata.normalize('NFC', base + tone)
            dic[composed] = composed
            dic[base + tone] = composed
            dic[unicodedata.normalize('NFD', base) + tone] = composed
            dic[unicodedata.normalize('NFD', composed)] = composed
    return dic
dicchar = loaddicchar() | |
def convert_unicode(txt):
    """Replace every key of ``dicchar`` found in *txt* with its mapped value.

    The search pattern is derived from ``dicchar`` itself instead of being a
    second hand-written copy of the table, so the pattern and the lookup can
    never disagree (which would raise ``KeyError``).

    :param txt: text to convert
    :return: *txt* with every occurrence of a ``dicchar`` key replaced
    """
    if not dicchar:
        return txt
    # Longest keys first: a short key must not win over a longer key that
    # starts with the same characters.
    pattern = '|'.join(
        re.escape(key) for key in sorted(dicchar, key=len, reverse=True)
    )
    return re.sub(pattern, lambda match: dicchar[match.group()], txt)
""" | |
Start section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey | |
Ví dụ: thủy = thuyr, tượng = tuwowngj | |
""" | |
bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'], | |
['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'], | |
['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'], | |
['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'], | |
['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'], | |
['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'], | |
['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'], | |
['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'], | |
['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'], | |
['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'], | |
['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'], | |
['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']] | |
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j'] | |
# Reverse lookup: vowel character -> (row, column) in ``bang_nguyen_am``.
# The last column of each row is skipped, so it never becomes a key.
nguyen_am_to_ids = {
    vowel: (row_idx, col_idx)
    for row_idx, row in enumerate(bang_nguyen_am)
    for col_idx, vowel in enumerate(row[:-1])
}
def vn_word_to_telex_type(word):
    """Convert one accented Vietnamese word to its Telex key sequence.

    Each vowel is replaced by the last column of its ``bang_nguyen_am`` row
    and the tone key from ``bang_ky_tu_dau`` is added once per word, e.g.
    ``thủy -> thuyr`` and ``tượng -> tuwowngj``.

    Compared with the earlier version this also:

    * converts ``đ`` to ``dd``,
    * handles uppercase vowels and ``Đ`` (the replacement is capitalised),
    * puts the tone key after the last letter instead of after trailing
      punctuation (``hoà,`` gives ``hoaf,`` rather than ``hoa,f``).

    :param word: a single whitespace-free token
    :return: the Telex spelling of *word*
    """
    tone = 0
    pieces = []
    for char in word:
        lowered = char.lower()
        if lowered == '\u0111':  # đ
            keys = 'dd'
        else:
            row, col = nguyen_am_to_ids.get(lowered, (-1, -1))
            if row == -1:
                pieces.append(char)
                continue
            if col != 0:
                # As before, the last toned vowel decides the word's tone.
                tone = col
            keys = bang_nguyen_am[row][-1]
        pieces.append(keys.capitalize() if char != lowered else keys)

    if tone:
        # Skip back over trailing punctuation or digits.
        insert_at = len(pieces)
        while insert_at > 0 and not pieces[insert_at - 1].isalpha():
            insert_at -= 1
        pieces.insert(insert_at, bang_ky_tu_dau[tone])
    return ''.join(pieces)
def vn_sentence_to_telex_type(sentence):
    """Convert an accented Vietnamese sentence to Telex typing.

    The sentence is split on whitespace, each token goes through
    ``vn_word_to_telex_type`` and the results are joined with single spaces.

    :param sentence: text to convert
    :return: the converted sentence
    """
    return ' '.join(
        vn_word_to_telex_type(token) for token in sentence.split()
    )
""" | |
End section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey | |
""" | |
""" | |
Start section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý | |
Xem tại đây: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF | |
""" | |
def chuan_hoa_dau_tu_tieng_viet(word):
    """Move the tone mark of one word to its "old style" position.

    Old style means ``òa`` / ``úy`` rather than ``oà`` / ``uý``.

    Placement rules, in order:

    1. A word rejected by ``is_valid_vietnam_word`` is returned unchanged.
    2. ``u`` after ``q`` and ``i`` after ``g`` mark the word as a qu/gi
       word; while that flag is set, a vowel at index 1 is not counted.
    3. Fewer than two counted vowels: a qu/gi word gets the tone on the
       character at index 2 if it is a vowel, otherwise back on index 1;
       any other word is returned unchanged.
    4. If a counted vowel is ``ê`` or ``ơ``, the first such vowel gets it.
    5. Exactly two counted vowels that end the word: tone on the first.
    6. Otherwise: tone on the second counted vowel.

    The ``q`` / ``g`` test ignores case, so capitalised words such as
    ``Quả`` or ``Giá`` are no longer turned into ``Qủa`` / ``Gía``.

    :param word: a single word without surrounding punctuation
    :return: the word with its tone mark repositioned
    """
    if not is_valid_vietnam_word(word):
        return word

    chars = list(word)
    dau_cau = 0           # tone column found in the word (0 = no tone)
    nguyen_am_index = []  # positions of the vowels that may carry the tone
    qu_or_gi = False
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1:
            continue
        previous = chars[index - 1].lower() if index != 0 else ''
        # Row 9 is "u", row 5 is "i".
        if (x == 9 and previous == 'q') or (x == 5 and previous == 'g'):
            qu_or_gi = True
        if y != 0:
            dau_cau = y
        # Strip the tone for now; it is re-applied at the chosen position.
        chars[index] = bang_nguyen_am[x][0]
        if not qu_or_gi or index != 1:
            nguyen_am_index.append(index)

    if len(nguyen_am_index) < 2:
        if not qu_or_gi:
            return word
        if len(chars) == 2:
            x, _ = nguyen_am_to_ids[chars[1]]
            chars[1] = bang_nguyen_am[x][dau_cau]
        elif chars[2] in nguyen_am_to_ids:
            x, _ = nguyen_am_to_ids[chars[2]]
            chars[2] = bang_nguyen_am[x][dau_cau]
        else:
            # No vowel follows "qu"/"gi": the tone goes back on the u/i.
            row = 5 if chars[1] == 'i' else 9
            chars[1] = bang_nguyen_am[row][dau_cau]
        return ''.join(chars)

    for index in nguyen_am_index:
        x, _ = nguyen_am_to_ids[chars[index]]
        if x == 4 or x == 8:  # ê, ơ
            chars[index] = bang_nguyen_am[x][dau_cau]
            return ''.join(chars)

    ends_with_vowel = nguyen_am_index[-1] == len(chars) - 1
    if len(nguyen_am_index) == 2 and ends_with_vowel:
        target = nguyen_am_index[0]
    else:
        target = nguyen_am_index[1]
    x, _ = nguyen_am_to_ids[chars[target]]
    chars[target] = bang_nguyen_am[x][dau_cau]
    return ''.join(chars)
def is_valid_vietnam_word(word):
    """Tell whether all vowels of *word* form one contiguous run.

    Only characters present in ``nguyen_am_to_ids`` count as vowels. A word
    with no vowel or a single vowel is accepted.

    :param word: the word to check
    :return: ``False`` if two vowels are separated by a non-vowel, else
        ``True``
    """
    vowel_positions = [
        pos for pos, letter in enumerate(word) if letter in nguyen_am_to_ids
    ]
    return all(
        later - earlier == 1
        for earlier, later in zip(vowel_positions, vowel_positions[1:])
    )
def chuan_hoa_dau_cau_tieng_viet(sentence):
    """Normalise a Vietnamese sentence to old-style tone placement.

    The sentence is lower-cased and split on whitespace. Every token that
    has the shape ``punctuation* word punctuation*`` (the word may contain
    dots) has its word part passed through ``chuan_hoa_dau_tu_tieng_viet``.
    Any other token is left as it is.

    Two defects of the earlier version are fixed:

    * the character class was written ``[p{L}.]`` (missing backslash), so
      it matched the literal characters ``p { L } .`` instead of letters;
    * tokens were taken apart with a ``'/'`` marker and ``split('/')``,
      which deleted slashes from tokens such as ``20/11/2022``.

    :param sentence: text to normalise
    :return: the normalised sentence, tokens joined by single spaces
    """
    sentence = sentence.lower()
    words = sentence.split()
    for index, word in enumerate(words):
        match = re.fullmatch(r'(\p{P}*)([\p{L}.]*\p{L}+)(\p{P}*)', word)
        if match:
            prefix, core, suffix = match.groups()
            words[index] = prefix + chuan_hoa_dau_tu_tieng_viet(core) + suffix
    return ' '.join(words)
""" | |
End section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý | |
Xem tại đây: https://vi.wikipedia.org/wiki/Quy_tắc_đặt_dấu_thanh_trong_chữ_quốc_ngữ | |
""" | |
if __name__ == '__main__':
    # Manual smoke test: normalise one sample sentence and print the result.
    sample_sentence = 'anh hoà, đang làm.. gì'
    print(chuan_hoa_dau_cau_tieng_viet(sample_sentence))
    # A disabled batch script used to follow here. It read a corpus file
    # line by line, normalised each line and appended it to an output file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import regex as re | |
uniChars = "àáảãạâầấẩẫậăằắẳẵặèéẻẽẹêềếểễệđìíỉĩịòóỏõọôồốổỗộơờớởỡợùúủũụưừứửữựỳýỷỹỵÀÁẢÃẠÂẦẤẨẪẬĂẰẮẲẴẶÈÉẺẼẸÊỀẾỂỄỆĐÌÍỈĨỊÒÓỎÕỌÔỒỐỔỖỘƠỜỚỞỠỢÙÚỦŨỤƯỪỨỬỮỰỲÝỶỸỴÂĂĐÔƠƯ" | |
# ASCII counterpart of every character in ``uniChars``, position by position.
# Group sizes follow the vowel families in ``uniChars``:
# a/â/ă = 17, e/ê = 11, đ = 1, i = 5, o/ô/ơ = 17, u/ư = 11, y = 5,
# first lowercase, then uppercase, then the six trailing bare letters.
# The previous literal had only three uppercase "I"s, which shifted the
# uppercase I/O section out of alignment.
unsignChars = (
    "a" * 17 + "e" * 11 + "d" + "i" * 5 + "o" * 17 + "u" * 11 + "y" * 5
    + "A" * 17 + "E" * 11 + "D" + "I" * 5 + "O" * 17 + "U" * 11 + "Y" * 5
    + "AADOOU"
)
def loaddicchar():
    """Build the table that maps decomposed Vietnamese vowels to NFC form.

    For each of the 12 Vietnamese base vowels (both cases) combined with
    each of the 5 tone marks, the returned dict maps every common spelling
    of that letter to its single precomposed code point:

    * base letter + combining tone mark (the "Windows-1258" style),
    * the same with the base letter itself decomposed,
    * the canonical NFD sequence,
    * the precomposed character itself (identity entry).

    The table is generated rather than typed in, because the decomposed and
    precomposed spellings are visually identical and a hand-written literal
    is silently destroyed by any tool that normalises the source file.

    :return: dict mapping each spelling (str) to the precomposed char (str)
    """
    import unicodedata

    # a, â, ă, e, ê, i, o, ô, ơ, u, ư, y -- written as escapes so that the
    # quality diacritic is guaranteed to be precomposed.
    bases = 'a\u00e2\u0103e\u00eaio\u00f4\u01a1u\u01b0y'
    # grave, acute, hook above, tilde, dot below
    tones = '\u0300\u0301\u0309\u0303\u0323'

    dic = {}
    for base in bases + bases.upper():
        for tone in tones:
            composed = unicodedata.normalize('NFC', base + tone)
            dic[composed] = composed
            dic[base + tone] = composed
            dic[unicodedata.normalize('NFD', base) + tone] = composed
            dic[unicodedata.normalize('NFD', composed)] = composed
    return dic
dicchar = loaddicchar() | |
def convert_unicode(txt):
    """Replace every key of ``dicchar`` found in *txt* with its mapped value.

    The search pattern is derived from ``dicchar`` itself instead of being a
    second hand-written copy of the table, so the pattern and the lookup can
    never disagree (which would raise ``KeyError``).

    :param txt: text to convert
    :return: *txt* with every occurrence of a ``dicchar`` key replaced
    """
    if not dicchar:
        return txt
    # Longest keys first: a short key must not win over a longer key that
    # starts with the same characters.
    pattern = '|'.join(
        re.escape(key) for key in sorted(dicchar, key=len, reverse=True)
    )
    return re.sub(pattern, lambda match: dicchar[match.group()], txt)
""" | |
Start section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey | |
Ví dụ: thủy = thuyr, tượng = tuwowngj | |
""" | |
bang_nguyen_am = [['a', 'à', 'á', 'ả', 'ã', 'ạ', 'a'], | |
['ă', 'ằ', 'ắ', 'ẳ', 'ẵ', 'ặ', 'aw'], | |
['â', 'ầ', 'ấ', 'ẩ', 'ẫ', 'ậ', 'aa'], | |
['e', 'è', 'é', 'ẻ', 'ẽ', 'ẹ', 'e'], | |
['ê', 'ề', 'ế', 'ể', 'ễ', 'ệ', 'ee'], | |
['i', 'ì', 'í', 'ỉ', 'ĩ', 'ị', 'i'], | |
['o', 'ò', 'ó', 'ỏ', 'õ', 'ọ', 'o'], | |
['ô', 'ồ', 'ố', 'ổ', 'ỗ', 'ộ', 'oo'], | |
['ơ', 'ờ', 'ớ', 'ở', 'ỡ', 'ợ', 'ow'], | |
['u', 'ù', 'ú', 'ủ', 'ũ', 'ụ', 'u'], | |
['ư', 'ừ', 'ứ', 'ử', 'ữ', 'ự', 'uw'], | |
['y', 'ỳ', 'ý', 'ỷ', 'ỹ', 'ỵ', 'y']] | |
bang_ky_tu_dau = ['', 'f', 's', 'r', 'x', 'j'] | |
# Reverse lookup: vowel character -> (row, column) in ``bang_nguyen_am``.
# The last column of each row is skipped, so it never becomes a key.
nguyen_am_to_ids = {
    vowel: (row_idx, col_idx)
    for row_idx, row in enumerate(bang_nguyen_am)
    for col_idx, vowel in enumerate(row[:-1])
}
def vn_word_to_telex_type(word):
    """Return *word* after passing it through ``convert_unicode``.

    NOTE(review): despite the name, no Telex key sequence is produced here;
    the word only goes through ``convert_unicode``. Confirm whether that is
    intended.

    :param word: the word to convert
    :return: the result of ``convert_unicode(word)``
    """
    converted = convert_unicode(word)
    return converted
def vn_sentence_to_telex_type(sentence):
    """Return *sentence* after passing it through ``convert_unicode``.

    NOTE(review): despite the name, no Telex key sequence is produced here;
    the sentence only goes through ``convert_unicode``. Confirm whether that
    is intended.

    :param sentence: the text to convert
    :return: the result of ``convert_unicode(sentence)``
    """
    converted = convert_unicode(sentence)
    return converted
""" | |
End section: Chuyển câu văn về kiểu gõ telex khi không bật Unikey | |
""" | |
""" | |
Start section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý | |
Xem tại đây: https://vi.wikipedia.org/wiki/Quy_t%E1%BA%AFc_%C4%91%E1%BA%B7t_d%E1%BA%A5u_thanh_trong_ch%E1%BB%AF_qu%E1%BB%91c_ng%E1%BB%AF | |
""" | |
def chuan_hoa_dau_tu_tieng_viet(word):
    """Move the tone mark of one word to its "old style" position.

    Old style means ``òa`` / ``úy`` rather than ``oà`` / ``uý``.

    Placement rules, in order:

    1. A word rejected by ``is_valid_vietnam_word`` is returned unchanged.
    2. ``u`` after ``q`` and ``i`` after ``g`` mark the word as a qu/gi
       word; while that flag is set, a vowel at index 1 is not counted.
    3. Fewer than two counted vowels: a qu/gi word gets the tone on the
       character at index 2 if it is a vowel, otherwise back on index 1;
       any other word is returned unchanged.
    4. If a counted vowel is ``ê`` or ``ơ``, the first such vowel gets it.
    5. Exactly two counted vowels that end the word: tone on the first.
    6. Otherwise: tone on the second counted vowel.

    The ``q`` / ``g`` test ignores case. The sentence-level caller in this
    file does not lower-case its input, so without this, capitalised words
    such as ``Quả`` or ``Giá`` were turned into ``Qủa`` / ``Gía``.

    :param word: a single word without surrounding punctuation
    :return: the word with its tone mark repositioned
    """
    if not is_valid_vietnam_word(word):
        return word

    chars = list(word)
    dau_cau = 0           # tone column found in the word (0 = no tone)
    nguyen_am_index = []  # positions of the vowels that may carry the tone
    qu_or_gi = False
    for index, char in enumerate(chars):
        x, y = nguyen_am_to_ids.get(char, (-1, -1))
        if x == -1:
            continue
        previous = chars[index - 1].lower() if index != 0 else ''
        # Row 9 is "u", row 5 is "i".
        if (x == 9 and previous == 'q') or (x == 5 and previous == 'g'):
            qu_or_gi = True
        if y != 0:
            dau_cau = y
        # Strip the tone for now; it is re-applied at the chosen position.
        chars[index] = bang_nguyen_am[x][0]
        if not qu_or_gi or index != 1:
            nguyen_am_index.append(index)

    if len(nguyen_am_index) < 2:
        if not qu_or_gi:
            return word
        if len(chars) == 2:
            x, _ = nguyen_am_to_ids[chars[1]]
            chars[1] = bang_nguyen_am[x][dau_cau]
        elif chars[2] in nguyen_am_to_ids:
            x, _ = nguyen_am_to_ids[chars[2]]
            chars[2] = bang_nguyen_am[x][dau_cau]
        else:
            # No vowel follows "qu"/"gi": the tone goes back on the u/i.
            row = 5 if chars[1] == 'i' else 9
            chars[1] = bang_nguyen_am[row][dau_cau]
        return ''.join(chars)

    for index in nguyen_am_index:
        x, _ = nguyen_am_to_ids[chars[index]]
        if x == 4 or x == 8:  # ê, ơ
            chars[index] = bang_nguyen_am[x][dau_cau]
            return ''.join(chars)

    ends_with_vowel = nguyen_am_index[-1] == len(chars) - 1
    if len(nguyen_am_index) == 2 and ends_with_vowel:
        target = nguyen_am_index[0]
    else:
        target = nguyen_am_index[1]
    x, _ = nguyen_am_to_ids[chars[target]]
    chars[target] = bang_nguyen_am[x][dau_cau]
    return ''.join(chars)
def is_valid_vietnam_word(word):
    """Tell whether all vowels of *word* form one contiguous run.

    Only characters present in ``nguyen_am_to_ids`` count as vowels. A word
    with no vowel or a single vowel is accepted.

    :param word: the word to check
    :return: ``False`` if two vowels are separated by a non-vowel, else
        ``True``
    """
    vowel_positions = [
        pos for pos, letter in enumerate(word) if letter in nguyen_am_to_ids
    ]
    return all(
        later - earlier == 1
        for earlier, later in zip(vowel_positions, vowel_positions[1:])
    )
def chuan_hoa_dau_cau_tieng_viet(sentence):
    """Normalise a Vietnamese sentence to old-style tone placement.

    The sentence is split on whitespace; its letter case is kept. Every
    token that has the shape ``punctuation* word punctuation*`` (the word
    may contain dots) has its word part passed through
    ``chuan_hoa_dau_tu_tieng_viet``. Any other token is left as it is.

    Two defects of the earlier version are fixed:

    * the character class was written ``[p{L}.]`` (missing backslash), so
      it matched the literal characters ``p { L } .`` instead of letters;
    * tokens were taken apart with a ``'/'`` marker and ``split('/')``,
      which deleted slashes from tokens such as ``20/11/2022``.

    :param sentence: text to normalise
    :return: the normalised sentence, tokens joined by single spaces
    """
    words = sentence.split()
    for index, word in enumerate(words):
        match = re.fullmatch(r'(\p{P}*)([\p{L}.]*\p{L}+)(\p{P}*)', word)
        if match:
            prefix, core, suffix = match.groups()
            words[index] = prefix + chuan_hoa_dau_tu_tieng_viet(core) + suffix
    return ' '.join(words)
""" | |
End section: Chuyển câu văn về cách gõ dấu kiểu cũ: dùng òa úy thay oà uý | |
Xem tại đây: https://vi.wikipedia.org/wiki/Quy_tắc_đặt_dấu_thanh_trong_chữ_quốc_ngữ | |
""" | |
if __name__ == '__main__':
    # Manual smoke test: run one sample sentence through both entry points.
    sample_sentence = 'anh Hoà, đang làm.. gì'
    print(vn_sentence_to_telex_type(sample_sentence))
    print(chuan_hoa_dau_cau_tieng_viet(sample_sentence))
    # A disabled batch script used to follow here. It read a corpus file
    # line by line, normalised each line and appended it to an output file.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Update: Chuẩn hoá encoding trong Python đơn giản hơn