J2TEAM/remove_accents.py

trieuhaivo · 2019-07-23T04:18:32Z

Đóng góp thêm:

import re

# Accented Vietnamese letters grouped by the plain ASCII letter that
# replaces them.
_VIETNAMESE_ACCENT_GROUPS = (
    ('àáạảãâầấậẩẫăằắặẳẵ', 'a'),
    ('ÀÁẠẢÃĂẰẮẶẲẴÂẦẤẬẨẪ', 'A'),
    ('èéẹẻẽêềếệểễ', 'e'),
    ('ÈÉẸẺẼÊỀẾỆỂỄ', 'E'),
    ('òóọỏõôồốộổỗơờớợởỡ', 'o'),
    ('ÒÓỌỎÕÔỒỐỘỔỖƠỜỚỢỞỠ', 'O'),
    ('ìíịỉĩ', 'i'),
    ('ÌÍỊỈĨ', 'I'),
    ('ùúụủũưừứựửữ', 'u'),
    ('ƯỪỨỰỬỮÙÚỤỦŨ', 'U'),
    ('ỳýỵỷỹ', 'y'),
    ('ỲÝỴỶỸ', 'Y'),
    ('Đ', 'D'),
    ('đ', 'd'),
)

# Code point -> replacement letter, built once so that a call is a single
# str.translate pass instead of fourteen regex substitutions.
_VIETNAMESE_ACCENT_TABLE = {
    ord(accented): plain
    for letters, plain in _VIETNAMESE_ACCENT_GROUPS
    for accented in letters
}


def no_accent_vietnamese(s):
    """Return *s* with Vietnamese accented letters replaced by ASCII letters.

    The text is normalized to Unicode NFC before the lookup so that letters
    written with combining marks (for example ``ê`` followed by U+0303) are
    composed into the single code points the table knows about. Without
    that step such input keeps part or all of its accents.

    Characters that are not in the table are kept; as a consequence of the
    normalization they are returned in NFC form.

    Args:
        s: The text to convert.

    Returns:
        The converted string.
    """
    # Local import so the function works without touching the file's imports.
    import unicodedata

    return unicodedata.normalize('NFC', s).translate(_VIETNAMESE_ACCENT_TABLE)

if __name__ == '__main__':
    # Demo: print the accent-free form of a few sample phrases.
    for sample in (
        "Việt Nam Đất Nước Con Người",
        "Welcome to Vietnam !",
        "VIỆT NAM ĐẤT NƯỚC CON NGƯỜI",
    ):
        print(no_accent_vietnamese(sample))

# Output
# Viet Nam Dat Nuoc Con Nguoi
# Welcome to Vietnam !
# VIET NAM DAT NUOC CON NGUOI

Hoặc có thể cài và sử dụng thư viện unidecode:

pip install unidecode

from unidecode import unidecode

# Demo: print the unidecode transliteration of a few sample phrases.
for sample in (
    "Việt Nam Đất Nước Con Người",
    "Welcome to Vietnam !",
    "VIỆT NAM ĐẤT NƯỚC CON NGƯỜI",
):
    print(unidecode(sample))

# Output
# Viet Nam Dat Nuoc Con Nguoi
# Welcome to Vietnam !
# VIET NAM DAT NUOC CON NGUOI

tmhung-nt · 2019-12-21T06:52:52Z

thanks mates

vietvudanh · 2020-02-03T07:59:48Z

pandas column version

def no_accent_vietnamese_col(df, col):
    """Return column *col* of *df* with Vietnamese accents replaced by ASCII.

    String cells are normalized to Unicode NFC first so that letters written
    with combining marks (for example ``ê`` followed by U+0303) are composed
    into the single code points the patterns below match. Without that step
    such cells keep part or all of their accents. Cells that are not ``str``
    instances are passed through the normalization step unchanged.

    Args:
        df: Object indexable by *col* that yields a pandas Series.
        col: Label of the column to convert.

    Returns:
        A new Series with the replacements applied; *df* is not modified.
    """
    # Local import so the function works without touching the file's imports.
    import unicodedata

    replacements = (
        (r'[àáạảãâầấậẩẫăằắặẳẵ]', 'a'),
        (r'[ÀÁẠẢÃĂẰẮẶẲẴÂẦẤẬẨẪ]', 'A'),
        (r'[èéẹẻẽêềếệểễ]', 'e'),
        (r'[ÈÉẸẺẼÊỀẾỆỂỄ]', 'E'),
        (r'[òóọỏõôồốộổỗơờớợởỡ]', 'o'),
        (r'[ÒÓỌỎÕÔỒỐỘỔỖƠỜỚỢỞỠ]', 'O'),
        (r'[ìíịỉĩ]', 'i'),
        (r'[ÌÍỊỈĨ]', 'I'),
        (r'[ùúụủũưừứựửữ]', 'u'),
        (r'[ƯỪỨỰỬỮÙÚỤỦŨ]', 'U'),
        (r'[ỳýỵỷỹ]', 'y'),
        (r'[ỲÝỴỶỸ]', 'Y'),
        (r'[Đ]', 'D'),
        (r'[đ]', 'd'),
    )

    def _compose(value):
        # Only text can be normalized; leave numbers, NaN, None, ... alone.
        if isinstance(value, str):
            return unicodedata.normalize('NFC', value)
        return value

    s = df[col].map(_compose)
    for pattern, plain in replacements:
        s = s.replace(pattern, plain, regex=True)
    return s

truong0vanchien · 2021-08-06T15:28:32Z

Cam on ban.

truong0vanchien · 2021-08-06T15:29:09Z

Đóng góp thêm:

import re

# Accented Vietnamese letters grouped by the plain ASCII letter that
# replaces them.
_VIETNAMESE_ACCENT_GROUPS = (
    ('àáạảãâầấậẩẫăằắặẳẵ', 'a'),
    ('ÀÁẠẢÃĂẰẮẶẲẴÂẦẤẬẨẪ', 'A'),
    ('èéẹẻẽêềếệểễ', 'e'),
    ('ÈÉẸẺẼÊỀẾỆỂỄ', 'E'),
    ('òóọỏõôồốộổỗơờớợởỡ', 'o'),
    ('ÒÓỌỎÕÔỒỐỘỔỖƠỜỚỢỞỠ', 'O'),
    ('ìíịỉĩ', 'i'),
    ('ÌÍỊỈĨ', 'I'),
    ('ùúụủũưừứựửữ', 'u'),
    ('ƯỪỨỰỬỮÙÚỤỦŨ', 'U'),
    ('ỳýỵỷỹ', 'y'),
    ('ỲÝỴỶỸ', 'Y'),
    ('Đ', 'D'),
    ('đ', 'd'),
)

# Code point -> replacement letter, built once so that a call is a single
# str.translate pass instead of fourteen regex substitutions.
_VIETNAMESE_ACCENT_TABLE = {
    ord(accented): plain
    for letters, plain in _VIETNAMESE_ACCENT_GROUPS
    for accented in letters
}


def no_accent_vietnamese(s):
    """Return *s* with Vietnamese accented letters replaced by ASCII letters.

    The text is normalized to Unicode NFC before the lookup so that letters
    written with combining marks (for example ``ê`` followed by U+0303) are
    composed into the single code points the table knows about. Without
    that step such input keeps part or all of its accents.

    Characters that are not in the table are kept; as a consequence of the
    normalization they are returned in NFC form.

    Args:
        s: The text to convert.

    Returns:
        The converted string.
    """
    # Local import so the function works without touching the file's imports.
    import unicodedata

    return unicodedata.normalize('NFC', s).translate(_VIETNAMESE_ACCENT_TABLE)

if __name__ == '__main__':
    # Demo: run the converter over mixed-case, plain and upper-case samples.
    demo_phrases = [
        "Việt Nam Đất Nước Con Người",
        "Welcome to Vietnam !",
        "VIỆT NAM ĐẤT NƯỚC CON NGƯỜI",
    ]
    for phrase in demo_phrases:
        print(no_accent_vietnamese(phrase))

# Output
# Viet Nam Dat Nuoc Con Nguoi
# Welcome to Vietnam !
# VIET NAM DAT NUOC CON NGUOI

Hoặc có thể cài và sử dụng thư viện unidecode:

pip install unidecode

from unidecode import unidecode

# Demo: run unidecode over mixed-case, plain and upper-case samples.
demo_phrases = [
    "Việt Nam Đất Nước Con Người",
    "Welcome to Vietnam !",
    "VIỆT NAM ĐẤT NƯỚC CON NGƯỜI",
]
for phrase in demo_phrases:
    print(unidecode(phrase))

# Output
# Viet Nam Dat Nuoc Con Nguoi
# Welcome to Vietnam !
# VIET NAM DAT NUOC CON NGUOI

Cam on ban.

lacls · 2022-04-07T09:31:19Z

Error case:
no_accent_vietnamese("Nguyễn Võ Tấn Đạt")
Output: 'Nguyẽn Võ Tán Dạt'

maycuatroi1 · 2022-08-23T09:12:43Z

new method:

pip install unidecode

import unidecode
# Sample Vietnamese text with diacritics. The u'' prefix is accepted but
# redundant on Python 3, where every str literal is already Unicode.
accented_string = u'Việt Nam đất nước con người'
# On Python 2, accented_string is of type 'unicode'.

# Transliterate the sample with unidecode and print the result.
unaccented_string = unidecode.unidecode(accented_string)
print(unaccented_string)
# output: Viet Nam dat nuoc con nguoi

ejinguyen · 2023-01-03T07:40:17Z

mình cũng đang gặp 1 case như bên dưới:
"Phú Mỹ Hưng"
"Phú Mỹ Hưng"

2 từ trên trông giống nhau nhưng khi encode thì không giống nhau!
Danh sách ký tự nên mở rộng cho nhiều bảng mã khác!

phineas-pta · 2023-05-23T15:42:18Z

mình có làm 1 phiên bản khác hoàn thiện hơn và xử lý được các trường hợp không thành công ở trên:

https://gist.github.com/phineas-pta/05cad38a29fea000ab6d9e13a6f7e623

	# s1 lists the accented Vietnamese letters; s0 holds, at the same index,
	# the plain ASCII letter that replaces each of them.
	s1 = u'ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚÝàáâãèéêìíòóôõùúýĂăĐđĨĩŨũƠơƯưẠạẢảẤấẦầẨẩẪẫẬậẮắẰằẲẳẴẵẶặẸẹẺẻẼẽẾếỀềỂểỄễỆệỈỉỊịỌọỎỏỐốỒồỔổỖỗỘộỚớỜờỞởỠỡỢợỤụỦủỨứỪừỬửỮữỰựỲỳỴỵỶỷỸỹ'
	s0 = u'AAAAEEEIIOOOOUUYaaaaeeeiioooouuyAaDdIiUuOoUuAaAaAaAaAaAaAaAaAaAaAaAaEeEeEeEeEeEeEeEeIiIiOoOoOoOoOoOoOoOoOoOoOoOoUuUuUuUuUuUuUuYyYyYyYy'

	# Code point -> replacement letter, built once so each character is a
	# constant-time lookup instead of a linear search through s1.
	_ACCENT_TABLE = {ord(accented): plain for accented, plain in zip(s1, s0)}


	def remove_accents(input_str):
		"""Return *input_str* with the letters listed in s1 replaced by ASCII.

		The text is normalized to Unicode NFC first so that letters written
		with combining marks are composed into the single code points found
		in s1. Characters that are not in s1 are kept and, as a consequence
		of the normalization, returned in NFC form.

		The function only returns the result; it prints nothing.

		Args:
			input_str: The text to convert.

		Returns:
			The converted string.
		"""
		# Local import so the function works without touching the file's imports.
		import unicodedata

		return unicodedata.normalize('NFC', input_str).translate(_ACCENT_TABLE)

J2TEAM/remove_accents.py

trieuhaivo commented Jul 23, 2019

Uh oh!

tmhung-nt commented Dec 21, 2019

Uh oh!

vietvudanh commented Feb 3, 2020

Uh oh!

truong0vanchien commented Aug 6, 2021

Uh oh!

truong0vanchien commented Aug 6, 2021

Uh oh!

lacls commented Apr 7, 2022

Uh oh!

maycuatroi1 commented Aug 23, 2022

Uh oh!

ejinguyen commented Jan 3, 2023

Uh oh!

phineas-pta commented May 23, 2023

Uh oh!