Last active
April 22, 2020 05:57
-
-
Save ceshine/1f8cd81ce34d89f1429d0928c28d97e4 to your computer and use it in GitHub Desktop.
A Simple CJK Language Detector
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
def cjk_detect(texts):
    """Guess whether a string is Korean, Japanese, or Chinese.

    The checks run in a fixed order and the first match wins:

    1. any Hangul syllable (U+AC00-U+D7A3)            -> "ko"
    2. any hiragana/katakana character (U+3040-U+30FF,
       excluding the middle dot U+30FB)               -> "ja"
    3. any CJK unified ideograph (U+4E00-U+9FFF)      -> "zh"

    Args:
        texts: The string to inspect.

    Returns:
        "ko", "ja", "zh", or None when no CJK character is found.

    Limitations:
        This is a presence heuristic, not a statistical classifier.
        Japanese written only with kanji is reported as "zh", and Chinese
        text that contains any kana (e.g. a decorative hiragana "no") is
        reported as "ja".
    """
    # korean: Hangul syllables are checked first so that Korean text
    # containing a few Chinese characters is still reported as Korean.
    if re.search(r"[\uac00-\ud7a3]", texts):
        return "ko"
    # japanese: hiragana + katakana. U+30FB (KATAKANA MIDDLE DOT) is left
    # out on purpose: it is punctuation that Chinese text also uses as a
    # name separator, so on its own it is not evidence of Japanese.
    if re.search(r"[\u3040-\u30fa\u30fc-\u30ff]", texts):
        return "ja"
    # chinese: CJK unified ideographs, reached only when no kana or Hangul
    # was found above.
    if re.search(r"[\u4e00-\u9fff]", texts):
        return "zh"
    return None
def test_cjk_detect():
    """Check cjk_detect against samples it is expected to classify correctly."""
    samples = [
        # Pure English
        ("Is Obstruction an Impeachable Offense? History Says Yes", None),
        # Pure French
        (
            "Damian Lillard a réussi un nouveau shoot de la victoire"
            " au buzzer à très longue distance",
            None,
        ),
        # Simplified Chinese
        (
            "2009年,波音公司(Boeing)在查尔斯顿附近的新厂破土动工时,曾宣扬这里是最先进的制造中心"
            ",将制造一款世界上最先进的飞机。但在接下来的十年里,这家生产787梦想客机的工厂一直受到做"
            "工粗糙和监管不力的困扰,危及航空安全。",
            "zh",
        ),
        # Traditional Chinese
        ("北查爾斯頓工廠的安全漏洞已經引起了航空公司和監管機構的密切關注。", "zh"),
        # Japanese
        ("日産自動車は24日、2019年3月期の連結業績予想を下方修正した。", "ja"),
        # Korean
        ("투서로 뜨고 투서에 지나", "ko"),
        # Korean with a Chinese character
        ("北 외무성 간부 총살설 주민들 사이서 확산…하노이 회담 실패 때문", "ko"),
    ]
    # Cases run in order; the first mismatch raises AssertionError.
    for sentence, expected_lang in samples:
        detected = cjk_detect(sentence)
        if expected_lang is None:
            assert detected is None
        else:
            assert detected == expected_lang
def print_incorrect_cases():
    """Print hard inputs for cjk_detect next to the expected language.

    Each line shows the text, the expected label, and whatever
    cjk_detect actually returns for it.
    """
    tricky_inputs = (
        # Japanese
        ("日産自動車、営業益45%減 前期下方修正", "expected: ja actual:"),
        # Traditional Chinese with Japanese hiragana
        ("健康の油切 好吃の涼麵", "expected: zh actual:"),
        # Traditional Chinese with Japanese katakana punctuation
        ("鐵腕・都鐸王朝(五):文藝復興最懂穿搭的高富帥——亨利八世", "expected: zh actual:"),
    )
    for texts, label in tricky_inputs:
        print(texts, label, cjk_detect(texts))
if __name__ == "__main__":
    # Run the assertion-based checks first, then print the report of
    # difficult inputs; the order matches the original script.
    for step in (test_cjk_detect, print_incorrect_cases):
        step()
Thanks a lot! I used part of your solution here https://mauricio-271700.appspot.com/ https://github.com/maolopez/ut_anagramma
You're welcome! Thanks for letting me know.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks a lot! I used part of your solution here https://mauricio-271700.appspot.com/ https://github.com/maolopez/ut_anagramma