Last active
November 13, 2019 05:55
-
-
Save cengizhancaliskan/d94b48b475d78a4ee12f1499c3f0cc0c to your computer and use it in GitHub Desktop.
Python: detect Cyrillic or Chinese characters in a word, and filter Cyrillic/Chinese words out of a text
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reference: https://unicode-table.com/en/blocks/
# https://docs.oracle.com/cd/E29584_01/webhelp/mdex_basicDev/src/rbdv_chars_mapping.html
import re
def has_cyrillic(word):
    """Return True if *word* contains at least one Cyrillic-block character.

    The check covers the Unicode "Cyrillic" block U+0400-U+04FF
    (https://unicode-table.com/en/blocks/cyrillic/), which holds the letters
    used by Russian, Ukrainian, Bulgarian and other Cyrillic-script languages.

    Args:
        word: The string to inspect.

    Returns:
        bool: True when any character falls in U+0400-U+04FF, else False
        (including for the empty string).
    """
    return bool(re.search('[\u0400-\u04ff]', word))
def has_cyrillic2(word):
    """Return True if *word* contains a letter of the Russian Cyrillic alphabet.

    Unlike ``has_cyrillic`` this matches only the basic alphabet letters, not
    the whole Unicode Cyrillic block, so letters specific to other languages
    (for example Ukrainian 'і') are not detected.

    Args:
        word: The string to inspect.

    Returns:
        bool: True when any character is one of а-я, А-Я, ё or Ё.
    """
    # 'ё' (U+0451) and 'Ё' (U+0401) sit outside the contiguous а-я / А-Я
    # code-point ranges, so they have to be listed explicitly.
    return bool(re.search('[а-яА-ЯёЁ]', word))
def has_chinese(word):
    """Return True if *word* contains at least one CJK unified ideograph.

    The check covers the Unicode "CJK Unified Ideographs" block
    U+4E00-U+9FFF
    (https://unicode-table.com/en/blocks/cjk-unified-ideographs/). These Han
    characters are shared by Chinese, Japanese, Korean and Vietnamese
    writing; phonetic scripts such as kana or Hangul lie outside this range
    and are not matched.

    Args:
        word: The string to inspect.

    Returns:
        bool: True when any character falls in U+4E00-U+9FFF, else False
        (including for the empty string).
    """
    return bool(re.search('[\u4e00-\u9fff]', word))
def get_non_cyrillic(text):
    """Return *text* with every word containing a Cyrillic character removed.

    The text is split on whitespace, words for which ``has_cyrillic`` is true
    are dropped, and the survivors are re-joined with single spaces (so runs
    of whitespace in the input are collapsed).

    Args:
        text: The string to filter.

    Returns:
        str: The remaining words joined by one space; '' if none remain.
    """
    return ' '.join(word for word in text.split() if not has_cyrillic(word))
def get_non_cyrillic2(text):
    """Return *text* with every word containing a Cyrillic letter removed.

    Same as ``get_non_cyrillic`` but uses ``has_cyrillic2`` as the test. The
    text is split on whitespace and the kept words are re-joined with single
    spaces (so runs of whitespace in the input are collapsed).

    Args:
        text: The string to filter.

    Returns:
        str: The remaining words joined by one space; '' if none remain.
    """
    return ' '.join(word for word in text.split() if not has_cyrillic2(word))
def get_non_chinese(text):
    """Return *text* with every word containing a CJK ideograph removed.

    The text is split on whitespace, words for which ``has_chinese`` is true
    are dropped (a single ideograph anywhere in the word is enough), and the
    survivors are re-joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        str: The remaining words joined by one space; '' if none remain.
    """
    return ' '.join(word for word in text.split() if not has_chinese(word))
# Example usage; each call returns the input with the matching words removed.
get_non_cyrillic("yatak белье")  # -> 'yatak'
get_non_cyrillic2("yatak белье")  # -> 'yatak'
get_non_chinese("馬 yata 马 k馬")  # -> 'yata' ('k馬' is dropped: it contains an ideograph)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment