Skip to content

Instantly share code, notes, and snippets.

@cengizhancaliskan
Last active November 13, 2019 05:55
Show Gist options
  • Save cengizhancaliskan/d94b48b475d78a4ee12f1499c3f0cc0c to your computer and use it in GitHub Desktop.
Save cengizhancaliskan/d94b48b475d78a4ee12f1499c3f0cc0c to your computer and use it in GitHub Desktop.
Python: has Cyrillic, has Chinese, get non-Chinese / non-Cyrillic words
# Reference: https://unicode-table.com/en/blocks/
# https://docs.oracle.com/cd/E29584_01/webhelp/mdex_basicDev/src/rbdv_chars_mapping.html
import re
def has_cyrillic(word):
    """Report whether ``word`` contains a character from the Cyrillic block.

    The check covers the Unicode range U+0400-U+04FF
    (https://unicode-table.com/en/blocks/cyrillic/), which serves
    languages such as Russian, Ukrainian and Bulgarian.
    """
    match = re.search('[\u0400-\u04ff]', word)
    # re.search yields None when nothing in the range is present.
    return match is not None
def has_cyrillic2(word):
    """Return True if ``word`` contains a letter of the Russian Cyrillic alphabet.

    The ranges ``а-я`` and ``А-Я`` span only U+0430-U+044F and
    U+0410-U+042F. The letters ``ё`` (U+0451) and ``Ё`` (U+0401) lie
    outside both ranges, so they are listed explicitly in the character
    class; otherwise a word such as "ёё" would be reported as non-Cyrillic.
    """
    return bool(re.search('[а-яА-ЯёЁ]', word))
def has_chinese(word):
    """Report whether ``word`` contains a CJK Unified Ideograph.

    Checks the Unicode range U+4E00-U+9FFF
    (https://unicode-table.com/en/blocks/cjk-unified-ideographs/). These
    ideographs appear in Chinese, Japanese, Korean and Vietnamese text.
    """
    if re.search('[\u4e00-\u9fff]', word):
        return True
    return False
def get_non_cyrillic(text):
    """Drop every whitespace-separated word that ``has_cyrillic`` flags.

    ``text`` is split on runs of whitespace; the surviving words are
    re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if has_cyrillic(token):
            continue
        kept.append(token)
    return ' '.join(kept)
def get_non_cyrillic2(text):
    """Drop every whitespace-separated word that ``has_cyrillic2`` flags.

    ``text`` is split on runs of whitespace; the surviving words are
    re-joined with single spaces.
    """
    survivors = (token for token in text.split() if not has_cyrillic2(token))
    return ' '.join(survivors)
def get_non_chinese(text):
    """Drop every whitespace-separated word that ``has_chinese`` flags.

    ``text`` is split on runs of whitespace; the surviving words are
    re-joined with single spaces.
    """
    tokens = text.split()
    return ' '.join(filter(lambda token: not has_chinese(token), tokens))
# Example
# Each call below discards its return value; the trailing comment shows
# what the call returns.
get_non_cyrillic("yatak белье")  # -> 'yatak'
get_non_cyrillic2("yatak белье")  # -> 'yatak'
get_non_chinese("馬 yata 马 k馬")  # -> 'yata' ("k馬" is dropped: it contains an ideograph)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment