Created
April 27, 2014 05:33
-
-
Save tkrajina/11338258 to your computer and use it in GitHub Desktop.
Extract unicode words from string
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import unicodedata as mod_unicodedata | |
# Python 2 spells these builtins ``unichr``/``xrange``; Python 3 only has
# ``chr``/``range``. Resolve them once so the tables below build on either.
try:
    _to_char, _code_points = unichr, xrange
except NameError:
    _to_char, _code_points = chr, range

# One string holding every code point from U+0000 to U+FFFF, in order.
all_unicode = ''.join(_to_char(i) for i in _code_points(65536))

# Characters whose Unicode general category is 'Lu' (uppercase letter) or
# 'Ll' (lowercase letter).
# NOTE(review): letters in other categories (e.g. 'Lo', 'Lt', 'Lm') are not
# included, so text in caseless scripts is not matched -- confirm whether
# that is intended before widening the filter.
UNICODE_LETTERS = ''.join(
    c for c in all_unicode if mod_unicodedata.category(c) in ('Lu', 'Ll')
)

# ASCII digits followed by the letters above; used by extract_unicode_words()
# when the caller does not supply its own allowed_chars.
UNICODE_LETTERS_AND_NUMBERS = '1234567890' + UNICODE_LETTERS
def extract_unicode_words(text, allowed_chars=None):
    """Split *text* into maximal runs of allowed characters.

    Any character that is not in ``allowed_chars`` acts as a separator and is
    dropped; empty runs are never returned.

    Args:
        text: The text to scan. Byte strings are decoded as UTF-8 first.
        allowed_chars: Characters that may appear inside a word. When omitted
            or empty, ``UNICODE_LETTERS_AND_NUMBERS`` is used.

    Returns:
        A list of the words found, in order of appearance.

    Raises:
        UnicodeDecodeError: If *text* is a byte string that is not valid UTF-8.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this one check covers
    # byte input on both major versions and leaves text strings untouched.
    if isinstance(text, (bytes, bytearray)):
        text = text.decode('utf-8')
    if not allowed_chars:
        allowed_chars = UNICODE_LETTERS_AND_NUMBERS

    # The default alphabet holds a couple of thousand characters; a set makes
    # each membership test constant-time instead of a linear string scan.
    allowed = frozenset(allowed_chars)

    words = []
    current = []  # characters of the word being built
    for c in text:
        if c in allowed:
            current.append(c)
        elif current:
            words.append(''.join(current))
            current = []
    # Flush the last word when the text does not end with a separator.
    if current:
        words.append(''.join(current))
    return words
if __name__ == '__main__':
    # Demo run: print the words found in a sample containing accented letters
    # and digits. The call form of print with a single argument produces the
    # same output on Python 2 and Python 3.
    print(extract_unicode_words('dsuifouio eršešđšĐŠĐŠ zuizu 374637 ČĆeuzu'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment