Skip to content

Instantly share code, notes, and snippets.

@tkrajina
Created April 27, 2014 05:33
Show Gist options
  • Save tkrajina/11338258 to your computer and use it in GitHub Desktop.
Save tkrajina/11338258 to your computer and use it in GitHub Desktop.
Extract unicode words from string
# -*- coding: utf-8 -*-
import unicodedata as mod_unicodedata
# Pick the text-character constructor and lazy range for the running
# interpreter: Python 2 spells them unichr/xrange, Python 3 chr/range.
try:
    _unichr, _xrange = unichr, xrange
except NameError:
    _unichr, _xrange = chr, range

# Every code point from U+0000 to U+FFFF, in order, as one text string.
all_unicode = ''.join(_unichr(i) for i in _xrange(65536))

# Characters whose Unicode general category is 'Lu' (uppercase letter) or
# 'Ll' (lowercase letter); the category is looked up once per character.
# NOTE(review): characters in the other letter categories (e.g. 'Lt', 'Lm',
# 'Lo') are excluded by this filter -- confirm that is intended.
UNICODE_LETTERS = ''.join(
    c for c in all_unicode if mod_unicodedata.category(c) in ('Lu', 'Ll')
)

# ASCII digits followed by the letters above.
UNICODE_LETTERS_AND_NUMBERS = '1234567890' + UNICODE_LETTERS
def extract_unicode_words(text, allowed_chars=None):
    """Split *text* into maximal runs of allowed characters.

    Args:
        text: Text to scan. A byte string is decoded as UTF-8 first; any
            other value is iterated character by character as-is.
        allowed_chars: Collection of characters that may be part of a word.
            When omitted or empty, UNICODE_LETTERS_AND_NUMBERS is used.

    Returns:
        A list of the words found, in order of appearance. Every character
        not in ``allowed_chars`` acts as a separator and is dropped.

    Raises:
        UnicodeDecodeError: If *text* is a byte string that is not valid
            UTF-8.
    """
    # isinstance() instead of an exact class comparison: it also accepts
    # subclasses, and `bytes` exists on Python 2 (alias of str) and Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    if not allowed_chars:
        allowed_chars = UNICODE_LETTERS_AND_NUMBERS
    # One-time conversion gives O(1) membership tests instead of a linear
    # scan of the allowed characters for every character of the text.
    if not isinstance(allowed_chars, (set, frozenset)):
        allowed_chars = frozenset(allowed_chars)

    words = []
    current = []  # characters of the word being accumulated
    for c in text:
        if c in allowed_chars:
            current.append(c)
        elif current:
            # A separator ends the word in progress.
            words.append(''.join(current))
            current = []
    # Flush a word that runs up to the end of the text.
    if current:
        words.append(''.join(current))
    return words
if __name__ == '__main__':
    # Demo run. The parenthesized single-argument form of print behaves the
    # same as the print statement on Python 2 and is valid on Python 3.
    print(extract_unicode_words('dsuifouio eršešđšĐŠĐŠ zuizu 374637 ČĆeuzu'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment