tkrajina · April 27, 2014 05:33
diff --git a/extract_unicode_words.py b/extract_unicode_words.py
 # -*- coding: utf-8 -*-

 import unicodedata as mod_unicodedata

 # Character tables used as the default alphabet of extract_unicode_words().
 #
 # Only the Basic Multilingual Plane (code points 0..65535) is scanned, and only
 # characters whose general category is "Lu" (uppercase letter) or "Ll"
 # (lowercase letter) are kept. Letters in other categories (e.g. "Lo", "Lt",
 # "Lm") and all non-ASCII digits are therefore NOT part of these tables.
 try:
     _chr = unichr  # Python 2: chr() only covers 0..255 there
 except NameError:
     _chr = chr  # Python 3: chr() already returns text characters
 # Every BMP code point, in code point order.
 all_unicode = ''.join(_chr(i) for i in range(65536))
 # Look the category up once per character instead of once per comparison.
 UNICODE_LETTERS = ''.join(
     c for c in all_unicode if mod_unicodedata.category(c) in ('Lu', 'Ll')
 )
 UNICODE_LETTERS_AND_NUMBERS = '1234567890' + UNICODE_LETTERS

 def extract_unicode_words(text, allowed_chars=None):
     """Split *text* into maximal runs of allowed characters.

     Args:
         text: The string to scan. Anything that is not a text (unicode)
             string is decoded as UTF-8 first.
         allowed_chars: Characters that may appear inside a word. When
             omitted or empty, UNICODE_LETTERS_AND_NUMBERS is used.

     Returns:
         A list of words in order of appearance; every character that is
         not allowed acts as a separator. The list is empty when *text*
         contains no allowed character.

     Raises:
         UnicodeDecodeError: If a byte-string *text* is not valid UTF-8.
     """
     # type(u'') is `unicode` on Python 2 and `str` on Python 3. isinstance()
     # (rather than an exact class comparison) keeps subclasses of the text
     # type from being sent through .decode().
     if not isinstance(text, type(u'')):
         text = text.decode('utf-8')
     if not allowed_chars:
         allowed_chars = UNICODE_LETTERS_AND_NUMBERS
     # Hash lookup per character instead of rescanning the whole alphabet
     # string (thousands of characters by default) for every input character.
     allowed = frozenset(allowed_chars)
     words = []
     start = None  # index where the current word began; None between words
     for index, char in enumerate(text):
         if char in allowed:
             if start is None:
                 start = index
         elif start is not None:
             # Slice the finished word out in one go rather than growing a
             # string one character at a time.
             words.append(text[start:index])
             start = None
     if start is not None:
         # The text ended in the middle of a word.
         words.append(text[start:])
     return words

 if __name__ == '__main__':
     # Small demo run. The parenthesised call prints the same thing under the
     # Python 2 print statement and is valid as the Python 3 print function.
     print(extract_unicode_words('dsuifouio eršešđšĐŠĐŠ zuizu 374637 ČĆeuzu'))
	# -*- coding: utf-8 -*-

	import unicodedata as mod_unicodedata

	# Character tables used as the default alphabet of extract_unicode_words().
	#
	# Only the Basic Multilingual Plane (code points 0..65535) is scanned, and only
	# characters whose general category is "Lu" (uppercase letter) or "Ll"
	# (lowercase letter) are kept. Letters in other categories (e.g. "Lo", "Lt",
	# "Lm") and all non-ASCII digits are therefore NOT part of these tables.
	try:
	    _chr = unichr  # Python 2: chr() only covers 0..255 there
	except NameError:
	    _chr = chr  # Python 3: chr() already returns text characters
	# Every BMP code point, in code point order.
	all_unicode = ''.join(_chr(i) for i in range(65536))
	# Look the category up once per character instead of once per comparison.
	UNICODE_LETTERS = ''.join(
	    c for c in all_unicode if mod_unicodedata.category(c) in ('Lu', 'Ll')
	)
	UNICODE_LETTERS_AND_NUMBERS = '1234567890' + UNICODE_LETTERS

	def extract_unicode_words(text, allowed_chars=None):
	    """Split *text* into maximal runs of allowed characters.

	    Args:
	        text: The string to scan. Anything that is not a text (unicode)
	            string is decoded as UTF-8 first.
	        allowed_chars: Characters that may appear inside a word. When
	            omitted or empty, UNICODE_LETTERS_AND_NUMBERS is used.

	    Returns:
	        A list of words in order of appearance; every character that is
	        not allowed acts as a separator. The list is empty when *text*
	        contains no allowed character.

	    Raises:
	        UnicodeDecodeError: If a byte-string *text* is not valid UTF-8.
	    """
	    # type(u'') is `unicode` on Python 2 and `str` on Python 3. isinstance()
	    # (rather than an exact class comparison) keeps subclasses of the text
	    # type from being sent through .decode().
	    if not isinstance(text, type(u'')):
	        text = text.decode('utf-8')
	    if not allowed_chars:
	        allowed_chars = UNICODE_LETTERS_AND_NUMBERS
	    # Hash lookup per character instead of rescanning the whole alphabet
	    # string (thousands of characters by default) for every input character.
	    allowed = frozenset(allowed_chars)
	    words = []
	    start = None  # index where the current word began; None between words
	    for index, char in enumerate(text):
	        if char in allowed:
	            if start is None:
	                start = index
	        elif start is not None:
	            # Slice the finished word out in one go rather than growing a
	            # string one character at a time.
	            words.append(text[start:index])
	            start = None
	    if start is not None:
	        # The text ended in the middle of a word.
	        words.append(text[start:])
	    return words

	if __name__ == '__main__':
	    # Small demo run. The parenthesised call prints the same thing under the
	    # Python 2 print statement and is valid as the Python 3 print function.
	    print(extract_unicode_words('dsuifouio eršešđšĐŠĐŠ zuizu 374637 ČĆeuzu'))