Created
October 26, 2017 19:54
-
-
Save dbalduini/b9d68c2bea432d0343ffd85ce5585a12 to your computer and use it in GitHub Desktop.
UTF-8 (Unicode 256) table
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Creates a UTF-8 table, i.e. the first 256 Unicode code points.
# Tests whether the table can validate each UTF-8 char.
#
# Resources:
# https://unicode-table.com/en/#latin-1-supplement
# http://www.umich.edu/~umfandsf/other/ebooks/alice30.txt
# The first 256 Unicode code points (U+0000-U+00FF), built as three
# consecutive ranges and concatenated in code-point order, so that
# unicode_table[i] is the character whose code point is i.
try:
    _code_point_to_char = unichr  # Python 2: chr() returns a byte string there
except NameError:
    _code_point_to_char = chr  # Python 3: unichr() is gone, chr() returns text

control_chars = [_code_point_to_char(i) for i in range(0, 32)]
basic_latin = [_code_point_to_char(i) for i in range(32, 128)]
latin_1_supplement = [_code_point_to_char(i) for i in range(128, 256)]
unicode_table = control_chars + basic_latin + latin_1_supplement
# Sanity check that the three ranges add up to 256 entries.
assert len(unicode_table) == 256
def search_text(text):
    """Return the UTF-8 encoding of every character of *text* found in the table.

    Characters are looked up in the module-level ``unicode_table``; a
    character that is not in the table is silently dropped. The matching
    characters are encoded individually and concatenated in input order.

    Args:
        text: An iterable of single characters. The callers in this file
            pass unicode text.

    Returns:
        The concatenated UTF-8 bytes (``str`` on Python 2, ``bytes`` on
        Python 3). Empty input yields an empty byte string.
    """
    # One dict lookup per input character replaces a scan of the whole table
    # for each character. The table entries are distinct code points, so a
    # character matches at most once, exactly as in the nested scan.
    encoded_by_char = dict((ch, ch.encode('utf-8')) for ch in unicode_table)
    # b'' is the same object type as '' on Python 2 and is required on
    # Python 3, where encoded values are bytes and cannot be joined with ''.
    return b''.join(encoded_by_char[ch] for ch in text if ch in encoded_by_char)
# Test single word
word = u'unicórnio.'
result = search_text(word)
# result holds UTF-8 bytes; decode it so both values print as text through a
# single-argument print(), which behaves the same on Python 2 and Python 3.
print(u'%s %s' % (word, result.decode('utf-8')))
# Test alice text file
import codecs
import time

# alice30.txt is read from the current working directory and decoded as UTF-8.
with codecs.open("alice30.txt", "r", "utf-8") as alice30:
    text = alice30.read()
# Single-argument print() works as both a Python 2 statement and a
# Python 3 function call.
print(len(text))

# Time only the table search, not the file read.
start_time = time.time()
result = search_text(text)
print("%d --- %s seconds" % (len(result), time.time() - start_time))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment