Skip to content

Instantly share code, notes, and snippets.

@dbalduini
Created October 26, 2017 19:54
Show Gist options
  • Save dbalduini/b9d68c2bea432d0343ffd85ce5585a12 to your computer and use it in GitHub Desktop.
Save dbalduini/b9d68c2bea432d0343ffd85ce5585a12 to your computer and use it in GitHub Desktop.
UTF-8 (Unicode 256) table
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
#
# Creates a UTF-8 table, i.e. a table of the first 256 Unicode code points
# (U+0000 to U+00FF).
# Tests whether each character of a text can be found in the table and
# encoded as UTF-8.
#
# Resources:
# https://unicode-table.com/en/#latin-1-supplement
# http://www.umich.edu/~umfandsf/other/ebooks/alice30.txt
control_chars = [ unichr(i) for i in xrange(0, 32) ]
basic_latin = [ unichr(i) for i in xrange(32, 128) ]
latin_1_supplement = [ unichr(i) for i in xrange(128, 256) ]
unicode_table = control_chars + basic_latin + latin_1_supplement
assert len(unicode_table) == 256
def search_text(text):
    """Return the UTF-8 bytes of every character of *text* found in the table.

    Each character of ``text`` is looked up in the module-level
    ``unicode_table``; characters that are not in the table are dropped and
    the remaining ones keep their original order.

    Args:
        text: the unicode string (or iterable of characters) to filter.

    Returns:
        A byte string holding the concatenated UTF-8 encoding of the
        characters that were found in the table.
    """
    # Map each table entry to its UTF-8 encoding once, so every character of
    # the text costs a single hash lookup instead of a scan of the full table.
    utf8_by_char = {unichar: unichar.encode('utf-8') for unichar in unicode_table}
    return b''.join(utf8_by_char[char] for char in text if char in utf8_by_char)
# Test single word
# The word contains one non-ASCII character (U+00F3), which falls in the
# Latin-1 Supplement range of the table.
word = u'unicórnio.'
result = search_text(word)
# Show the original unicode word next to the UTF-8 byte string built from it.
print word, result
# Test alice text file
import codecs, time
# Load the whole book as unicode; the file is only needed for the read.
with codecs.open("alice30.txt", mode="r", encoding="utf-8") as alice30:
    text = alice30.read()
print(len(text))

# Time a single pass of search_text over the full text.
start_time = time.time()
result = search_text(text)
elapsed = time.time() - start_time
print("%d --- %s seconds" % (len(result), elapsed))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment