dbalduini · October 26, 2017 19:54
diff --git a/unicode-table.py b/unicode-table.py
 #!/usr/bin/env python 
 # -*- coding: UTF-8 -*-
 #
 # Builds a table of the first 256 Unicode code points (U+0000 to U+00FF).
 # Tests whether the characters of a text can be looked up in that table
 # and encoded as UTF-8.
 #
 # Resources:
 # https://unicode-table.com/en/#latin-1-supplement
 # http://www.umich.edu/~umfandsf/other/ebooks/alice30.txt

 # Python 2 names the code-point-to-text builtin ``unichr``; Python 3 only has
 # ``chr``. Resolve it once so the table builds on either interpreter.
 try:
 	_code_point_to_char = unichr
 except NameError:
 	_code_point_to_char = chr

 # The first 256 code points, kept as three separate lists:
 # code points 0-31 (control characters),
 control_chars = [ _code_point_to_char(i) for i in range(0, 32) ]
 # code points 32-127 (printable ASCII plus DEL),
 basic_latin = [ _code_point_to_char(i) for i in range(32, 128) ]
 # code points 128-255 (the Latin-1 Supplement range).
 latin_1_supplement = [ _code_point_to_char(i) for i in range(128, 256) ]

 # Concatenated in code-point order, so unicode_table[n] is code point n.
 unicode_table = control_chars + basic_latin + latin_1_supplement

 # Sanity check on the construction above: one entry per code point.
 assert len(unicode_table) == 256


 def search_text(text):
 	"""Return the UTF-8 encoding of the characters of *text* found in the table.

 	Characters that are not in ``unicode_table`` are dropped; the ones that
 	are kept stay in their original order.

 	:param text: the text to scan, one character at a time.
 	:returns: the kept characters, each encoded as UTF-8, concatenated into
 		a single byte string (empty when nothing matches).
 	"""
 	# Encode every table entry once up front. Looking a character up in this
 	# dict is a single hash probe, instead of comparing it against every
 	# table entry in turn for each input character.
 	# NOTE: this relies on the table holding distinct entries; a duplicated
 	# entry would no longer be emitted once per occurrence.
 	encoded_by_char = dict(
 		(unichar, unichar.encode('utf-8')) for unichar in unicode_table
 	)
 	utf8chars = [ encoded_by_char[char] for char in text if char in encoded_by_char ]
 	# b'' is the same type as '' on Python 2 and keeps the join valid on
 	# Python 3, where the encoded pieces are bytes.
 	return b''.join(utf8chars)

 # Test single word
 # The literal contains a non-ASCII character, so it relies on the
 # source-encoding declaration at the top of the file.
 word = u'unicórnio.'
 # search_text returns the UTF-8 encoded form of the characters it found
 # in unicode_table.
 result = search_text(word)
 # Print the input next to the encoded output for a visual comparison.
 print word, result

 # Test alice text file
 # One module per import line; both are only needed by this test.
 import codecs
 import time
 # Decode the file as UTF-8 while reading, so ``text`` holds characters
 # rather than raw bytes.
 with codecs.open("alice30.txt", "r", "utf-8") as alice30:
 	text = alice30.read()
 # The whole file is in memory now, so it is closed before the timed work.
 # A parenthesised single argument prints the same on Python 2 and is also
 # valid Python 3, matching the print call at the end of this test.
 print(len(text))
 start_time = time.time()
 result = search_text(text)
 # Report the encoded length and the wall-clock time spent in search_text.
 print("%d --- %s seconds" % (len(result), time.time() - start_time))
	#!/usr/bin/env python
	# -*- coding: UTF-8 -*-
	#
	# Builds a table of the first 256 Unicode code points (U+0000 to U+00FF).
	# Tests whether the characters of a text can be looked up in that table
	# and encoded as UTF-8.
	#
	# Resources:
	# https://unicode-table.com/en/#latin-1-supplement
	# http://www.umich.edu/~umfandsf/other/ebooks/alice30.txt

	# Python 2 names the code-point-to-text builtin ``unichr``; Python 3 only has
	# ``chr``. Resolve it once so the table builds on either interpreter.
	try:
		_code_point_to_char = unichr
	except NameError:
		_code_point_to_char = chr

	# The first 256 code points, kept as three separate lists:
	# code points 0-31 (control characters),
	control_chars = [ _code_point_to_char(i) for i in range(0, 32) ]
	# code points 32-127 (printable ASCII plus DEL),
	basic_latin = [ _code_point_to_char(i) for i in range(32, 128) ]
	# code points 128-255 (the Latin-1 Supplement range).
	latin_1_supplement = [ _code_point_to_char(i) for i in range(128, 256) ]

	# Concatenated in code-point order, so unicode_table[n] is code point n.
	unicode_table = control_chars + basic_latin + latin_1_supplement

	# Sanity check on the construction above: one entry per code point.
	assert len(unicode_table) == 256


	def search_text(text):
		"""Return the UTF-8 encoding of the characters of *text* found in the table.

		Characters that are not in ``unicode_table`` are dropped; the ones that
		are kept stay in their original order.

		:param text: the text to scan, one character at a time.
		:returns: the kept characters, each encoded as UTF-8, concatenated into
			a single byte string (empty when nothing matches).
		"""
		# Encode every table entry once up front. Looking a character up in this
		# dict is a single hash probe, instead of comparing it against every
		# table entry in turn for each input character.
		# NOTE: this relies on the table holding distinct entries; a duplicated
		# entry would no longer be emitted once per occurrence.
		encoded_by_char = dict(
			(unichar, unichar.encode('utf-8')) for unichar in unicode_table
		)
		utf8chars = [ encoded_by_char[char] for char in text if char in encoded_by_char ]
		# b'' is the same type as '' on Python 2 and keeps the join valid on
		# Python 3, where the encoded pieces are bytes.
		return b''.join(utf8chars)

	# Test single word
	# The literal contains a non-ASCII character, so it relies on the
	# source-encoding declaration at the top of the file.
	word = u'unicórnio.'
	# search_text is expected to return the UTF-8 encoded form of the
	# characters it found in unicode_table.
	result = search_text(word)
	# Print the input next to the encoded output for a visual comparison.
	print word, result

	# Test alice text file
	# One module per import line; both are only needed by this test.
	import codecs
	import time
	# Decode the file as UTF-8 while reading, so ``text`` holds characters
	# rather than raw bytes. The body of the ``with`` must be indented.
	with codecs.open("alice30.txt", "r", "utf-8") as alice30:
		text = alice30.read()
	# The whole file is in memory now, so it is closed before the timed work.
	# A parenthesised single argument prints the same on Python 2 and is also
	# valid Python 3, matching the print call at the end of this test.
	print(len(text))
	start_time = time.time()
	result = search_text(text)
	# Report the encoded length and the wall-clock time spent in search_text.
	print("%d --- %s seconds" % (len(result), time.time() - start_time))