feerrenrut · February 7, 2018 05:30
diff --git a/testSpeedUnicodeCharacterLookup.py b/testSpeedUnicodeCharacterLookup.py
 import bisect

 import unicodeScriptData

 # Script range table from the project data module. The lookups below read each entry's
 # index 0 as the range start, index 1 as the range end and index 2 as the value returned.
 scriptCode = unicodeScriptData.scriptRanges
 # End value (index 1) of every range, in table order; this is the list withBisect() bisects on.
 unicodeScriptRangeEnd = [ k[1] for k in scriptCode]


 def withBisect(chr):
 	"""Find the script for a character by bisecting the list of range end values.
 	Relies on the following assumptions about scriptCode:
 	- ranges are sorted and must not overlap
 	- range start and end values are both included in the range
 	- there may be gaps between ranges
 	@param chr: character for which a script should be found
 	@type chr: string
 	@return: "Number" or "FullWidthNumber" for digits, otherwise index 2 of the range
 		entry containing the character, or None when no range contains it
 	"""
 	codePoint = ord(chr)

 	# Number should respect preferred language setting, so digits get their own label.
 	if 0x30 <= codePoint <= 0x39:
 		return "Number"
 	# FullWidthNumber is in Common category, however, it indicates Japanese language context.
 	if 0xff10 <= codePoint <= 0xff19:
 		return "FullWidthNumber"

 	# bisect_left gives the first position whose range end is >= codePoint, i.e. the only
 	# range that could satisfy rangeStart <= codePoint <= rangeEnd.
 	position = bisect.bisect_left(unicodeScriptRangeEnd, codePoint)
 	if position == len(unicodeScriptRangeEnd):
 		# codePoint is larger than every range end, so no range can contain it.
 		return None

 	candidate = scriptCode[position]
 	if candidate[0] <= codePoint:
 		return candidate[2]
 	# codePoint falls in the gap before the candidate range starts.
 	return None

 def customBinarySearch(chr):
 	"""Find the script for a character with a hand-written binary search over scriptCode.
 	@param chr: character for which a script should be found
 	@type chr: string
 	@return: "Number" or "FullWidthNumber" for digits, otherwise index 2 of the range
 		entry containing the character, or None when no range contains it
 	"""
 	codePoint = ord(chr)

 	# Number should respect preferred language setting, so digits get their own label.
 	if 0x30 <= codePoint <= 0x39:
 		return "Number"
 	# FullWidthNumber is in Common category, however, it indicates Japanese language context.
 	if 0xff10 <= codePoint <= 0xff19:
 		return "FullWidthNumber"

 	# Classic closed-interval binary search: [low, high] always brackets the remaining entries.
 	low = 0
 	high = len(scriptCode) - 1
 	while low <= high:
 		middle = (low + high) // 2
 		entry = scriptCode[middle]
 		if codePoint < entry[0]:
 			# Before this range's start: keep searching the lower half.
 			high = middle - 1
 		elif codePoint > entry[1]:
 			# After this range's end: keep searching the upper half.
 			low = middle + 1
 		else:
 			return entry[2]
 	return None
 	

 # Bounds of the code point sweep performed by doAllChars(), which iterates xrange(start, end).
 start = scriptCode[0][0]
 end = scriptCode[len(scriptCode)-1][1] # gives: 917631
 #start = 0
 # The table's last range end is overridden here, so the sweep stops just before 0xFFFF.
 end = 0x10000 -1 #values over 65535 don't work with unichr()

 def unicode_literal(n):
 	"""Return the UTF-8 encoded character for code point n.
 	Builds the character through a C{\\UXXXXXXXX} escape sequence rather than unichr().
 	@param n: the code point to convert
 	@type n: int
 	@return: the UTF-8 encoding of the single character with code point n
 	"""
 	# Only the escape sequence itself goes through the codec; wrapping it in u'...' would
 	# leave the literal u and quote characters in the decoded result.
 	escapeSequence = ("\\U%08X" % n).encode("ascii")
 	character = escapeSequence.decode("unicode-escape")
 	return character.encode("utf-8")
 
 def doAllChars(f):
 	"""Call f once for every code point in xrange(start, end).
 	@param f: lookup function taking a single unicode character
 	@type f: callable
 	@return: how many code points were skipped because unichr() could not convert them
 	@rtype: int
 	"""
 	numberOfExceptions = 0
 	for c in xrange(start, end):
 		# some values can not be converted back to a single unicode character.
 		try:
 			s = unichr(c)
 		except (ValueError, OverflowError):
 			# Skip the value entirely: falling through would call f with the previous
 			# character (or an unbound name on the first iteration).
 			numberOfExceptions += 1
 			continue
 		f(s)
 	return numberOfExceptions
 		
 import timeit
 def measureIt():
 	"""Time both lookup implementations over the start-end sweep and print the totals.
 	Each implementation is driven through doAllChars() the same number of times by timeit.
 	"""
 	iterations = 100

 	def secondsFor(lookup):
 		# Total wall time for `iterations` full sweeps using the given lookup function.
 		return timeit.timeit(lambda: doAllChars(lookup), number=iterations)

 	print("using {} iterations".format(iterations))
 	print("testing over range {}-{}, a total of {} values".format(start, end, end-start))
 	bisectSeconds = secondsFor(withBisect)
 	print("withBisect: %s"%bisectSeconds)
 	customSeconds = secondsFor(customBinarySearch)
 	print("customBinarySearch: %s"%customSeconds)
 	
 def doIt():
 	"""Run the sweep once through each lookup implementation, without timing."""
 	for lookup in (withBisect, customBinarySearch):
 		doAllChars(lookup)

 # Script entry: run the timing comparison. Swap the comment marker to run the untimed pass instead.
 #doIt()
 measureIt()
	import bisect

	import unicodeScriptData

	# Script range table from the project data module. The lookups below read each entry's
	# index 0 as the range start, index 1 as the range end and index 2 as the value returned.
	scriptCode = unicodeScriptData.scriptRanges
	# End value (index 1) of every range, in table order; this is the list withBisect() bisects on.
	unicodeScriptRangeEnd = [ k[1] for k in scriptCode]


	def withBisect(chr):
		"""Find the script for a character by bisecting the list of range end values.
		Relies on the following assumptions about scriptCode:
		- ranges are sorted and must not overlap
		- range start and end values are both included in the range
		- there may be gaps between ranges
		@param chr: character for which a script should be found
		@type chr: string
		@return: "Number" or "FullWidthNumber" for digits, otherwise index 2 of the range
			entry containing the character, or None when no range contains it
		"""
		codePoint = ord(chr)

		# Number should respect preferred language setting, so digits get their own label.
		if 0x30 <= codePoint <= 0x39:
			return "Number"
		# FullWidthNumber is in Common category, however, it indicates Japanese language context.
		if 0xff10 <= codePoint <= 0xff19:
			return "FullWidthNumber"

		# bisect_left gives the first position whose range end is >= codePoint, i.e. the only
		# range that could satisfy rangeStart <= codePoint <= rangeEnd.
		position = bisect.bisect_left(unicodeScriptRangeEnd, codePoint)
		if position == len(unicodeScriptRangeEnd):
			# codePoint is larger than every range end, so no range can contain it.
			return None

		candidate = scriptCode[position]
		if candidate[0] <= codePoint:
			return candidate[2]
		# codePoint falls in the gap before the candidate range starts.
		return None

	def customBinarySearch(chr):
		"""Find the script for a character with a hand-written binary search over scriptCode.
		@param chr: character for which a script should be found
		@type chr: string
		@return: "Number" or "FullWidthNumber" for digits, otherwise index 2 of the range
			entry containing the character, or None when no range contains it
		"""
		codePoint = ord(chr)

		# Number should respect preferred language setting, so digits get their own label.
		if 0x30 <= codePoint <= 0x39:
			return "Number"
		# FullWidthNumber is in Common category, however, it indicates Japanese language context.
		if 0xff10 <= codePoint <= 0xff19:
			return "FullWidthNumber"

		# Classic closed-interval binary search: [low, high] always brackets the remaining entries.
		low = 0
		high = len(scriptCode) - 1
		while low <= high:
			middle = (low + high) // 2
			entry = scriptCode[middle]
			if codePoint < entry[0]:
				# Before this range's start: keep searching the lower half.
				high = middle - 1
			elif codePoint > entry[1]:
				# After this range's end: keep searching the upper half.
				low = middle + 1
			else:
				return entry[2]
		return None


	# Bounds of the code point sweep performed by doAllChars(), which iterates xrange(start, end).
	start = scriptCode[0][0]
	end = scriptCode[len(scriptCode)-1][1] # gives: 917631
	#start = 0
	# The table's last range end is overridden here, so the sweep stops just before 0xFFFF.
	end = 0x10000 -1 #values over 65535 don't work with unichr()

	def unicode_literal(n):
		"""Return the UTF-8 encoded character for code point n.
		Builds the character through a C{\\UXXXXXXXX} escape sequence rather than unichr().
		@param n: the code point to convert
		@type n: int
		@return: the UTF-8 encoding of the single character with code point n
		"""
		# Only the escape sequence itself goes through the codec; wrapping it in u'...' would
		# leave the literal u and quote characters in the decoded result.
		escapeSequence = ("\\U%08X" % n).encode("ascii")
		character = escapeSequence.decode("unicode-escape")
		return character.encode("utf-8")

	def doAllChars(f):
		"""Call f once for every code point in xrange(start, end).
		@param f: lookup function taking a single unicode character
		@type f: callable
		@return: how many code points were skipped because unichr() could not convert them
		@rtype: int
		"""
		numberOfExceptions = 0
		for c in xrange(start, end):
			# some values can not be converted back to a single unicode character.
			try:
				s = unichr(c)
			except (ValueError, OverflowError):
				# Skip the value entirely: falling through would call f with the previous
				# character (or an unbound name on the first iteration).
				numberOfExceptions += 1
				continue
			f(s)
		return numberOfExceptions

	import timeit
	def measureIt():
		"""Time both lookup implementations over the start-end sweep and print the totals.
		Each implementation is driven through doAllChars() the same number of times by timeit.
		"""
		iterations = 100

		def secondsFor(lookup):
			# Total wall time for `iterations` full sweeps using the given lookup function.
			return timeit.timeit(lambda: doAllChars(lookup), number=iterations)

		print("using {} iterations".format(iterations))
		print("testing over range {}-{}, a total of {} values".format(start, end, end-start))
		bisectSeconds = secondsFor(withBisect)
		print("withBisect: %s"%bisectSeconds)
		customSeconds = secondsFor(customBinarySearch)
		print("customBinarySearch: %s"%customSeconds)

	def doIt():
		"""Run the sweep once through each lookup implementation, without timing."""
		for lookup in (withBisect, customBinarySearch):
			doAllChars(lookup)

	# Script entry: run the timing comparison. Swap the comment marker to run the untimed pass instead.
	#doIt()
	measureIt()