Skip to content

Instantly share code, notes, and snippets.

@feerrenrut
Last active February 7, 2018 05:30
Show Gist options
  • Save feerrenrut/f66ac4b145ee0d435e557d3361b92440 to your computer and use it in GitHub Desktop.
Save feerrenrut/f66ac4b145ee0d435e557d3361b92440 to your computer and use it in GitHub Desktop.
Hacky script to test various approaches to this discussion: https://github.com/nvaccess/nvda/pull/7629#discussion_r159367500
import bisect
import unicodeScriptData
# Table of script ranges. The lookup functions in this file read item 0 of an entry as the
# range start, item 1 as the range end and item 2 as the script name.
scriptCode = unicodeScriptData.scriptRanges
# The end value of every range, in table order, so that bisect can search the end values directly.
unicodeScriptRangeEnd = [scriptRange[1] for scriptRange in scriptCode]
def withBisect(chr):
    """Look up the script name for a character using the standard library bisect module.

    The search relies on these assumptions about the range table:
     - ranges do not overlap,
     - the start and end values of a range both belong to that range,
     - there may be gaps between ranges,
     - unicodeScriptRangeEnd is in ascending order (required by bisect).

    @param chr: character for which a script should be found
    @type chr: string
    @return: "Number" for ASCII digits, "FullWidthNumber" for full width digits,
        otherwise item 2 of the range entry containing the character,
        or None when no range contains it.
    """
    codePoint = ord(chr)
    # Digits are reported separately so that numbers can respect the preferred language
    # setting. Full width digits are in the Common category but indicate a Japanese
    # language context.
    if 0x30 <= codePoint <= 0x39:
        return "Number"
    if 0xff10 <= codePoint <= 0xff19:
        return "FullWidthNumber"
    # Position of the first range whose end value is >= codePoint.
    position = bisect.bisect_left(unicodeScriptRangeEnd, codePoint)
    if position == len(unicodeScriptRangeEnd):
        # codePoint is larger than every range end value, so no range can contain it.
        return None
    entry = scriptCode[position]
    # entry is the first range ending at or after codePoint; it only matches when it also
    # starts at or before codePoint, otherwise codePoint sits in a gap before this range.
    if entry[0] > codePoint:
        return None
    return entry[2]
def customBinarySearch(chr):
    """Look up the script name for a character with a hand written binary search over scriptCode.

    @param chr: character for which a script should be found
    @type chr: string
    @return: "Number" for ASCII digits, "FullWidthNumber" for full width digits,
        otherwise item 2 of the range entry containing the character,
        or None when no range contains it.
    """
    low, high = 0, len(scriptCode) - 1
    codePoint = ord(chr)
    # Digits are reported separately so that numbers can respect the preferred language
    # setting. Full width digits are in the Common category but indicate a Japanese
    # language context.
    if 0x30 <= codePoint <= 0x39:
        return "Number"
    if 0xff10 <= codePoint <= 0xff19:
        return "FullWidthNumber"
    while low <= high:
        middle = (low + high) // 2
        entry = scriptCode[middle]
        if codePoint < entry[0]:
            # Before the start of this range: continue in the lower half.
            high = middle - 1
        elif codePoint > entry[1]:
            # After the end of this range: continue in the upper half.
            low = middle + 1
        else:
            # start <= codePoint <= end, this is the containing range.
            return entry[2]
    # The search window is empty: no range contains codePoint.
    return None
# Bounds of the code point range exercised by the benchmark (start inclusive).
start = scriptCode[0][0]
# End value of the last range in the table (917631 according to the original author's note).
end = scriptCode[-1][1]
# Setting start to 0 instead would also cover the code points before the first range.
# The table's upper bound is replaced here: unichr() can reject values over 65535
# (0xFFFF), depending on how Python was built.
end = 0xFFFF
def unicode_literal(n):
    """Return the UTF-8 encoded form of the character with code point n.

    This goes through a "\\UXXXXXXXX" escape sequence rather than unichr(), so it is not
    limited to code points that unichr() accepts.

    Only the escape sequence itself is decoded. Previously the text being decoded was
    wrapped as u'...', and because the unicode-escape codec does not evaluate Python
    literal syntax, the u' prefix and the closing quote ended up in the returned bytes.

    @param n: unicode code point
    @type n: int
    @return: the single character, encoded as UTF-8
    @rtype: byte string
    """
    escapeSequence = "\\U%08X" % n
    # encode('ascii') makes sure a byte string is handed to the unicode-escape codec,
    # whichever string type the literal above produced.
    character = escapeSequence.encode('ascii').decode('unicode-escape')
    return character.encode('utf-8')
def doAllChars(f):
    """Call f once with each character whose code point is in the range start (inclusive)
    to end (exclusive).

    Code points that unichr() can not turn into a single unicode character are skipped.
    Previously a failed conversion was silently swallowed and f was still called, either
    with the character left over from the previous iteration or, on the very first
    iteration, with an unbound name.

    @param f: callable taking a single character; its return value is ignored
    @return: the number of code points that were skipped
    @rtype: int
    """
    numberOfExceptions = 0
    for c in xrange(start, end):
        try:
            s = unichr(c)
        except (ValueError, OverflowError):
            # This value can not be converted to a single unicode character.
            numberOfExceptions += 1
            continue
        f(s)
    return numberOfExceptions
import timeit
def measureIt():
    """Time both lookup implementations over the benchmark range and print the results.

    Each implementation is run through doAllChars a fixed number of times; the value
    printed per implementation is the total reported by timeit for all of those runs.
    """
    iterations = 100
    valueCount = end - start

    def secondsFor(search):
        # Total time for `iterations` complete passes of doAllChars using `search`.
        return timeit.Timer(lambda: doAllChars(search)).timeit(number=iterations)

    print("using {} iterations".format(iterations))
    print("testing over range {}-{}, a total of {} values".format(start, end, valueCount))
    print("withBisect: %s" % secondsFor(withBisect))
    print("customBinarySearch: %s" % secondsFor(customBinarySearch))
def doIt():
    """Run each lookup implementation once over the benchmark range, without timing."""
    for search in (withBisect, customBinarySearch):
        doAllChars(search)
# Script entry: the timing comparison runs as soon as this module is executed.
# Uncomment the doIt() call for a single untimed pass over both implementations.
#doIt()
measureIt()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment