Last active
February 7, 2018 05:30
-
-
Save feerrenrut/f66ac4b145ee0d435e557d3361b92440 to your computer and use it in GitHub Desktop.
Hacky script to test various approaches to this discussion: https://github.com/nvaccess/nvda/pull/7629#discussion_r159367500
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bisect

import unicodeScriptData

# Script range table. The lookup functions below index each entry as
# entry[0] = first code point, entry[1] = last code point, entry[2] = script name.
scriptCode = unicodeScriptData.scriptRanges
# The end value of every range, in table order, precomputed once so that
# bisect can search the end values directly.
unicodeScriptRangeEnd = [k[1] for k in scriptCode]
def withBisect(chr, ranges=None):
    """Find the script of a character by bisecting the list of range end values.

    Based on the following assumptions about the range table:
     - ranges are sorted in ascending order and must NOT overlap
     - range end and start values are included in that range
     - there may be gaps between ranges.
    Approach: look for the first range whose end value is greater than or equal to
    the code we are searching for. If the start value of that range is less than or
    equal to the code, then we have found the range.
    That is: startValue <= characterUnicodeCode <= endValue

    @param chr: character for which a script should be found
    @type chr: string (a single character)
    @param ranges: optional table of entries indexed as (start, end, name) to search
        instead of the module-level scriptCode. When given, the list of end values is
        rebuilt on every call, so this is meant for testing rather than for speed.
    @return: "Number" for ASCII digits, "FullWidthNumber" for full width digits,
        otherwise the name stored in the matching range, or None when no range
        contains the character.
    """
    characterUnicodeCode = ord(chr)
    # Number should respect preferred language setting
    # FullWidthNumber is in Common category, however, it indicates Japanese language context
    if 0x30 <= characterUnicodeCode <= 0x39:
        return "Number"
    elif 0xff10 <= characterUnicodeCode <= 0xff19:
        return "FullWidthNumber"

    if ranges is None:
        # Normal path: use the module-level table and its precomputed end values.
        ranges = scriptCode
        rangeEnds = unicodeScriptRangeEnd
    else:
        rangeEnds = [entry[1] for entry in ranges]

    index = bisect.bisect_left(rangeEnds, characterUnicodeCode)
    if index == len(rangeEnds):
        # There is no index such that `characterUnicodeCode <= ranges[index][1]`:
        # the code is larger than all of the range end values, so no range is found.
        return None

    # The range at index is the first one where `characterUnicodeCode <= rangeEnd`
    # is True; now ensure that `characterUnicodeCode >= rangeStart` is also True.
    candidateRange = ranges[index]
    if candidateRange[0] > characterUnicodeCode:
        # The code falls in the gap before the start of the candidate range.
        return None
    return candidateRange[2]
def customBinarySearch(chr, ranges=None):
    """Find the script of a character with a hand-written binary search over the ranges.

    @param chr: character for which a script should be found
    @type chr: string (a single character)
    @param ranges: optional table of entries indexed as (start, end, name) to search
        instead of the module-level scriptCode. Entries must be sorted in ascending
        order and must not overlap for the search to be valid.
    @return: "Number" for ASCII digits, "FullWidthNumber" for full width digits,
        otherwise the name stored in the matching range, or None when no range
        contains the character.
    """
    characterUnicodeCode = ord(chr)
    # Number should respect preferred language setting
    # FullWidthNumber is in Common category, however, it indicates Japanese language context
    if 0x30 <= characterUnicodeCode <= 0x39:
        return "Number"
    elif 0xff10 <= characterUnicodeCode <= 0xff19:
        return "FullWidthNumber"

    if ranges is None:
        ranges = scriptCode

    mStart = 0
    mEnd = len(ranges) - 1
    while mEnd >= mStart:
        midPoint = (mStart + mEnd) >> 1
        candidateRange = ranges[midPoint]
        if characterUnicodeCode < candidateRange[0]:
            # The code lies before this range: continue in the lower half.
            mEnd = midPoint - 1
        elif characterUnicodeCode > candidateRange[1]:
            # The code lies after this range: continue in the upper half.
            mStart = midPoint + 1
        else:
            # start <= code <= end: this is the containing range.
            return candidateRange[2]
    # The search window is empty, so the code falls in a gap or outside the table.
    return None
# Benchmark bounds: doAllChars iterates code points from start up to, but not including, end.
start = scriptCode[0][0]
# The table itself extends to scriptCode[-1][1] (917631), but values over 65535
# don't work with unichr(), so the benchmark stops at the last 16 bit value.
end = 0x10000 - 1
def unicode_literal(n):
    """Return the UTF-8 encoded text of a unicode literal for code point n.

    An 8 hex digit backslash-U escape for n is built inside the text u'...' and
    decoded with the unicode-escape codec, which also allows code points above
    0xFFFF to be produced.

    NOTE(review): only the escape is decoded, so the result still contains the
    surrounding u' and ' characters (unicode_literal(0x41) gives the bytes u'A').
    This matches the original behaviour; confirm whether the bare character was
    intended before using the result as input to ord().

    @param n: the unicode code point
    @type n: int
    @return: UTF-8 encoded byte string
    """
    # The backslash is written doubled so the source is valid on Python 3 as well;
    # in a Python 2 byte string both spellings produce the same two characters.
    escaped = "u'\\U%08X'" % n
    # Going through ASCII bytes first makes the decode step work on both
    # Python 2 (str) and Python 3 (bytes).
    decoded = escaped.encode('ascii').decode('unicode-escape')
    return decoded.encode('utf-8')
def doAllChars(f):
    """Call f once for each character in the module-level range start <= c < end.

    @param f: lookup function to exercise; it is called with a single unicode character
        and its result is discarded.
    @return: the number of code points that were skipped because they could not be
        converted to a character.
    @rtype: int
    """
    numberOfExceptions = 0
    for c in xrange(start, end):
        # Some values can not be converted back to a single unicode character.
        try:
            s = unichr(c)
        except (ValueError, OverflowError):
            # Skip this value. Without the continue, f would be called again with the
            # character left over from the previous iteration (or with an unbound
            # name if the very first conversion failed).
            numberOfExceptions += 1
            continue
        f(s)
    return numberOfExceptions
import timeit


def measureIt(n=100):
    """Time both lookup implementations over the whole test range and print the results.

    @param n: how many times timeit repeats the full pass over the range for
        each implementation.
    @type n: int
    """
    print("using {} iterations".format(n))
    print("testing over range {}-{}, a total of {} values".format(start, end, end-start))
    candidates = (
        ("withBisect", withBisect),
        ("customBinarySearch", customBinarySearch),
    )
    for label, candidate in candidates:
        # The lambda is invoked by timeit within this same iteration, so it
        # always sees the candidate it was created for.
        result = timeit.timeit(lambda: doAllChars(candidate), number=n)
        print("%s: %s" % (label, result))
def doIt():
    """Run one untimed pass over the test range with each lookup implementation."""
    doAllChars(withBisect)
    doAllChars(customBinarySearch)
if __name__ == "__main__":
    # Only run the benchmark when executed as a script, so that importing this
    # module does not start a long timing run.
    # Call doIt() here instead for a single untimed pass.
    measureIt()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment