Created
July 21, 2019 02:38
-
-
Save jtanx/bfa7489acc1af53f7937047908ca7704 to your computer and use it in GitHub Desktop.
Combiners parsing for combiners.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import os
import re
import sys
# Maps a Unicode canonical combining class to the C flag expression emitted
# for it in the generated table.
# See http://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values
#
# NOTE: the keys are the *decimal* class digits from UnicodeData.txt written
# as hex literals (class 230 is stored as 0x230), because run() parses that
# field with int(..., 16). Keep both sides in step if either is changed.
COMBIN = {
    0x000: '0',
    0x001: 'FF_UNICODE_Overstrike',
    0x202: 'FF_UNICODE_Below|FF_UNICODE_Touching',
    0x214: 'FF_UNICODE_Above|FF_UNICODE_Touching',
    0x216: 'FF_UNICODE_Above|FF_UNICODE_Right|FF_UNICODE_Touching',
    0x220: 'FF_UNICODE_Below',
    0x228: 'FF_UNICODE_Above|FF_UNICODE_Left',
    0x230: 'FF_UNICODE_Above',
    0x232: 'FF_UNICODE_Above|FF_UNICODE_Right',
    0x233: 'FF_UNICODE_Below|FF_UNICODE_Joins2',
    0x234: 'FF_UNICODE_Above|FF_UNICODE_Joins2',
    0x240: 'FF_UNICODE_Below',  # Apparently we don't have something for Iota_Subscript
}
def printit(count, uni, parsed):
    """Print one entry of the generated table to stdout.

    Args:
        count: Zero-based position of the entry in the table. Every 16th
            entry (count divisible by 16) gets a trailing comment.
        uni: Code point of the entry; shown in upper-case hex in the comment.
        parsed: Value to emit for the entry (formatted with ``str.format``,
            so both strings and integers are accepted).
    """
    if count % 16 == 0:
        # Label every 16th entry with its code point so the emitted table
        # can be navigated by eye.
        print(' {0},\t/* 0x{1:X} */'.format(parsed, uni))
    else:
        print(' {0},'.format(parsed))
def run(fn):
    """Print the combining-class table for U+0300..U+036F from a data file.

    Each line of the file is split on ';'. Field 0 is the code point in hex
    and field 3 is the canonical combining class. For every code point in
    the range, one table entry is printed via printit(); code points missing
    from the file *before* a listed one are emitted as 0.

    The gap filling assumes the file lists code points in ascending order.
    Gaps after the last listed code point in the range are not padded.

    Args:
        fn: Path of the ';'-separated data file (UnicodeData.txt layout).

    Raises:
        KeyError: If a combining class in the range has no entry in COMBIN.
        ValueError: If a code point or combining class field is not hex.
    """
    lo, hi = 0x300, 0x36F
    count = 0
    with open(fn, encoding='utf-8') as fp:
        for line in fp:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing on int('', 16).
                continue
            parts = line.split(';')
            uni = int(parts[0], 16)
            if not lo <= uni <= hi:
                continue
            # Fill any hole between the last emitted code point and this one.
            for off in range(lo + count, uni):
                printit(count, off, 0)
                count += 1
            # The class is parsed as hex on purpose: COMBIN is keyed by the
            # decimal digits written as hex literals (class 230 -> 0x230).
            combining = int(parts[3], 16)
            printit(count, uni, COMBIN[combining])
            count += 1
if __name__ == '__main__':
    # Reads UnicodeData.txt from the current working directory and writes
    # the generated table entries to stdout.
    run('UnicodeData.txt')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment