Last active
August 20, 2020 12:54
-
-
Save marzer/e7649c763cf4522686f85812f06022aa to your computer and use it in GitHub Desktop.
Python script for enumerating and grouping Unicode character categories in ABNF notation.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# dependencies: | |
# pip install --upgrade requests | |
import os.path | |
import sys | |
import re | |
import requests | |
import traceback | |
def print_character_range(s, e, count):
    """Write one ABNF hex terminal (single value or value range) to stdout.

    Nothing is returned and no trailing newline is written, so successive
    calls build up one long alternation.

    s     -- first codepoint of the range
    e     -- last codepoint of the range; when equal to ``s`` (or 0) only
             ``s`` is written
    count -- number of ranges already written for the current rule; selects
             the separator placed in front of this one
    """
    # Every range after the first is preceded by the ABNF alternation operator.
    prefix = " / " if count > 0 else ""
    # Wrap onto a fresh tab-indented line before every fourth range
    # (this includes the very first one, since 0 % 4 == 0).
    if count % 4 == 0:
        prefix += "\n\t"

    if s != e and e != 0:
        terminal = "%x{:X}-{:X}".format(s, e)
    else:
        terminal = "%x{:X}".format(s)

    print(prefix + terminal, end='')
def print_abnf_for_categories(name, categories, codepoints):
    """Print an ABNF rule listing every codepoint that belongs to ``categories``.

    Consecutive matching codepoints are collapsed into ``%xAAAA-BBBB`` ranges.
    The rule is followed by a comment giving the total number of codepoints.

    name       -- ABNF rule name written on the left of the ``=``
    categories -- collection of category strings to include
    codepoints -- iterable of ``(codepoint, category)`` pairs; runs are only
                  merged when adjacent codepoints arrive one after another,
                  so the caller is expected to pass them in ascending order
    """
    print("\n; unicode codepoints from categories {}".format(', '.join(categories)))
    print("{} = ".format(name), end='')

    # run_first == -1 means "no run is currently open".
    run_first = -1
    run_last = -1
    ranges_written = 0
    total = 0

    for value, category in codepoints:
        if category not in categories:
            continue

        # A gap after an open run: emit that run and close it.
        if run_first != -1 and run_last != value - 1:
            print_character_range(run_first, run_last, ranges_written)
            total += run_last - run_first + 1
            ranges_written += 1
            run_first = -1

        # Open a new run if needed, then extend it to the current codepoint.
        if run_first == -1:
            run_first = value
        run_last = value

    # Emit whatever run is still open once the input is exhausted.
    if run_first != -1:
        print_character_range(run_first, run_last, ranges_written)
        total += run_last - run_first + 1

    print("\n\t; {} codepoints in total\n".format(total))
def _load_unicode_database(path):
    """Return the text of UnicodeData.txt, downloading it to ``path`` if absent.

    Raises ``requests.HTTPError`` when the download answers with an error
    status; in that case nothing is written to ``path``.
    """
    if os.path.exists(path):
        print("Reading unicode database file into memory")
        with open(path, 'r', encoding='utf-8') as database_file:
            return database_file.read()

    print("Couldn't find unicode database file, will download")
    response = requests.get(
        'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',
        timeout=1
    )
    # Without this check an HTTP error page would be saved as the database
    # and then reused on every later run.
    response.raise_for_status()
    database_text = response.text
    with open(path, 'w', encoding='utf-8') as database_file:
        database_file.write(database_text)
    return database_text


def _parse_codepoints(database_text):
    """Parse UnicodeData.txt text into a list of ``(codepoint, category)`` tuples.

    An entry whose name ends with ``, First>`` opens a range that is closed by
    the next entry; every codepoint in between (inclusive) is emitted with the
    closing entry's category. Lines that do not look like an entry are skipped.

    Raises ``Exception`` when a range is opened but not closed by the very
    next line, or is still open at the end of the input.
    """
    unterminated_range = 'Previous codepoint indicated the start of a range but the next one was null'
    entry_pattern = re.compile(r'^([0-9a-fA-F]+);(.+?);([a-zA-Z]+);')
    range_start = -1  # -1 means no range is currently open
    codepoints = []
    for entry in database_text.split('\n'):
        match = entry_pattern.search(entry)
        if match is None:
            if range_start > -1:
                raise Exception(unterminated_range)
            continue
        codepoint = int(match.group(1), 16)
        category = match.group(3)
        if range_start > -1:
            # This entry closes the open range: expand it to individual codepoints.
            codepoints.extend((cp, category) for cp in range(range_start, codepoint + 1))
            range_start = -1
        elif match.group(2).endswith(', First>'):
            range_start = codepoint
        else:
            codepoints.append((codepoint, category))
    # Catches a range opened on the final line of input that lacks a trailing newline.
    if range_start > -1:
        raise Exception(unterminated_range)
    return codepoints


def main():
    """Load the Unicode character database and print ABNF rules for it.

    The database is read from ``UnicodeData.txt`` next to the script and is
    downloaded there first when missing. Rules are printed for letters,
    numbers and combining marks.
    """
    # get unicode character database
    codepoint_file_path = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), 'UnicodeData.txt')
    codepoint_list = _load_unicode_database(codepoint_file_path)

    # parse the database file into codepoints
    codepoints = _parse_codepoints(codepoint_list)
    print("Parsed {} codepoints from unicode database file.".format(len(codepoints)))
    codepoints.sort(key=lambda r: r[0])

    # print categories
    print_abnf_for_categories("letters", ('Ll', 'Lm', 'Lo', 'Lt', 'Lu'), codepoints)
    print_abnf_for_categories("numbers", ('Nd', 'Nl'), codepoints)
    print_abnf_for_categories("combining_marks", ('Mn', 'Mc'), codepoints)
if __name__ == '__main__':
    # Script entry point: run main(), report any failure on stderr with a
    # one-line summary followed by the traceback, and exit with status 1.
    try:
        main()
    except Exception as err:
        summary = 'Fatal error: [{}] {}'.format(type(err).__name__, str(err))
        print(summary, file=sys.stderr)
        traceback.print_exc(file=sys.stderr)
        sys.exit(1)
    else:
        sys.exit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment