Created
October 20, 2019 22:51
-
-
Save frankrolf/a46a595b86d1d42019c4175e66f99194 to your computer and use it in GitHub Desktop.
Parse all languages as specified by TK Speakeasy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script is assumed to live in the Speakeasy data directory: https://github.com/typekit/speakeasy/tree/master/data
import os | |
import re | |
def get_code_points(language_file):
    """Return the sorted code points listed in a language file.

    The file is read as text and split into lines. Parsing starts five
    lines after the first line that is exactly ``---``; from there on,
    every line beginning with ``-`` is examined:

    * ``- !ruby/range A..B`` contributes every integer from A to B,
      both ends included.
    * ``- N`` contributes the single integer N.

    Lines that begin with ``-`` but carry no decimal digits where a number
    is expected (for example ``- name``) are skipped instead of raising.

    Args:
        language_file: Path of the language file to parse.

    Returns:
        A sorted list of ints. Duplicates are kept.

    Raises:
        ValueError: If the file contains no ``---`` line.
    """
    # NOTE(review): UTF-8 is assumed (the usual YAML encoding) so the result
    # does not depend on the platform's locale -- confirm against the data.
    with open(language_file, 'r', encoding='utf-8') as f:
        raw_data = f.read().splitlines()

    # Skip the '---' marker plus the four lines that follow it.
    # NOTE(review): presumably those are header fields -- confirm.
    split_index = raw_data.index('---') + 5

    code_points = []
    for line in raw_data[split_index:]:
        if not line.startswith('-'):
            continue
        # `\d+` (not `\d*`): at least one digit is required, otherwise an
        # empty match would reach int('') and raise ValueError.
        rx_range = re.match(r'- !ruby/range (\d+)\.\.(\d+)', line)
        if rx_range:
            range_start = int(rx_range.group(1))
            range_end = int(rx_range.group(2)) + 1  # ranges are inclusive
            code_points.extend(range(range_start, range_end))
            continue
        rx_codepoint = re.match(r'- (\d+)', line)
        if rx_codepoint:
            code_points.append(int(rx_codepoint.group(1)))
    return sorted(code_points)
# Build a mapping of language file name -> string made of every character
# that file lists, then print it.
lang_dict = {}
for filename in os.listdir(os.curdir):
    extension = os.path.splitext(filename)[-1]
    # Language files are the extensionless regular files in this directory.
    # Compare by value/truthiness, never with `is`: string identity depends
    # on interning and raises a SyntaxWarning on Python 3.8+.
    if extension:
        continue
    # splitext() reports dotfiles (e.g. '.DS_Store') and directories as
    # extensionless too; neither can be parsed as a language file.
    if filename.startswith('.') or not os.path.isfile(filename):
        continue
    lang_dict[filename] = ''.join(
        chr(code_point) for code_point in get_code_points(filename))

print(lang_dict)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment