Skip to content

Instantly share code, notes, and snippets.

@frankrolf
Created October 20, 2019 22:51
Show Gist options
  • Save frankrolf/a46a595b86d1d42019c4175e66f99194 to your computer and use it in GitHub Desktop.
Save frankrolf/a46a595b86d1d42019c4175e66f99194 to your computer and use it in GitHub Desktop.
Parse all languages as specified by TK Speakeasy
# This script is assumed to live in, and be run from, the directory
# https://github.com/typekit/speakeasy/tree/master/data
# (it scans the current working directory for language files).
import os
import re
def get_code_points(language_file):
    """Return the sorted code points listed in a language data file.

    The file is read line by line. The first line that is exactly ``---``
    and the four lines following it are skipped as a header; every
    remaining line is checked for one of two entry forms:

    * ``- !ruby/range A..B`` -- every integer from A to B, inclusive
    * ``- N``                -- the single integer N

    Lines matching neither form (including ``-`` entries without a
    leading number) are ignored.

    Args:
        language_file: Path of the file to parse.

    Returns:
        A sorted list of ints. Duplicates are kept if the file lists a
        code point more than once.

    Raises:
        ValueError: If the file contains no ``---`` line.
    """
    # ``\d+`` rather than ``\d*``: an empty capture would reach ``int('')``
    # and raise ValueError on entries such as ``- foo``.
    range_pattern = re.compile(r'- !ruby/range (\d+)\.\.(\d+)')
    single_pattern = re.compile(r'- (\d+)')

    with open(language_file, 'r', encoding='utf-8') as f:
        raw_data = f.read().splitlines()

    # Skip the ``---`` marker plus the four header lines after it.
    split_index = raw_data.index('---') + 5

    code_points = []
    for line in raw_data[split_index:]:
        # The range form must be tested first; it is the more specific one.
        range_match = range_pattern.match(line)
        if range_match:
            range_start = int(range_match.group(1))
            range_end = int(range_match.group(2)) + 1  # ranges are inclusive
            code_points.extend(range(range_start, range_end))
            continue

        single_match = single_pattern.match(line)
        if single_match:
            code_points.append(int(single_match.group(1)))

    return sorted(code_points)
# Map each extension-less file in the current directory to a string made of
# every character whose code point the file lists.
lang_dict = {}
for filename in os.listdir(os.curdir):
    # Directories and hidden files (e.g. ``.DS_Store``) also have an empty
    # extension but are not parseable data files; skip them.
    if filename.startswith('.') or not os.path.isfile(filename):
        continue
    # Compare by value: ``is ''`` tests object identity, which only works by
    # accident of string interning and triggers a SyntaxWarning on 3.8+.
    if os.path.splitext(filename)[-1] == '':
        lang_dict[filename] = ''.join(
            chr(code_point) for code_point in get_code_points(filename))

print(lang_dict)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment