|
import os |
|
import re |
|
|
|
|
|
def parse(data):
    """Yield one list of cell values for every table row found in *data*.

    *data* is the HTML source of a page as a string. Rows are matched as
    ``<tr>...</tr>`` and cells as ``<td>...</td>`` inside the first
    ``<tbody>`` ... ``</tbody>`` span. For each cell:

    - if the cell contains an ``href="..."`` attribute, the link target is
      emitted first;
    - then, if the cell contains a pair of apostrophes, the text between the
      first pair is emitted (the quoted tag); otherwise the raw cell content
      is emitted unchanged.

    Rows without any ``<td>`` cells produce nothing.
    """
    rowPattern = re.compile(r"<tr>.+?</tr>", re.DOTALL)
    cellPattern = re.compile(r"<td>(.+?)</td>", re.DOTALL)
    hrefPattern = re.compile(r'href="(.+?)"')
    # A tag must be enclosed by a *pair* of apostrophes. Splitting on "'" and
    # taking part 1 would also fire on a lone apostrophe inside a name
    # (e.g. "N'Ko" would be reduced to "Ko").
    quotedPattern = re.compile(r"'([^']*)'")

    start = data.find("<tbody>")
    end = data.find("</tbody>")
    # str.find() returns -1 when the marker is missing; used directly as a
    # slice bound that would silently chop the data. Fall back to the
    # corresponding end of the document instead.
    bodyStart = start + len("<tbody>") if start != -1 else 0
    bodyEnd = end if end != -1 else len(data)
    body = data[bodyStart:bodyEnd]

    for chunk in rowPattern.findall(body):
        parsedFields = []
        for field in cellPattern.findall(chunk):
            link = hrefPattern.search(field)
            if link is not None:
                parsedFields.append(link.group(1))
            quoted = quotedPattern.search(field)
            if quoted is not None:
                parsedFields.append(quoted.group(1))
            else:
                parsedFields.append(field)
        if parsedFields:
            yield parsedFields
|
|
|
|
|
def formatFeatures(data, baseURL):
    """Print the feature rows as the source of a ``features`` dict literal.

    *data* is an iterable of ``(link, tag, friendlyName)`` triples and
    *baseURL* is prepended to every link. Each printed entry maps a tag to a
    ``(friendlyName, URL)`` tuple. A row whose tag is ``'cv01'`` is expanded
    into one entry for each of ``cv01`` through ``cv99``, all sharing that
    row's name and URL.
    """
    print("features = {")
    print(" # tag, friendly name, documentation URL")
    for href, rowTag, name in data:
        # Presumably the page lists cv01..cv99 as a single row; every tag in
        # the range gets its own entry.
        expanded = (
            [f"cv{n:02d}" for n in range(1, 100)] if rowTag == 'cv01' else [rowTag]
        )
        for featureTag in expanded:
            print(f" {featureTag!r}: ({name!r}, {baseURL+href!r}),")
    print("}")
|
|
|
|
|
def formatScripts(data):
    """Print the script rows as the source of a ``scripts`` dict literal.

    *data* is an iterable of ``(friendlyName, tag)`` pairs. When several rows
    share a tag, their friendly names are joined with ``", "`` into a single
    entry, printed at the position of the tag's first occurrence.

    *data* is only read, never modified, so any iterable of pairs works.
    """
    print("scripts = {")
    print(" # tag, friendly name")
    # dicts preserve insertion order, so tags come out in first-seen order
    # without having to blank out duplicate rows in the caller's list.
    namesByTag = {}
    for friendlyName, tag in data:
        namesByTag.setdefault(tag, []).append(friendlyName)
    for tag, names in namesByTag.items():
        friendlyName = ", ".join(names)
        print(f" {tag!r}: {friendlyName!r},")
    print("}")
|
|
|
|
|
def formatLanguages(data):
    """Print the language rows as the source of a ``languages`` dict literal.

    Each row of *data* is ``(friendlyName, tag)`` or
    ``(friendlyName, tag, isoCodes)``, where ``isoCodes`` is a
    comma-separated string. Tags shorter than four characters are padded
    with trailing spaces. Each printed entry maps the tag to a tuple holding
    the friendly name followed by the individual, stripped ISO codes.

    Raises AssertionError when a padded tag is not exactly four characters
    long or when a row carries more than two fields after the name.
    """
    print("languages = {")
    print(" # tag, friendly name, ISO 639 IDs (if applicable)")
    for name, *rest in data:
        # ljust() leaves tags of four or more characters untouched.
        tag = rest[0].ljust(4)
        assert len(tag) == 4, tag
        if len(rest) > 1:
            assert len(rest) == 2
            codes = tuple(code.strip() for code in rest[1].split(","))
        else:
            codes = ()
        entry = (name, *codes)
        print(f" {tag!r}: {entry},")
    print("}")
|
|
|
|
|
# https://docs.microsoft.com/en-us/typography/opentype/spec/featurelist |
|
# https://docs.microsoft.com/en-us/typography/opentype/spec/scripttags |
|
# https://docs.microsoft.com/en-us/typography/opentype/spec/languagetags |
|
|
|
if __name__ == "__main__":
    # Script entry point: print Python source for the tag tables to stdout.
    #
    # With a command-line argument, that file is read as a single saved HTML
    # page and only its table is printed. Without arguments, the three spec
    # pages are downloaded and a header naming this script and the source
    # URLs is printed first, followed by an ``__all__`` declaration.
    import sys

    baseURL = "https://docs.microsoft.com/en-us/typography/opentype/spec/"
    if len(sys.argv) > 1:
        # Decode exactly like the download branch below, instead of relying
        # on the platform's default encoding.
        with open(sys.argv[1], encoding="utf-8", errors="replace") as f:
            html = f.read()
        pages = [html]
    else:
        import urllib.request

        pages = []
        print(f"# Generated by {os.path.basename(__file__)}")
        print("# Scraped from:")
        for page in ["featurelist", "scripttags", "languagetags"]:
            url = baseURL + page
            print(f"# {url}")
            with urllib.request.urlopen(url) as fp:
                html = fp.read().decode("utf-8", errors="replace")
            pages.append(html)
        print()
        print()
        print("__all__ = ['features', 'scripts', 'languages']")
        print()

    for html in pages:
        print()
        parsed = list(parse(html))
        # The page title decides which table formatter applies.
        if "<title>Registered features" in html:
            formatFeatures(parsed, baseURL)
        elif "<title>Script tags" in html:
            formatScripts(parsed)
        elif "<title>Language system tags" in html:
            formatLanguages(parsed)
        else:
            # Raised explicitly so an unrecognised page is still reported
            # when Python runs with -O (which strips assert statements).
            raise AssertionError("huh.")