Last active
October 17, 2023 21:32
-
-
Save Mr0grog/70ec66c2ed0e7ee9a5d50406534dad46 to your computer and use it in GitHub Desktop.
Compare Unicode and WHATWG encoding mappings
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Quickie script for comparing legacy single-byte character encoding definitions | |
from the Unicode Consortium (found at https://unicode.org/Public/MAPPINGS/) and | |
the WHATWG (at https://encoding.spec.whatwg.org/#legacy-single-byte-encodings or | |
https://github.com/whatwg/encoding), since they differ slightly. | |
Typically, you'll want to download a copy of the Unicode mapping files via FTP: | |
ncftpget -R ftp.unicode.org . Public/MAPPINGS | |
mv MAPPINGS unicode | |
And the WHATWG mapping files via git: | |
git clone https://github.com/whatwg/encoding.git whatwg | |
Then you can compare, for example, windows-1255: | |
python compare.py unicode/VENDORS/MICSFT/WINDOWS/CP1255.TXT whatwg/index-windows-1255.txt | |
Which will output something like: | |
✘ Definitions for windows-1255 do not match! | |
Byte 129 (0x81): Unicode = point <UNDEFINED> / WHATWG = point 129 (0x0081) (control character) | |
Byte 138 (0x8a): Unicode = point <UNDEFINED> / WHATWG = point 138 (0x008a) (control character) | |
Byte 140 (0x8c): Unicode = point <UNDEFINED> / WHATWG = point 140 (0x008c) (control character) | |
Byte 141 (0x8d): Unicode = point <UNDEFINED> / WHATWG = point 141 (0x008d) (control character) | |
Byte 142 (0x8e): Unicode = point <UNDEFINED> / WHATWG = point 142 (0x008e) (control character) | |
Byte 143 (0x8f): Unicode = point <UNDEFINED> / WHATWG = point 143 (0x008f) (control character) | |
Byte 144 (0x90): Unicode = point <UNDEFINED> / WHATWG = point 144 (0x0090) (control character) | |
Byte 154 (0x9a): Unicode = point <UNDEFINED> / WHATWG = point 154 (0x009a) (control character) | |
Byte 156 (0x9c): Unicode = point <UNDEFINED> / WHATWG = point 156 (0x009c) (control character) | |
Byte 157 (0x9d): Unicode = point <UNDEFINED> / WHATWG = point 157 (0x009d) (control character) | |
Byte 158 (0x9e): Unicode = point <UNDEFINED> / WHATWG = point 158 (0x009e) (control character) | |
Byte 159 (0x9f): Unicode = point <UNDEFINED> / WHATWG = point 159 (0x009f) (control character) | |
Byte 202 (0xca): Unicode = point <UNDEFINED> / WHATWG = point 1466 (0x05ba) (HEBREW POINT HOLAM HASER FOR VAV) | |
""" | |
import re
import unicodedata
from typing import Dict, List, Optional, Tuple
# A parsed mapping table: byte value -> Unicode code point.
EncodingMap = Dict[int, int]

# Matches text that consists solely of whitespace and/or ASCII control
# characters (or nothing at all). Used to skip lines with no table data.
EMPTY_LINE = re.compile(r'^[\s\x00-\x1f]*$')
def is_control_character(point: int) -> bool:
    """
    Report whether ``point`` lies in the range U+0080 through U+009F.

    A stricter check would be ``unicodedata.category(chr(point)) == 'Cc'``,
    but the only control characters of interest here are the ones at the
    start of the Latin-1 Supplement section.
    """
    latin1_supplement_controls = range(0x80, 0xa0)
    return point in latin1_supplement_controls
class MappingParser:
    """
    Base class for parsing encoding mapping files.

    A mapping file is a table with one entry per line. Each entry consists of
    tab-separated fields -- a byte value followed by a code point -- and may
    carry additional fields and/or a ``#`` comment. Subclasses can override
    individual steps to account for format-specific differences.
    """

    def parse_file(self, path: str) -> EncodingMap:
        """
        Read the mapping file at ``path`` and parse its contents.

        Returns:
            A dict from byte value to code point (``None`` for entries that
            have no code point).

        Raises:
            SyntaxError: If a line cannot be parsed or fails validation.
        """
        # WHATWG-style files list the actual (non-ASCII) characters, so decode
        # explicitly rather than with the platform's default encoding.
        # NOTE(review): assumes mapping files are UTF-8 (or plain ASCII).
        with open(path, encoding='utf-8') as file:
            return self.parse(file.read(), path)

    def parse(self, text: str, filename: Optional[str] = None) -> EncodingMap:
        """
        Parse the text of a mapping file.

        Args:
            text: Full contents of the mapping file.
            filename: Name to report in error messages, if known.

        Returns:
            A dict from byte value to code point (``None`` for entries that
            have no code point).

        Raises:
            SyntaxError: If a line cannot be parsed or fails validation.
        """
        mapping = {}
        for fields, comment, number, line in self.each_table_line(text):
            try:
                byte_value, point, metadata = self.parse_line(fields, comment)
            except ValueError as error:
                raise SyntaxError(f'Error parsing mapping file: {error}', (filename, number, None, line)) from error
            self.validate_mapping(byte_value, point, metadata, number, line, filename)
            mapping[byte_value] = point
        return mapping

    def each_table_line(self, text: str):
        """
        Iterate over the lines of ``text`` that contain table data.

        Yields ``(fields, comment, number, line)`` tuples: the tab-separated
        fields found before the first ``#``, the text after that ``#`` (empty
        if there is none), the 1-based line number, and the raw line. Lines
        with nothing but whitespace/control characters before the ``#`` are
        skipped.
        """
        # NOTE: can't use splitlines() because some of the separators it supports
        # may be content on a line. WHATWG-style mapping files only list characters
        # above 0x7f and Unicode-style files do not list the actual character (just
        # the code point as a number), so line feeds are OK.
        for number, line in enumerate(text.split('\n'), start=1):
            data, _, comment = line.partition('#')
            if not EMPTY_LINE.match(data):
                fields = data.split('\t')
                yield fields, comment, number, line

    def parse_line(self, fields: List[str], comment: str) -> Tuple[int, Optional[int], str]:
        """
        Convert the fields of one table line into a mapping entry.

        Args:
            fields: Tab-separated fields; the first is the byte value (any
                base ``int(..., base=0)`` accepts), the second is the code
                point in hexadecimal or blank if there is none.
            comment: Text that followed ``#`` on the line.

        Returns:
            ``(byte_value, point, metadata)`` where ``point`` is ``None`` when
            the code point field is blank and ``metadata`` is the remaining
            fields plus the comment.

        Raises:
            ValueError: If a field is missing or is not a valid number.
        """
        # Report a short line as a ValueError so parse() can attach the file
        # name and line number instead of letting an IndexError escape.
        if len(fields) < 2:
            raise ValueError('expected at least two tab-separated fields (byte value and code point)')
        byte_value = int(fields[0], base=0)
        point = int(fields[1], base=16) if fields[1].strip() else None
        # Joined outside the f-string: a backslash inside an f-string
        # expression is a syntax error before Python 3.12.
        extra_fields = '\t'.join(fields[2:])
        metadata = f'{extra_fields} {comment}'.strip()
        return byte_value, point, metadata

    def validate_mapping(self, byte_value: int, point: Optional[int], metadata: str, line_number: int, raw_line: str, filename: Optional[str]) -> None:
        """
        Sanity-check one mapping entry against its descriptive text.

        An entry that maps to a control character must mention "control" in
        its metadata, and an entry without a code point must mention
        "undefined" (both case-insensitive).

        Raises:
            SyntaxError: If the metadata does not agree with the code point.
        """
        if point is not None:
            if is_control_character(point) and 'control' not in metadata.lower():
                raise SyntaxError(
                    f'Line maps byte to a control character (0x{point:02x}) but comment did not mention "control"',
                    (filename, line_number, None, raw_line)
                )
        elif 'undefined' not in metadata.lower():
            raise SyntaxError(
                'Line maps byte to an undefined point, but comment did not mention "undefined"',
                (filename, line_number, None, raw_line)
            )
class WhatwgMappingParser(MappingParser):
    """
    Parser for WHATWG-style index files.

    These files leave out the first 128 byte values, so the first column is
    an index counted from byte 128 rather than the byte value itself.
    """

    def parse(self, text: str, filename: str = None) -> EncodingMap:
        """
        Parse ``text`` and fill in the ASCII range that the file omits.
        """
        # WHATWG files omit the first 128 values, since they are always ASCII;
        # start from an identity mapping and let parsed entries override it.
        identity_mapping = dict(zip(range(128), range(128)))
        parsed_entries = super().parse(text, filename)
        return identity_mapping | parsed_entries

    def parse_line(self, fields: List[str], comment: str) -> Tuple[int, int, str]:
        """
        Parse a line as the base class does, then turn its index into the
        actual byte value.
        """
        index, point, metadata = super().parse_line(fields, comment)
        byte_value = index + 128
        return byte_value, point, metadata
class UnicodeMappingParser(MappingParser):
    """
    Parser for Unicode Consortium mapping files.

    Adds nothing to ``MappingParser``; the base class behavior is used as-is.
    """
def pretty_code_point(point: int) -> str:
    """
    Format a code point for display.

    The result shows the code point in decimal and hexadecimal, followed by a
    "(control character)" marker and/or the character's Unicode name where
    applicable. ``None`` (no code point) is rendered as ``<UNDEFINED>``.
    """
    if point is None:
        return '<UNDEFINED>'

    parts = [f'{point} (0x{point:04x})']
    if is_control_character(point):
        parts.append('(control character)')
    # Not every code point has a name; fall back to None instead of raising.
    name = unicodedata.name(chr(point), None)
    if name:
        parts.append(f'({name})')
    return ' '.join(parts)
def compare_mappings(name: str, unicode: EncodingMap, whatwg: EncodingMap, ignore_control_chars=False):
    """
    Print the differences between two mappings for the encoding ``name``.

    Every byte value from 0 through 255 is looked up in both mappings. Each
    byte whose code points differ is reported on its own line, preceded once
    by a header; if nothing differs, a single "matched" line is printed.

    Args:
        name: Encoding name used in the printed messages.
        unicode: Mapping parsed from the Unicode Consortium definition.
        whatwg: Mapping parsed from the WHATWG definition.
        ignore_control_chars: When true, a byte is not reported if each side
            is either missing/undefined or a control character.
    """
    def undefined_or_control(point):
        return point is None or is_control_character(point)

    found_difference = False
    for byte in range(256):
        unicode_point = unicode.get(byte)
        whatwg_point = whatwg.get(byte)

        if whatwg_point == unicode_point:
            continue
        if (
            ignore_control_chars
            and undefined_or_control(unicode_point)
            and undefined_or_control(whatwg_point)
        ):
            continue

        # Print the header only once, right before the first difference.
        if not found_difference:
            print(f'✘ Definitions for {name} do not match!')
            found_difference = True
        print(f' Byte {byte} (0x{byte:02x}): '
              f'Unicode = point {pretty_code_point(unicode_point)} / '
              f'WHATWG = point {pretty_code_point(whatwg_point)}')

    if not found_difference:
        print(f'✔︎ Matched: {name}')
def compare_encoding_files(name: str, unicode_path: str, whatwg_path: str, ignore_control_chars=False):
    """
    Parse a Unicode Consortium file and a WHATWG file and print how they differ.

    Args:
        name: Encoding name used in the printed report.
        unicode_path: Path to the Unicode Consortium mapping file.
        whatwg_path: Path to the WHATWG mapping file.
        ignore_control_chars: Passed through to ``compare_mappings``.
    """
    unicode_mapping = UnicodeMappingParser().parse_file(unicode_path)
    whatwg_mapping = WhatwgMappingParser().parse_file(whatwg_path)
    compare_mappings(
        name,
        unicode=unicode_mapping,
        whatwg=whatwg_mapping,
        ignore_control_chars=ignore_control_chars,
    )
def compare_chardetng(ignore_control_chars=False):
    """
    Compare a fixed list of encodings, one after another.

    The mapping files are looked up under ``unicode/`` and ``whatwg/``
    relative to the current working directory.
    """
    # (encoding name, Unicode Consortium file, WHATWG file)
    encodings = (
        ("IBM866", 'unicode/VENDORS/MICSFT/PC/CP866.TXT', 'whatwg/index-ibm866.txt'),
        ("ISO-8859-2", 'unicode/ISO8859/8859-2.TXT', 'whatwg/index-iso-8859-2.txt'),
        ("ISO-8859-4", 'unicode/ISO8859/8859-4.TXT', 'whatwg/index-iso-8859-4.txt'),
        ("ISO-8859-5", 'unicode/ISO8859/8859-5.TXT', 'whatwg/index-iso-8859-5.txt'),
        ("ISO-8859-6", 'unicode/ISO8859/8859-6.TXT', 'whatwg/index-iso-8859-6.txt'),
        ("ISO-8859-7", 'unicode/ISO8859/8859-7.TXT', 'whatwg/index-iso-8859-7.txt'),
        ("ISO-8859-8", 'unicode/ISO8859/8859-8.TXT', 'whatwg/index-iso-8859-8.txt'),
        ("ISO-8859-13", 'unicode/ISO8859/8859-13.TXT', 'whatwg/index-iso-8859-13.txt'),
        ("KOI8-U", 'unicode/VENDORS/MISC/KOI8-U.TXT', 'whatwg/index-koi8-u.txt'),
        ("windows-874", 'unicode/VENDORS/MICSFT/WINDOWS/CP874.TXT', 'whatwg/index-windows-874.txt'),
        ("windows-1250", 'unicode/VENDORS/MICSFT/WINDOWS/CP1250.TXT', 'whatwg/index-windows-1250.txt'),
        ("windows-1251", 'unicode/VENDORS/MICSFT/WINDOWS/CP1251.TXT', 'whatwg/index-windows-1251.txt'),
        ("windows-1252", 'unicode/VENDORS/MICSFT/WINDOWS/CP1252.TXT', 'whatwg/index-windows-1252.txt'),
        ("windows-1253", 'unicode/VENDORS/MICSFT/WINDOWS/CP1253.TXT', 'whatwg/index-windows-1253.txt'),
        ("windows-1254", 'unicode/VENDORS/MICSFT/WINDOWS/CP1254.TXT', 'whatwg/index-windows-1254.txt'),
        ("windows-1255", 'unicode/VENDORS/MICSFT/WINDOWS/CP1255.TXT', 'whatwg/index-windows-1255.txt'),
        ("windows-1256", 'unicode/VENDORS/MICSFT/WINDOWS/CP1256.TXT', 'whatwg/index-windows-1256.txt'),
        ("windows-1257", 'unicode/VENDORS/MICSFT/WINDOWS/CP1257.TXT', 'whatwg/index-windows-1257.txt'),
        ("windows-1258", 'unicode/VENDORS/MICSFT/WINDOWS/CP1258.TXT', 'whatwg/index-windows-1258.txt'),
    )
    for encoding_name, unicode_file, whatwg_file in encodings:
        compare_encoding_files(encoding_name, unicode_file, whatwg_file, ignore_control_chars)
def main(argv=None):
    """
    Command-line entry point.

    With no FILES, the built-in list of encodings is compared. With exactly
    two FILES (a Unicode Consortium mapping file, then a WHATWG mapping file),
    that pair is compared. Any other number of FILES is a usage error.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: On a usage error (exit status 2, message on stderr).
    """
    from argparse import ArgumentParser
    from pathlib import Path

    parser = ArgumentParser(
        prog='compare',
        description='Compare Unicode Consortium vs WHATWG encoding definitions.'
    )
    parser.add_argument('--name', type=str, help='Name of encoding to compare.')
    parser.add_argument('--ignore-controls', action='store_true', help='Consider control characters and undefined mappings to be the same.')
    parser.add_argument('FILES', nargs='*', help='Path to Unicode Consortium and WHATWG mapping file (in that order).')
    args = parser.parse_args(argv)

    if not args.FILES:
        compare_chardetng(args.ignore_controls)
        return

    if len(args.FILES) != 2:
        # parser.error() prints usage to stderr and exits with a non-zero
        # status, so callers and shell scripts can detect the misuse.
        parser.error('You must name two files: a Unicode mapping file and a WHATWG mapping file.')

    unicode_path, whatwg_path = args.FILES
    # Default the name to the WHATWG file name without its "index-" prefix;
    # file names that lack the prefix are used unchanged.
    name = args.name or Path(whatwg_path).stem.removeprefix('index-')
    compare_encoding_files(name, unicode_path, whatwg_path, args.ignore_controls)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment