Last active
April 27, 2023 11:05
-
-
Save ZenulAbidin/1a4810eb536ad37062c7c2e1f6204a17 to your computer and use it in GitHub Desktop.
Parses i18n data from CLDR metadata and the local distribution's /usr/share/i18n/locales folder.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2023 Ali Sherief | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
import os | |
import re | |
import sys | |
import xml.etree.ElementTree as ET | |
# glibc locale sources spell non-ASCII characters as <Uxxxx> (4 hex digits)
# or <Uxxxxxxxx> (8 hex digits). The 8-digit alternative is tried first so a
# long escape is never half-consumed as a short one.
_UCS_ESCAPE_RE = re.compile(r'<U([0-9A-Fa-f]{8}|[0-9A-Fa-f]{4})>')


def decode_utf8(encoded_string):
    """Expand ``<Uxxxx>`` / ``<Uxxxxxxxx>`` escapes into real characters.

    Despite the name, no UTF-8 byte decoding happens here: the input is
    already a ``str`` and only the symbolic code point escapes used by
    glibc locale source files are replaced.

    Args:
        encoded_string: Text that may contain ``<U....>`` escapes.

    Returns:
        The text with every well-formed escape replaced by the character
        it names. Malformed escapes, and escapes naming a value that is
        not a valid code point, are left untouched.
    """
    def _expand(match):
        try:
            return chr(int(match.group(1), 16))
        except (ValueError, OverflowError):
            # Value is outside the Unicode range; keep the text as written.
            return match.group(0)

    return _UCS_ESCAPE_RE.sub(_expand, encoded_string)
def read_locale_file(file_name, language_map, code):
    """Parse a glibc locale source file into ``language_map[code]``.

    The parsed data is stored under ``language_map[code]["locale_info"]`` as
    ``{section: {keyword: [values]}}``. Keywords that appear before the first
    section header land in the ``""`` section. Values are split on ``;`` and
    ``<Uxxxx>`` escapes are expanded via ``decode_utf8``.

    If the ``LC_TIME`` section consists of a ``copy "other"`` directive, the
    referenced locale file is parsed as well (looked up next to *file_name*
    first, then in ``/usr/share/i18n/locales``) and its ``LC_TIME`` section
    replaces the one of *code*.

    Args:
        file_name: Path of the locale source file to read.
        language_map: Mapping of locale code -> data dict; updated in place.
        code: Key under which the parsed data is stored.

    Returns:
        The same ``language_map`` object, for call chaining.

    Raises:
        OSError: If *file_name* or a locale referenced by ``copy`` cannot be
            opened.
    """
    # The comment and escape characters are fixed here rather than read from
    # the file's own comment_char / escape_char declarations.
    comment_char = '%'
    escape_char = '/'

    # The payload is ASCII; stray bytes only show up in comments. Replacing
    # undecodable bytes keeps every line, instead of losing a whole decoder
    # chunk when a UnicodeDecodeError interrupts readline().
    with open(file_name, encoding="utf-8", errors="replace") as f:
        all_lines = f.readlines()

    current_section = ""
    kv_pairs = {"": {}}
    continue_line = False
    key = None

    for raw_line in all_lines:
        line = decode_utf8(raw_line.replace('\t', ' ').strip())

        # Skip blanks and END directives (e.g. "END LC_MESSAGES"). Comments
        # are skipped only when we are not inside a continued value.
        if line == '' or line.startswith("END") or (
                not continue_line and line.startswith(comment_char)):
            continue

        if continue_line:
            # Continuation of the previous keyword's value list.
            continue_line = line[-1] == escape_char
            if continue_line:
                line = line[:-1]
            values = line.replace('//', '/').split(';')
            kv_pairs[current_section][key] += [v for v in values if v != '']
        elif ' ' in line:
            # "keyword value[;value...]"
            key = line.split(' ')[0]
            value = line[len(key) + 1:]
            # A trailing escape char continues the value on the next line,
            # except on the line that declares the escape char itself.
            if value and value[-1] == escape_char and key != "escape_char":
                continue_line = True
                value = value[:-1]
            else:
                continue_line = False
            key = key.strip()
            values = value.strip().replace('//', '/').split(';')
            section = kv_pairs[current_section]
            section.setdefault(key, [])
            section[key] += [v for v in values if v != '']
        else:
            # A bare word opens a new section (e.g. "LC_TIME").
            current_section = line.strip()
            kv_pairs[current_section] = {}

    language_map.setdefault(code, {})["locale_info"] = kv_pairs

    # LC_TIME may only say `copy "xx_YY"`; pull the real fields from there.
    lc_time = kv_pairs.get("LC_TIME", {})
    if lc_time.get("copy"):
        ref_code = lc_time["copy"][0].replace('"', '')
        if ref_code != code:
            # Keep whatever is already known about the referenced locale.
            ref_entry = language_map.setdefault(ref_code, {})
            if "locale_info" not in ref_entry:
                ref_path = os.path.join(os.path.dirname(file_name), ref_code)
                if not os.path.exists(ref_path):
                    ref_path = "/usr/share/i18n/locales/{}".format(ref_code)
                language_map = read_locale_file(ref_path, language_map, ref_code)
            ref_lc_time = language_map[ref_code]["locale_info"].get("LC_TIME")
            if ref_lc_time is not None:
                language_map[code]["locale_info"]["LC_TIME"] = ref_lc_time
    return language_map
def parse_ldml_language_map(file_name):
    """Build the initial language map from an LDML display-names file.

    Reads every ``localeDisplayNames/languages/language`` element and maps
    its ``type`` attribute (the language code) to ``{"name": <UPPERCASED
    display name>}``.

    Args:
        file_name: Path to an LDML file such as CLDR's ``en.xml``.

    Returns:
        ``{code: {"name": NAME}}`` for every language code that has no
        underscore in it (codes with an underscore are skipped).
    """
    root = ET.parse(file_name).getroot()
    language_map = {}
    for display_names in root.iter('localeDisplayNames'):
        for languages in display_names.iter('languages'):
            for language in languages.iter('language'):
                short_name = language.get('type')
                long_name = language.text
                # Skip elements without a code or without any text; calling
                # .upper() on a missing name would raise.
                if not short_name or "_" in short_name or not long_name:
                    continue
                # alt="short"/"long"/"variant"/... entries are alternative
                # spellings; they must not replace the primary display name.
                if language.get('alt') is not None and short_name in language_map:
                    continue
                language_map[short_name] = {"name": long_name.upper()}
    return language_map
def _collect_ldml_names(calendar, group_tag, context_tag, width_tag, item_tag,
                        wide, abbreviated):
    """Copy 'format' context names of one calendar into *wide*/*abbreviated*."""
    for group in calendar.iter(group_tag):
        for context in group.iter(context_tag):
            if context.get('type') != 'format':
                continue
            for width in context.iter(width_tag):
                width_type = width.get('type')
                if width_type == 'wide':
                    target = wide
                elif width_type == 'abbreviated':
                    target = abbreviated
                else:
                    continue
                for item in width.iter(item_tag):
                    item_type = item.get('type')
                    if item_type is not None:
                        target[item_type] = item.text


def parse_ldml_locales(language_map, file_name, iso_code):
    """Extract Gregorian month and weekday names from one LDML locale file.

    Only the ``format`` context is read, in its ``wide`` and ``abbreviated``
    widths. The result is stored in ``language_map[iso_code]`` under the keys
    ``months_long``, ``months_short``, ``weeks_long`` and ``weeks_short``,
    each a dict keyed by the element's ``type`` attribute (e.g. ``"1"`` or
    ``"sun"``). The four keys are always written, possibly as empty dicts.

    Args:
        language_map: Mapping of language code -> data dict; updated in place.
        file_name: Path of the LDML file to parse.
        iso_code: Language code the data belongs to. An entry is created if
            the map does not have one yet.

    Returns:
        The same ``language_map`` object.
    """
    root = ET.parse(file_name).getroot()
    months_long = {}
    months_short = {}
    weeks_long = {}
    weeks_short = {}

    for dates in root.iter('dates'):
        calendars = dates.find('calendars')
        if calendars is None:
            # <dates> without <calendars> carries nothing we need.
            continue
        for calendar in calendars.iter('calendar'):
            # Only the Gregorian calendar is supported for now.
            if calendar.get('type') != 'gregorian':
                continue
            _collect_ldml_names(calendar, 'months', 'monthContext',
                                'monthWidth', 'month',
                                months_long, months_short)
            _collect_ldml_names(calendar, 'days', 'dayContext',
                                'dayWidth', 'day',
                                weeks_long, weeks_short)

    entry = language_map.setdefault(iso_code, {})
    entry["months_long"] = months_long
    entry["months_short"] = months_short
    entry["weeks_long"] = weeks_long
    entry["weeks_short"] = weeks_short
    return language_map
def print_xdatetime_macros(language_map):
    """Print C++ assignments for the CLDR month/weekday names of each language.

    Two variants are written to stdout: inside ``#ifndef
    X_DATETIME_NO_LOCALES`` every language is emitted unconditionally; in the
    ``#else`` branch each language is wrapped in its own
    ``#ifdef X_DATETIME_ONLY_LOCALE_<NAME>`` guard. Languages lacking any of
    the four name tables (or having an empty one) are skipped.

    Args:
        language_map: Mapping of language code -> dict with ``name``,
            ``months_long``, ``months_short``, ``weeks_long``, ``weeks_short``.
    """
    # (C++ member, language_map key) in output order.
    tables = (
        ("long_months", "months_long"),
        ("short_months", "months_short"),
        ("long_weeks", "weeks_long"),
        ("short_weeks", "weeks_short"),
    )

    def has_all_tables(language):
        return all(language.get(key) for _, key in tables)

    def print_tables(code, language):
        # One blank line separates consecutive tables.
        for position, (member, key) in enumerate(tables):
            if position:
                print("")
            for identifier, data in language[key].items():
                print(" data.{}[\"{}\"][\"{}\"] = u8\"{}\";".format(
                    member, code, identifier, data))

    def macro_suffix(code, language):
        # Turn the display name into something usable in a macro identifier.
        name = language.get("name", code.upper())
        return name.replace(',', '_').replace(' ', '_').replace('-', '_')

    print("// You must define an X_DATETIME_ONLY_LOCALE_* macro, which is only read if you don't want locales.")
    print("// English is set by default in the event that locales are disabled - which are also enabled by default.")
    print("// This means if you want to disable the English locale, you must undefine this macro before including this file.")
    print("#define X_DATETIME_ONLY_LOCALE_ENGLISH\n")

    print("#ifndef X_DATETIME_NO_LOCALES")
    for code, language in language_map.items():
        if not has_all_tables(language):
            continue
        print_tables(code, language)
        print("\n")  # two empty lines

    print("#else")
    for code, language in language_map.items():
        if not has_all_tables(language):
            continue
        print("#ifdef X_DATETIME_ONLY_LOCALE_{}".format(macro_suffix(code, language)))
        print_tables(code, language)
        print("#endif\n")
        print("\n")  # two empty lines
    print("#endif\n")
# The collective GNU C library community wisdom regarding abday, day, week, first_weekday, and first_workday states at https://sourceware.org/glibc/wiki/Locales the following: | |
# | |
# * The value of the second week list item specifies the base of the abday and day lists. | |
# | |
# * first_weekday specifies the offset of the first day-of-week in the abday and day lists. | |
# | |
# * For compatibility reasons, all glibc locales should set the value of the second week list item to 19971130 (Sunday) and base the abday and day lists appropriately, and set first_weekday and first_workday to | |
# 1 or 2, depending on whether the week and work week actually starts on Sunday or Monday for the locale. | |
def print_xdatetime_macros2(language_map):
    """Print C++ assignments filling ``data`` from glibc ``LC_TIME`` sections.

    For every entry of *language_map* that has a non-empty ``locale_info``
    containing an ``LC_TIME`` section, statements are written to stdout for
    AM/PM strings, date/time formats, week parameters, the first weekday, and
    the ``mon`` / ``abmon`` / ``day`` / ``abday`` name lists (indexed from 0
    in file order). Double quotes are stripped from all values.

    When ``week`` is absent the defaults 7 / 19971130 / 4 are emitted; when
    ``first_weekday`` is absent 1 is emitted; when ``t_fmt_ampm`` is absent
    the 24-hour ``t_fmt`` is reused for the 12-hour format.

    Args:
        language_map: Mapping of locale code -> dict, as filled by
            ``read_locale_file``.

    Raises:
        KeyError / IndexError: If an ``LC_TIME`` section lacks one of the
            mandatory keywords (``am_pm``, ``d_t_fmt``, ``d_fmt``, ``t_fmt``,
            ``mon``, ``abmon``, ``day``, ``abday``).
    """
    def unquote(value):
        return value.replace('"', '')

    def print_string(member, code, value):
        print(" data.{}[\"{}\"] = u8\"{}\";".format(member, code, unquote(value)))

    def print_int(member, code, value):
        print(" data.{}[\"{}\"] = {};".format(member, code, value))

    # (C++ member, LC_TIME keyword) for the indexed name lists, in output order.
    name_lists = (
        ("long_months", "mon"),
        ("short_months", "abmon"),
        ("long_weekdays", "day"),
        ("short_weekdays", "abday"),
    )

    for code, language in language_map.items():
        locale_info = language.get("locale_info")
        if not locale_info or "LC_TIME" not in locale_info:
            continue
        lc_time = locale_info["LC_TIME"]

        print_string("am", code, lc_time["am_pm"][0])
        print_string("pm", code, lc_time["am_pm"][1])
        print_string("date_time_format", code, lc_time["d_t_fmt"][0])
        print_string("date_format", code, lc_time["d_fmt"][0])
        print_string("time24_format", code, lc_time["t_fmt"][0])
        if "t_fmt_ampm" in lc_time:
            print_string("time12_format", code, lc_time["t_fmt_ampm"][0])
        else:
            print_string("time12_format", code, lc_time["t_fmt"][0])

        if "week" in lc_time:
            week = lc_time["week"]
            print_int("days_in_week", code, int(week[0]))
            print_int("first_weekday_ref", code, int(week[1]))
            print_int("first_week_year_min_days", code, int(week[2]))
        else:
            print_int("days_in_week", code, 7)
            # 19971130 is the Sunday reference date glibc locales use.
            print_int("first_weekday_ref", code, 19971130)
            print_int("first_week_year_min_days", code, 4)

        if "first_weekday" in lc_time:
            print_int("first_weekday", code, int(lc_time["first_weekday"][0]))
        else:
            print_int("first_weekday", code, 1)

        for member, keyword in name_lists:
            for index, data in enumerate(lc_time[keyword]):
                print(" data.{}[\"{}\"][{}] = u8\"{}\";".format(
                    member, code, index, unquote(data)))
            print("")  # empty line after each list
        print("\n")  # two empty lines
def print_autogenerated_code(language_map):
    """Write the complete generated C++ header to stdout.

    The header declares ``struct _LocaleData`` inside namespace ``xDateTime``,
    an ``InitializeLocaleData()`` function whose body is produced by
    ``print_xdatetime_macros2(language_map)``, and one ``GetLocale*`` accessor
    per struct member.

    Args:
        language_map: Mapping of locale code -> data, passed through to
            ``print_xdatetime_macros2``.
    """
    # Struct members grouped by their C++ type, in declaration order.
    struct_layout = (
        ("std::map<std::string, std::string>",
         ("am", "pm", "date_time_format", "date_format",
          "time24_format", "time12_format")),
        ("std::map<std::string, int>",
         ("days_in_week", "first_weekday_ref", "first_weekday",
          "first_week_year_min_days")),
        ("std::map<std::string, std::map<int, std::string>>",
         ("long_months", "short_months", "long_weekdays", "short_weekdays")),
    )
    # (return type, function name, struct member, takes an int key?)
    accessors = (
        ("std::string", "GetLocaleLongMonth", "long_months", True),
        ("std::string", "GetLocaleShortMonth", "short_months", True),
        ("std::string", "GetLocaleLongWeekday", "long_weekdays", True),
        ("std::string", "GetLocaleShortWeekday", "short_weekdays", True),
        ("std::string", "GetLocaleAM", "am", False),
        ("std::string", "GetLocalePM", "pm", False),
        ("std::string", "GetLocaleDateTimeFormat", "date_time_format", False),
        ("std::string", "GetLocaleDateFormat", "date_format", False),
        ("std::string", "GetLocaleTime24Format", "time24_format", False),
        ("std::string", "GetLocaleTime12Format", "time12_format", False),
        ("int", "GetLocaleDaysInWeeks", "days_in_week", False),
        ("int", "GetLocaleFirstWeekdayReference", "first_weekday_ref", False),
        ("int", "GetLocaleFirstWeekOfYearMinDays", "first_week_year_min_days", False),
        ("int", "GetLocaleFirstWeekday", "first_weekday", False),
    )

    # Preamble: include guard, includes, namespace.
    print("// Automatically generated by cldr-gen-locale-data.py. DO NOT MODIFY.\n")
    print("#ifndef X_DATETIME_LOCALE_DATA_H")
    print("#define X_DATETIME_LOCALE_DATA_H")
    print("#include <map>")
    print("#include <string>\n")
    print("namespace xDateTime {")

    # Data holder.
    print("struct _LocaleData {")
    for cpp_type, members in struct_layout:
        for member in members:
            print(" {} {};".format(cpp_type, member))
    print("};\n")

    # One-time initializer.
    print("_LocaleData InitializeLocaleData() {")
    print(" static _LocaleData data;")
    print(" static bool initialized = false;")
    print(" if (initialized) return data;\n")
    print_xdatetime_macros2(language_map)
    print(" initialized = true;")
    print(" return data;")
    print("}\n")

    # Accessors.
    for return_type, function_name, member, keyed in accessors:
        if keyed:
            parameters = "const std::string& locale, int key"
            lookup = "[locale][key]"
        else:
            parameters = "const std::string& locale"
            lookup = "[locale]"
        print("{} {}({}) {{".format(return_type, function_name, parameters))
        print(" _LocaleData data = InitializeLocaleData();")
        print(" return data.{}{};".format(member, lookup))
        print("}\n")

    print("}")  # namespace
    print("#endif /* X_DATETIME_LOCALE_DATA_H */")
import pprint | |
def print_locale_info(language_map):
    """Debug helper: pretty-print the ``LC_TIME`` section of every locale.

    For each entry that has a ``locale_info`` dict containing ``LC_TIME``,
    prints the locale key followed by ``$$$$$$$`` and then the section.

    Args:
        language_map: Mapping of locale code -> data dict.
    """
    for locale_code, entry in language_map.items():
        if "locale_info" not in entry:
            continue
        sections = entry["locale_info"]
        if "LC_TIME" in sections:
            print(locale_code, "$$$$$$$")
            pprint.pprint(sections["LC_TIME"])
def main():
    """Generate the C++ locale-data header and write it to stdout.

    Expects one command-line argument: the directory holding the CLDR LDML
    files (``en.xml`` plus one ``<language>.xml`` per language). Language
    names come from ``en.xml``, month/weekday names from the per-language
    files, and ``LC_TIME`` data from ``/usr/share/i18n/locales``.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: {} <cldr-ldml-directory>".format(sys.argv[0]))
    dir_path = sys.argv[1]
    language_map = parse_ldml_language_map(os.path.join(dir_path, 'en.xml'))

    # Search the CLDR first...
    for file_name in os.listdir(dir_path):
        # Only parse the language files, e.g. "en", "es", "fr". The
        # language-country files such as en_US do not have language
        # information and are skipped, as is the root file.
        if file_name == "root.xml":
            continue
        if file_name.endswith('.xml') and '_' not in file_name:
            language_map = parse_ldml_locales(
                language_map, os.path.join(dir_path, file_name), file_name[:-4])

    # ...now search the OS-specific locales.
    locales_folder = "/usr/share/i18n/locales"
    for file_name2 in os.listdir(locales_folder):
        # The OS locales do not have a file extension, so the file name is
        # the locale code itself.
        # NOTE(review): this only matches locale files named exactly like a
        # language code in the map (no "_COUNTRY" suffix) - confirm intent.
        if file_name2 in language_map:
            language_map = read_locale_file(
                os.path.join(locales_folder, file_name2), language_map, file_name2)

    print_autogenerated_code(language_map)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment