Last active
July 10, 2024 13:45
-
-
Save ArtemAvramenko/ec3b5358221f8b6e9f3e9efe1d0a3066 to your computer and use it in GitHub Desktop.
Generator for mapping national alphabets to Basic Latin
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Input: the Unidecode table as a tab-separated file, read fully into memory.
string[] unidecodeLines = File.ReadAllLines(@"C:\Users\username\Desktop\Unidecode\unidecode.tsv");

// Output: path of the generated file that will hold the letters-only mapping.
string outputFile = @"C:\Users\username\Desktop\Unidecode\onlyletters.tsv";

// Register the code-page provider so that Encoding.GetEncoding can resolve
// the legacy code pages requested below.
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

// Code pages that define which national letters are kept in the output.
int[] codePages =
{
    // Classic Mac OS code pages (Roman, Greek, Cyrillic, Romanian, Ukrainian,
    // Central European, Icelandic, Turkish, Croatian).
    10000, 10006, 10007, 10010, 10017, 10029, 10079, 10081, 10082,
    // Windows code pages (Central European, Cyrillic, Western, Greek,
    // Turkish, Baltic, Vietnamese).
    1250, 1251, 1252, 1253, 1254, 1257, 1258
};

var encodings = (from codePage in codePages
                 select Encoding.GetEncoding(codePage)).ToArray();
// Returns true when at least one of the configured code pages can encode `s`
// and decode the resulting bytes back to exactly the same string, i.e. the
// text survives a round trip through that encoding unchanged.
bool isSupportedEncoding(string s)
{
    foreach (var encoding in encodings)
    {
        var roundTripped = encoding.GetString(encoding.GetBytes(s));
        if (roundTripped == s)
        {
            return true;
        }
    }

    return false;
}
// Hangul consonants and vowels (jamo): 매니저 -> ᄆ ᅢ ᄂ ᅵ ᄌ ᅥ -> MAeNIJEo (manager)
//HashSet<char> jamos = Enumerable
// .Range(0xAC00, 0xD7B0 - 0xAC00)
// .SelectMany(ch => ((char)ch).ToString().Normalize(NormalizationForm.FormKD))
// .Where(ch => ch < 0xD000)
// .ToHashSet();
// Accepts a replacement only when it consists solely of lowercase ASCII letters.
Regex latinRegex = new Regex(@"^[a-z]+$");

// Matches any string that contains a Unicode letter.
Regex letterRegex = new Regex(@"\p{L}");

// Final character -> Latin replacement mapping, filled by the filtering pass.
var map = new Dictionary<string, string>();

// Every (key, value) row parsed from the input, in file order.
var originalMap = new List<(string Key, string Value)>();

foreach (var line in unidecodeLines)
{
    var columns = line.Split('\t');

    // Rows without at least a key column and a value column are ignored.
    if (columns.Length < 2)
    {
        continue;
    }

    // The value column may be wrapped in apostrophes; strip them from both ends.
    originalMap.Add((columns[0], columns[1].Trim('\'')));
}
// Process keys that are already in compatibility-decomposed form (NFKD) first,
// so that when a composed key is visited later its base character may already
// be present in `map`. Within each group the keys are visited in sorted order.
var processingOrder = originalMap
    .OrderBy(entry => entry.Key.Normalize(NormalizationForm.FormKD) != entry.Key)
    .ThenBy(entry => entry.Key);

foreach (var (key, value) in processingOrder)
{
    var decomposed = key.Normalize(NormalizationForm.FormKD);
    if (decomposed != key)
    {
        // First UTF-16 unit of the decomposition, i.e. the base character.
        var baseChar = decomposed[..1];

        // Skip the entry when the replacement is just the base character itself...
        if (baseChar == value)
        {
            continue;
        }

        // ...or when the base character is already mapped to the same replacement.
        if (map.TryGetValue(baseChar, out var baseValue) && baseValue == value)
        {
            continue;
        }
    }

    // Keep only single-unit keys that contain a letter.
    if (key.Length != 1 || !letterRegex.IsMatch(key))
    {
        continue;
    }

    // Keep only keys representable in a configured code page whose replacement
    // is made of lowercase ASCII letters.
    if (!isSupportedEncoding(key) || !latinRegex.IsMatch(value))
    {
        continue;
    }

    map[key] = value;
}
// Serialize the mapping as one compact string. Entries sharing the same Latin
// replacement are grouped; each group is written as the replacement in upper
// case immediately followed by all source characters that map to it. Groups
// are concatenated without any separator.
//
// Both sorts use ordinal comparison on purpose: the default string comparer is
// culture-sensitive, which would make the generated file depend on the culture
// and collation data of the machine running the generator.
var res = map
    .GroupBy(pair => pair.Value, pair => pair.Key)
    .OrderBy(g => g.Key, StringComparer.Ordinal)
    .Select(g => g.Key.ToUpperInvariant()
        + string.Concat(g.OrderBy(s => s, StringComparer.Ordinal)));

File.WriteAllText(outputFile, string.Concat(res));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment