Skip to content

Instantly share code, notes, and snippets.

@ArtemAvramenko
Last active July 10, 2024 13:45
Show Gist options
  • Save ArtemAvramenko/ec3b5358221f8b6e9f3e9efe1d0a3066 to your computer and use it in GitHub Desktop.
Save ArtemAvramenko/ec3b5358221f8b6e9f3e9efe1d0a3066 to your computer and use it in GitHub Desktop.
Generator for mapping national alphabets to the Basic Latin
// Source table (read fully into memory) and the path the generated mapping is written to.
var unidecodeLines = File.ReadAllLines(@"C:\Users\username\Desktop\Unidecode\unidecode.tsv");
var outputFile = @"C:\Users\username\Desktop\Unidecode\onlyletters.tsv";

// Register the code-page provider so Encoding.GetEncoding can resolve the numeric code pages below.
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);

// Code pages whose repertoire defines which characters are kept in the output.
int[] codePages =
{
    10000, 10006, 10007, 10010, 10017, 10029, 10079, 10081, 10082,
    1250, 1251, 1252, 1253, 1254, 1257, 1258
};
var encodings = Array.ConvertAll(codePages, codePage => Encoding.GetEncoding(codePage));

// Returns true when at least one of the encodings above can encode the string to bytes
// and decode it back to exactly the same string (i.e. the text survives a round trip).
bool isSupportedEncoding(string s)
{
    foreach (var encoding in encodings)
    {
        var roundTripped = encoding.GetString(encoding.GetBytes(s));
        if (roundTripped == s)
        {
            return true;
        }
    }

    return false;
}
// Hangul consonants and vowels (jamo): 매니저 -> ᄆ ᅢ ᄂ ᅵ ᄌ ᅥ -> MAeNIJEo (manager)
//HashSet<char> jamos = Enumerable
// .Range(0xAC00, 0xD7B0 - 0xAC00)
// .SelectMany(ch => ((char)ch).ToString().Normalize(NormalizationForm.FormKD))
// .Where(ch => ch < 0xD000)
// .ToHashSet();
// A replacement is accepted only if it consists solely of lowercase Basic Latin letters.
var latinRegex = new Regex(@"^[a-z]+$");
// Matches any Unicode letter (general category L*).
var letterRegex = new Regex(@"\p{L}");

// Resulting mapping: source character -> Latin replacement.
var map = new Dictionary<string, string>();

// Parse the table: column 0 is the source text, column 1 is its replacement with any
// surrounding single quotes stripped. Rows with fewer than two tab-separated columns are ignored.
var originalMap = unidecodeLines
    .Select(row => row.Split('\t'))
    .Where(cells => cells.Length >= 2)
    .Select(cells => (Key: cells[0], Value: cells[1].Trim('\'')))
    .ToList();
// Visit entries whose key is unchanged by NFKD normalization first, then the ones that
// decompose, so base characters are already in the map when composed ones are examined.
var sortedEntries = originalMap
    .Select(item => (item.Key, item.Value, Decomposed: item.Key.Normalize(NormalizationForm.FormKD)))
    .OrderBy(item => item.Decomposed != item.Key)
    .ThenBy(item => item.Key);

foreach (var (source, latin, decomposed) in sortedEntries)
{
    if (decomposed != source)
    {
        // First UTF-16 unit of the decomposition, i.e. the base character.
        var baseChar = decomposed[..1];

        // Skip entries whose replacement is already implied by the base character:
        // either the base character itself, or whatever the base character maps to.
        var impliedByBase = baseChar == latin
            || (map.TryGetValue(baseChar, out var baseLatin) && latin == baseLatin);
        if (impliedByBase)
        {
            continue;
        }
    }

    // Keep only single-character letters representable in one of the supported
    // encodings whose replacement is purely lowercase Latin.
    var accepted = source.Length == 1
        && letterRegex.IsMatch(source)
        && isSupportedEncoding(source)
        && latinRegex.IsMatch(latin);
    if (accepted)
    {
        map[source] = latin;
    }
}
// Group the source characters by their Latin replacement and serialize every group as the
// replacement in upper case immediately followed by its source characters; groups are
// concatenated without any separator.
//
// Both sorts use ordinal comparison. The default string comparer is culture-sensitive, so
// the order of the generated file would otherwise depend on the current culture and on the
// platform's collation data; ordinal ordering makes the output reproducible everywhere.
var res = map
    .GroupBy(entry => entry.Value, entry => entry.Key)
    .OrderBy(grp => grp.Key, System.StringComparer.Ordinal)
    .Select(grp => grp.Key.ToUpperInvariant()
        + string.Join("", grp.OrderBy(letter => letter, System.StringComparer.Ordinal)));
File.WriteAllText(outputFile, string.Join("", res));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment