-
-
Save josemarcosrf/71aafa07440d92199ecc3d1993e9ea9e to your computer and use it in GitHub Desktop.
From language name to ISO code
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pycountry | |
from pprint import pformat | |
from data.model_languages import bert_languages, xlm_lang_codes | |
def get_codes(language):
    """Look up the ISO 639 codes pycountry has for a language name.

    Args:
        language: Language name, passed verbatim to
            ``pycountry.languages.get(name=...)``.

    Returns:
        A dict with the keys ``"language_name"`` (the input, unchanged),
        ``"alpha_2"`` and ``"alpha_3"``. A code is ``None`` when the name
        is not found or the matching entry does not define that code.
    """
    try:
        lang = pycountry.languages.get(name=language)
    except LookupError:
        # NOTE(review): some pycountry releases raise KeyError (a LookupError)
        # for an unknown name instead of returning None -- treat both the same.
        lang = None

    # getattr with a default covers a failed lookup (lang is None) as well as
    # an entry lacking the attribute, without resorting to a bare ``except``.
    return {
        "language_name": language,
        "alpha_2": getattr(lang, "alpha_2", None),
        "alpha_3": getattr(lang, "alpha_3", None),
    }
if __name__ == "__main__":
    # Transform mBERT languages to lang. ISO codes
    bert_codes = [get_codes(lang) for lang in bert_languages]
    print(pformat(bert_codes))

    # Check intersection between mBERT and XLM.
    # Prefer the two-letter code and fall back to the three-letter one.
    # Languages that resolved to neither code are left out here: otherwise
    # ``None`` would show up as a bogus "language" in the mBERT-only set and
    # inflate its count. They remain visible in the pformat dump above.
    preferred_codes = (c.get('alpha_2') or c.get('alpha_3') for c in bert_codes)
    bert_alpha_2_codes = {code for code in preferred_codes if code is not None}
    xlm_codes = set(xlm_lang_codes)  # built once, reused for all comparisons

    # in Both
    intersection = xlm_codes.intersection(bert_alpha_2_codes)
    print("Present in both:")
    print(intersection)
    print("({})".format(len(intersection)))

    # Only in mBERT
    only_bert = bert_alpha_2_codes.difference(xlm_codes)
    print("\n\nPresent only in mBERT:")
    print(only_bert)
    print("({})".format(len(only_bert)))

    # Only in XLM
    only_xlm = xlm_codes.difference(bert_alpha_2_codes)
    print("\n\nPresent only in XLM:")
    print(only_xlm)
    print("({})".format(len(only_xlm)))
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Names of the languages listed for mBERT, in alphabetical-ish order.
# NOTE(review): several entries carry qualifiers such as "(macrolanguage)" or
# "(1453-)", presumably so they match the names used by the lookup library --
# confirm before renaming any of them.
bert_languages = [
    "Afrikaans", "Albanian", "Arabic", "Aragonese", "Armenian", "Asturian",
    "Azerbaijani",
    "Bashkir", "Basque", "Bavarian", "Belarusian", "Bengali", "Bishnupriya",
    "Bosnian", "Breton", "Bulgarian", "Burmese",
    "Catalan", "Cebuano", "Chechen", "Chinese (Simplified)",
    "Chinese (Traditional)", "Chuvash", "Croatian", "Czech",
    "Danish", "Dutch",
    "English", "Estonian",
    "Finnish", "French",
    "Galician", "Georgian", "German", "Modern Greek (1453-)", "Gujarati",
    "Haitian", "Hebrew", "Hindi", "Hungarian",
    "Icelandic", "Ido", "Indonesian", "Irish", "Italian",
    "Japanese", "Javanese",
    "Kannada", "Kazakh", "Kirghiz", "Korean",
    "Latin", "Latvian", "Lithuanian", "Lombard", "Low German",
    "Luxembourgish",
    "Macedonian", "Malagasy", "Malay (macrolanguage)", "Malayalam",
    "Marathi", "Minangkabau",
    "Nepali (macrolanguage)", "Newari", "Norwegian Bokmål",
    "Norwegian Nynorsk",
    "Occitan (post 1500)",
    "Persian", "Piemontese", "Polish", "Portuguese", "Panjabi",
    "Romanian", "Russian",
    "Scots", "Serbian", "Serbo-Croatian", "Sicilian", "Slovak", "Slovenian",
    "South Azerbaijani", "Spanish", "Sundanese", "Swahili (macrolanguage)",
    "Swedish",
    "Tagalog", "Tajik", "Tamil", "Tatar", "Telugu", "Turkish",
    "Ukrainian", "Urdu", "Uzbek",
    "Vietnamese", "Volapük",
    "Waray (Philippines)", "Welsh", "Western Frisian", "Western Panjabi",
    "Yoruba",
]
# Language codes listed for XLM, kept in their original order.
# NOTE(review): not every entry is an ISO 639 code (e.g. "simple", "zh_yue",
# "zh_classical", "zh_min_nan") -- these look like Wikipedia edition codes;
# confirm against the XLM language list.
xlm_lang_codes = [
    "en", "es", "fr", "de", "zh", "ru", "pt", "it", "ar", "ja",
    "id", "tr", "nl", "pl", "simple", "fa", "vi", "sv", "ko", "he",
    "ro", "no", "hi", "uk", "cs", "fi", "hu", "th", "da", "ca",
    "el", "bg", "sr", "ms", "bn", "hr", "sl", "zh_yue", "az", "sk",
    "eo", "ta", "sh", "lt", "et", "ml", "la", "bs", "sq", "arz",
    "af", "ka", "mr", "eu", "tl", "ang", "gl", "nn", "ur", "kk",
    "be", "hy", "te", "lv", "mk", "zh_classical", "als", "is", "wuu", "my",
    "sco", "mn", "ceb", "ast", "cy", "kn", "br", "an", "gu", "bar",
    "uz", "lb", "ne", "si", "war", "jv", "ga", "zh_min_nan", "oc", "ku",
    "sw", "nds", "ckb", "ia", "yi", "fy", "scn", "gan", "tt", "am",
]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment