Created
November 20, 2023 04:54
-
-
Save santhoshtr/20548fa75687b266e098a5f62daea74e to your computer and use it in GitHub Desktop.
Print a tab-separated file with all languages supported by MT providers of WMF Cxserver
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Print a tab-separated file with all languages supported by MT providers
import requests | |
from typing import List | |
# One-character badge printed in the coverage table for each MT provider.
mtlabels = dict(
    Apertium="Ⓐ",
    Elia="Ⓔ",
    Google="Ⓖ",
    MinT="Ⓜ",
    Yandex="Ⓨ",
    LingoCloud="Ⓛ",
)
def get_wiki_sites(project: str = "wiki") -> List[str]:
    """Get all language codes for a given Wikimedia project.

    Queries the ``sitematrix`` API on meta.wikimedia.org and collects the
    language code of every language group that has a site whose code equals
    *project* and that is not marked closed. The ``simple`` code is left out
    because Simple English is still English.

    Valid project codes:
    * wiki = Wikipedia
    * wiktionary = Wiktionary
    * wikibooks = Wikibooks
    * wikinews = Wikinews
    * wikiquote = Wikiquote
    * wikisource = Wikisource
    * wikiversity = Wikiversity
    * wikivoyage = Wikivoyage

    Returns:
        The matching language codes, de-duplicated and sorted. The list is
        empty when the response carries no ``sitematrix`` key.

    Raises:
        requests.RequestException: if the request fails, times out or the
            server answers with an HTTP error status.
    """
    api_url = "https://meta.wikimedia.org/w/api.php"
    query = {
        "action": "sitematrix",
        "smlangprop": "|".join(["code", "site"]),
        "smsiteprop": "|".join(["code"]),
        "format": "json",
        "formatversion": "2",
    }
    # The context manager closes the session's connections, and the timeout
    # keeps a stalled API from hanging the script indefinitely.
    with requests.Session() as session:
        response = session.get(url=api_url, params=query, timeout=30)
        response.raise_for_status()
        sitematrix = response.json().get("sitematrix", {})

    wiki_languages = set()
    for key, entry in sitematrix.items():
        try:
            int(key)  # weirdly, wikis are keyed as numbers in the results
        except ValueError:
            # skip count metadata and special wikis like Commons, Affiliates, etc.
            continue
        has_open_site = any(
            "closed" not in site and site["code"] == project
            for site in entry.get("site", [])
        )
        if not has_open_site:
            continue
        code = entry["code"]
        if code != "simple":  # Simple is English language
            wiki_languages.add(code)
    return sorted(wiki_languages)
def get_mt_coverage_info():
    """Fetch the cxserver MT listing and index it by language pair.

    Returns:
        A nested dict where ``coverage[source_lang][target_lang]`` is the list
        of ``mtlabels`` badges of the providers offering that translation
        direction.

    Raises:
        requests.RequestException: if the request fails, times out or the
            server answers with an HTTP error status.
    """
    # Top-level keys of the response that are deliberately left out of the table.
    excluded = {"defaults", "TestClient", "Youdao"}

    response = requests.get("https://cxserver.wikimedia.org/v1/list/mt", timeout=30)
    response.raise_for_status()

    coverage = {}
    for provider, pairs in response.json().items():
        if provider in excluded:
            continue
        label = mtlabels.get(provider)
        if label is None:
            # A provider the service lists but this script has no badge for
            # (e.g. one added later): skip it instead of raising KeyError.
            continue
        for source_lang, target_langs in pairs.items():
            targets = coverage.setdefault(source_lang, {})
            for target_lang in target_langs:
                targets.setdefault(target_lang, []).append(label)
    return coverage
def print_mt_coverage(show_providers=False):
    """Print the source-by-target MT coverage matrix as tab-separated text.

    The first row lists every language code as a column header; each later row
    starts with a source language code followed by one cell per target
    language. With ``show_providers`` true a cell holds the concatenated
    provider badges, otherwise a check mark when at least one provider covers
    the pair. Every cell, including the last of a row, is followed by a tab.
    """
    languages = sorted(set(get_wiki_sites()))
    coverage = get_mt_coverage_info()

    def render_cell(labels):
        # Badge string in provider mode, otherwise a plain yes/empty marker.
        if show_providers:
            return "".join(labels)
        return "✅" if labels else ""

    # Header row: an empty corner cell, then one column per language.
    print("\t" + "".join(f"{code}\t" for code in languages))

    for source in languages:
        targets = coverage.get(source, {})
        cells = (render_cell(targets.get(target, [])) for target in languages)
        print(f"{source}\t" + "".join(f"{cell}\t" for cell in cells))
# Script entry point: emit the full table with per-provider badges.
if __name__ == "__main__":
    print_mt_coverage(show_providers=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment