Last active
February 4, 2021 22:49
-
-
Save joeminicucci/fd84dfc0d4d9438a00abfad8f42f8d2f to your computer and use it in GitHub Desktop.
Tatoeba Corpora Merger
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bz2 | |
import csv | |
import io | |
import os | |
import tarfile | |
import requests | |
import argparse | |
from termcolor import colored | |
# https://downloads.tatoeba.org/exports/per_language/eng/eng_sentences.tsv.bz2 | |
def get_lang_corpora(langCode):
    """Download one Tatoeba per-language sentence export and index it by id.

    Args:
        langCode: Language code used in the Tatoeba download URL (the
            comment above this function shows ``eng`` as an example).

    Returns:
        A dict mapping column 0 of every row to column 2 of that row
        (presumably sentence id -> sentence text; confirm against the
        Tatoeba export format).

    Raises:
        Exception: If the download, decompression or parsing fails. The
            underlying error is chained as ``__cause__``.
    """
    lang_filename = langCode + "_sentences.tsv.bz2"
    try:
        print(colored(('Downloading %s corpora..' % (langCode)), 'blue'))
        tatoeba_corpus_download_url = (
            "https://downloads.tatoeba.org:443/exports/per_language/"
            + langCode + "/" + lang_filename
        )
        headers = {"Accept-Encoding": "gzip, deflate"}
        response = requests.get(tatoeba_corpus_download_url, headers=headers, timeout=60)
        # Fail on 4xx/5xx here instead of feeding an error page to the bz2 decoder.
        response.raise_for_status()
        print(colored(('Unpacking & De-Scaling %s corpora..' % (langCode)), 'blue'))
        # newline='' leaves row splitting to the csv module, so only real line
        # terminators end a row (str.splitlines() would also split on characters
        # such as U+2028 that may occur inside a sentence).
        with bz2.open(io.BytesIO(response.content), mode='rt',
                      encoding='utf-8', newline='') as sentence_file:
            # QUOTE_NONE: the export is plain tab-separated text, so a literal
            # '"' in a sentence must not be treated as a CSV quote character.
            rows = csv.reader(sentence_file, delimiter="\t", quoting=csv.QUOTE_NONE)
            # Rows with fewer than three columns (e.g. blank lines) carry no sentence.
            return {cols[0]: cols[2] for cols in rows if len(cols) >= 3}
    except Exception as err:
        raise Exception(
            colored('Failed to download and de-scale ' + langCode + ' Corpora Dict.', 'red')
        ) from err
def _download_links_archive():
    """Download the Tatoeba links archive and return its raw bytes."""
    # https://downloads.tatoeba.org/exports/links.tar.bz2
    print(colored('Downloading Link File..', 'blue'))
    tatoeba_corpora_links_download_url = 'https://downloads.tatoeba.org/exports/links.tar.bz2'
    response = requests.get(tatoeba_corpora_links_download_url, timeout=60)
    # Fail on 4xx/5xx here instead of feeding an error page to tarfile.
    response.raise_for_status()
    return response.content


def _iter_link_pairs(archive_bytes):
    """Yield (column 0, column 1) for every row of links.csv in the archive."""
    with tarfile.open(fileobj=io.BytesIO(archive_bytes), mode="r:bz2") as link_tar:
        member = link_tar.extractfile(link_tar.getmember('links.csv'))
        # Stream the member instead of materialising every line up front.
        link_file = io.TextIOWrapper(member, encoding='ascii', newline='')
        for cols in csv.reader(link_file, delimiter="\t"):
            # Rows with fewer than two columns (e.g. blank lines) carry no link.
            if len(cols) >= 2:
                yield cols[0], cols[1]


def get_corpora_links():
    """Download the Tatoeba link file as a one-to-one dict.

    Returns:
        A dict mapping column 0 of links.csv to column 1. When an id occurs
        on several rows only the LAST row survives; use
        ``get_corpora_link_pairs`` to keep every link.

    Raises:
        Exception: If the download or unpacking fails (cause is chained).
    """
    try:
        archive_bytes = _download_links_archive()
        print(colored('Unpacking & De-Scaling link file..', 'blue'))
        return dict(_iter_link_pairs(archive_bytes))
    except Exception as err:
        raise Exception(
            colored('Failed to download and de-scale Corpora Link File.', 'red')
        ) from err


def get_corpora_link_pairs(corpora1=None, corpora2=None):
    """Download the Tatoeba link file, keeping every (id, linked id) row.

    Args:
        corpora1: Optional container of ids; when given, only rows whose
            first column is in it are kept.
        corpora2: Optional container of ids; when given, only rows whose
            second column is in it are kept.

    Returns:
        A list of ``(column 0, column 1)`` tuples in file order.

    Raises:
        Exception: If the download or unpacking fails (cause is chained).
    """
    try:
        archive_bytes = _download_links_archive()
        print(colored('Unpacking & De-Scaling link file..', 'blue'))
        # Filtering while reading keeps memory proportional to the relevant links.
        return [
            (source_id, target_id)
            for source_id, target_id in _iter_link_pairs(archive_bytes)
            if (corpora1 is None or source_id in corpora1)
            and (corpora2 is None or target_id in corpora2)
        ]
    except Exception as err:
        raise Exception(
            colored('Failed to download and de-scale Corpora Link File.', 'red')
        ) from err


def merge_corporas(corpora1: dict, corpora2: dict, links):
    """Pair up the values of two corpora through a set of id links.

    Args:
        corpora1: Dict of id -> value for the first corpus.
        corpora2: Dict of id -> value for the second corpus.
        links: Either a dict of ``corpora1 id -> corpora2 id`` or any
            iterable of ``(corpora1 id, corpora2 id)`` pairs.

    Returns:
        A dict mapping ``corpora1[source id]`` to ``corpora2[target id]`` for
        every link whose two ids are present in the respective corpus. If
        several links produce the same key, the last one wins.
    """
    print(colored('Merging Corporas..', 'blue'))
    link_pairs = links.items() if hasattr(links, 'items') else links
    # Dict membership is already O(1); no need to copy the keys into sets.
    return {
        corpora1[source_id]: corpora2[target_id]
        for source_id, target_id in link_pairs
        if source_id in corpora1 and target_id in corpora2
    }


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Merge two Tatoeba corpora language dictionaries.')
    parser.add_argument('--lang-code-1', '-l1', type=str, required=True,
                        help='The first language code', dest='langCode1')
    parser.add_argument('--lang-code-2', '-l2', type=str, required=True,
                        help='The second language code', dest='langCode2')
    parser.add_argument('--output-dir', '-o', type=str, required=False,
                        help='The directory to output to', dest='output')
    options = parser.parse_args()

    first_corpus = get_lang_corpora(options.langCode1)
    second_corpus = get_lang_corpora(options.langCode2)
    # Keep every link between the two corpora, not just the last one per id.
    link_pairs = get_corpora_link_pairs(first_corpus, second_corpus)
    mergedCorpora_dict = merge_corporas(first_corpus, second_corpus, link_pairs)

    output_name = options.langCode1 + '_' + options.langCode2 + '.csv'
    output_dir = os.getcwd()
    if options.output is not None:
        if os.path.isdir(options.output):
            output_dir = options.output
        else:
            # Say so instead of silently writing somewhere the user did not ask for.
            print(colored('%s is not a directory, using %s instead.'
                          % (options.output, output_dir), 'yellow'))
    output = os.path.join(output_dir, output_name)

    print(colored('Outputing to %s' % output, 'blue'))
    with open(output, 'w', encoding='utf-8') as f:
        f.writelines('%s\t%s\n' % (key, value)
                     for key, value in mergedCorpora_dict.items())
    print(colored('Corpora Merge complete.', 'green'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment