Skip to content

Instantly share code, notes, and snippets.

@yzhangcs
Created January 7, 2021 09:57
Show Gist options
  • Save yzhangcs/4343911e2a754b9265f2e6b8e2399e01 to your computer and use it in GitHub Desktop.
Save yzhangcs/4343911e2a754b9265f2e6b8e2399e01 to your computer and use it in GitHub Desktop.
Script for translation
# -*- coding: utf-8 -*-
import argparse
from functools import partial
from multiprocessing.dummy import Pool
import nltk
from google_trans_new import google_translator
from tqdm import tqdm
# Translator client created once at import time; translate() below sends
# every request through this single shared instance.
translator = google_translator()
def translate(text, src, tgt):
    r'''Translate a piece of text with the module-level translator.

    Parameters:
        text (str): the text to translate.
        src (str): source language.
        tgt (str): target language.

    Returns:
        The translator's result; when that result is a list, only its first
        element is returned.
    '''
    # The client takes the target language before the source language.
    result = translator.translate(text, tgt, src)
    return result[0] if isinstance(result, list) else result
def run(fin, fout, src, tgt):
    r'''Translate a file line by line using Google Translation.

    Every line of ``fin`` is stripped and translated from ``src`` to ``tgt``
    by a pool of 8 worker threads. Each translation is then tokenized with
    NLTK and written to ``fout`` as space-separated tokens, one output line
    per input line, in the order of the input.

    Parameters:
        fin (str): the file to convert (read as UTF-8).
        fout (str): the file to save (written as UTF-8).
        src (str): source language.
        tgt (str): target language; one of 'en', 'de', 'es', 'nl'.

    Raises:
        KeyError: if ``tgt`` has no entry in the tokenizer language map.
    '''
    lang_map = {'en': 'english', 'de': 'german', 'es': 'spanish', 'nl': 'dutch'}
    # Resolve the tokenizer language first so that an unsupported target
    # fails before any request is sent and before ``fout`` is truncated.
    language = lang_map[tgt]

    # Explicit encoding: translated text is rarely pure ASCII and must not
    # depend on the platform's default locale.
    with open(fin, 'r', encoding='utf-8') as f:
        lines = [i.strip() for i in f]

    fn = partial(translate, src=src, tgt=tgt)
    # The context manager shuts the pool down even when a translation or a
    # write fails, so worker threads are never left behind.
    with Pool(8) as pool:
        with open(fout, 'w', encoding='utf-8') as f:
            # imap yields results in input order; total lets tqdm show a
            # real progress bar instead of a bare counter.
            for i in tqdm(pool.imap(fn, lines), total=len(lines)):
                try:
                    f.write(' '.join(nltk.word_tokenize(i, language=language)) + '\n')
                except Exception:
                    # Show the offending translation, then re-raise with the
                    # original traceback intact.
                    print(i)
                    raise
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to run().
    parser = argparse.ArgumentParser(
        description='Script for translation.'
    )
    # Both paths are mandatory: without them run() would receive None and
    # fail inside open() instead of printing a usage message.
    parser.add_argument('--fin', required=True, help='the file to convert')
    parser.add_argument('--fout', required=True, help='the file to save')
    parser.add_argument('--src', default='en', help='source language')
    parser.add_argument('--tgt', default='de', help='target language')
    args = parser.parse_args()
    run(args.fin, args.fout, args.src, args.tgt)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment