Skip to content

Instantly share code, notes, and snippets.

@yzhangcs
Last active December 2, 2022 12:54
Show Gist options
  • Save yzhangcs/e2fcc157ba1b714303e7b242e84d7e6e to your computer and use it in GitHub Desktop.
Save yzhangcs/e2fcc157ba1b714303e7b242e84d7e6e to your computer and use it in GitHub Desktop.
Paraphrase data via back-translation using Google Translation
# -*- coding: utf-8 -*-
import argparse
from functools import partial
from multiprocessing.dummy import Pool
import nltk
from google_trans_new import google_translator
from tqdm import tqdm
# Single module-level translator client; backtranslate() reads this global on
# every call, so all callers share the same instance.
translator = google_translator()
def backtranslate(text, src, tgt):
    r'''Translate `text` to `tgt` and back to `src` with the shared translator.

    Parameters:
        text (str): the text to paraphrase.
        src (str): source language.
        tgt (str): target language.

    Returns:
        The result of the second (return-trip) translation.
    '''
    result = text
    # Two hops: outbound, then the return trip. The arguments are passed
    # positionally as (text, <language to produce>, <language of the input>);
    # NOTE(review): this presumes google_trans_new's (text, lang_tgt, lang_src)
    # parameter order -- confirm against the installed version.
    for to_lang, from_lang in ((tgt, src), (src, tgt)):
        result = translator.translate(result, to_lang, from_lang)
        # translate() may hand back a list; only its first entry is kept.
        if isinstance(result, list):
            result = result[0]
    return result
def paraphrase(fin, fout, src, tgt):
    r'''Paraphrase data via back-translation using Google Translation.

    Every line of `fin` is stripped, passed through `backtranslate`
    (`src` -> `tgt` -> `src`) on a pool of 8 workers, tokenized with NLTK and
    written to `fout` as space-separated tokens, one output line per input
    line.

    Parameters:
        fin (str): the file to convert.
        fout (str): the file to save.
        src (str): source language.
        tgt (str): target language.
    '''
    # Language codes mapped to the language names given to nltk.word_tokenize.
    lang_map = {'en': 'english', 'de': 'german',
                'es': 'spanish', 'nl': 'dutch'}
    # The back-translated text is in the *source* language again, so the
    # tokenizer is selected by `src` (not `tgt`). Codes missing from the map
    # fall back to 'english', which is word_tokenize's own default.
    tok_lang = lang_map.get(src, 'english')

    # Explicit UTF-8 so the result does not depend on the platform's locale.
    with open(fin, 'r', encoding='utf-8') as f:
        lines = [i.strip() for i in f]

    fn = partial(backtranslate, src=src, tgt=tgt)
    # The context manager releases the pool even if a translation or the
    # tokenizer raises part-way through.
    with Pool(8) as pool, open(fout, 'w', encoding='utf-8') as f:
        # imap yields results in input order, keeping output lines aligned
        # with input lines; `total` lets the single progress bar show a
        # percentage.
        for i in tqdm(pool.imap(fn, lines), total=len(lines)):
            f.write(' '.join(nltk.word_tokenize(i, language=tok_lang)) + '\n')
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the paraphrase.
    parser = argparse.ArgumentParser(
        description='Paraphrase data via back-translation using Google Translation.'
    )
    # Both paths are mandatory; without `required=True` a missing option
    # reaches open() as None and fails with an unhelpful TypeError.
    parser.add_argument('--fin', required=True, help='the file to convert')
    parser.add_argument('--fout', required=True, help='the file to save')
    parser.add_argument('--src', default='en', help='source language')
    parser.add_argument('--tgt', default='de', help='target language')
    args = parser.parse_args()
    paraphrase(args.fin, args.fout, args.src, args.tgt)
@mvneobux
Copy link

mvneobux commented Dec 2, 2022

Thanks, friend! :) (Y)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment