Skip to content

Instantly share code, notes, and snippets.

@Norod
Created January 24, 2022 17:25
Show Gist options
  • Select an option

  • Save Norod/aff94564639f7baf81c6f43efbce5a5e to your computer and use it in GitHub Desktop.

Select an option

Save Norod/aff94564639f7baf81c6f43efbce5a5e to your computer and use it in GitHub Desktop.
Translate a CSV file using Helsinki-NLP's Hugging Face models
# !pip install sentencepiece transformers tokenizers
from transformers import MarianTokenizer, MarianMTModel
from typing import List
import csv
# Translate every cell of a CSV file with a Helsinki-NLP MarianMT model and
# write the translated rows, in the same order, to a second CSV file.

src = "en"  # source language
trg = "he"  # target language
model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
# Empty cells are replaced with this placeholder before being tokenized.
empty_string = " - "
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)


def _prepare_row(row):
    """Return the cells of *row* as the list of strings to translate.

    Non-empty cells have the HTML right-single-quote entity replaced by a
    plain apostrophe; empty cells are replaced by ``empty_string``.
    """
    return [
        # Handle the well-formed entity ("&rsquo;") first so no stray ";" is
        # left behind, then the semicolon-less form the script always handled.
        item.replace('&rsquo;', '\'').replace('&rsquo', '\'') if item
        else empty_string
        for item in row
    ]


# newline='' is what the csv module expects for both readers and writers, and
# an explicit UTF-8 encoding keeps the translated (non-ASCII) output writable
# regardless of the platform's default encoding.
# NOTE(review): the input is assumed to be UTF-8 as well - confirm.
with open('profiles_revised_100l.csv', 'r',
          encoding='utf-8', newline='') as input_file, \
        open('profiles_revised_100l_t.csv', 'w',
             encoding='utf-8', newline='') as output_file:
    csv_reader = csv.reader(input_file)
    csv_writer = csv.writer(output_file, delimiter=',')
    for row in csv_reader:
        current_line = csv_reader.line_num
        input_row = _prepare_row(row)
        if input_row:
            # One batch per CSV row: each cell is one sequence. truncation=True
            # clips cells that exceed the model's maximum input length instead
            # of letting generation fail on them.
            batch = tokenizer(input_row, padding=True, truncation=True,
                              return_tensors="pt")
            gen = model.generate(**batch)
            results = tokenizer.batch_decode(gen, skip_special_tokens=True)
        else:
            # Blank input line: nothing to translate, but still emit a row so
            # output rows stay aligned with input rows.
            results = []
        csv_writer.writerow(results)
        # Progress report (first line, then every 25th) and a flush so partial
        # output is on disk while the run is still in progress.
        if current_line % 25 == 0 or current_line == 1:
            print(f"{current_line}")
            output_file.flush()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment