Skip to content

Instantly share code, notes, and snippets.

@ymoslem
Last active February 23, 2022 08:33
Show Gist options
  • Save ymoslem/9784d1c2d2b67320b007838a6c643554 to your computer and use it in GitHub Desktop.
Save ymoslem/9784d1c2d2b67320b007838a6c643554 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sentencepiece as spm
import ctranslate2
def tokenize(text, sp_source_model):
    """Encode text into SentencePiece subword pieces.

    Args:
        text: Input handed straight to ``SentencePieceProcessor.encode``.
            The callers in this file pass a list of sentences.
        sp_source_model: Path of the SentencePiece model file to load.

    Returns:
        The encoder output, with pieces represented as strings
        (``out_type=str``) rather than integer ids.
    """
    # A fresh processor is built from the model file on every call.
    processor = spm.SentencePieceProcessor(sp_source_model)
    return processor.encode(text, out_type=str)
def detokenize(text, sp_target_model):
    """Decode SentencePiece pieces back into plain text.

    Args:
        text: Value handed straight to ``SentencePieceProcessor.decode``.
            Despite the name, the callers in this file pass the token
            sequence of one translation hypothesis.
        sp_target_model: Path of the SentencePiece model file to load.

    Returns:
        Whatever ``decode`` produces for the given input.
    """
    # A fresh processor is built from the model file on every call.
    processor = spm.SentencePieceProcessor(sp_target_model)
    return processor.decode(text)
def translate(source_sents, model_path, sp_source_model, sp_target_model, beam_size, device="cpu"):
    """Translate sentences with a CTranslate2 model and SentencePiece subwords.

    Args:
        source_sents: Sentences to translate; passed to ``tokenize`` as-is
            (the script in this file passes a list of strings).
        model_path: Directory of the CTranslate2 model.
        sp_source_model: Path of the source-side SentencePiece model.
        sp_target_model: Path of the target-side SentencePiece model.
        beam_size: Beam size forwarded to ``translate_batch``.
        device: Device string forwarded to ``ctranslate2.Translator``
            (``"cpu"`` by default; the original code mentions ``"cuda"``
            as the alternative).

    Returns:
        A list with one detokenized string per result of
        ``translate_batch``, built from the hypothesis at index 0.
    """
    source_sents_tok = tokenize(source_sents, sp_source_model)
    translator = ctranslate2.Translator(model_path, device)
    translations_tok = translator.translate_batch(
        source=source_sents_tok,
        beam_size=beam_size,
        batch_type="tokens",
        # NOTE(review): 1204 looks like a transposition of 1024 - confirm.
        # Kept unchanged because it only affects batching.
        max_batch_size=1204,
        replace_unknowns=True,
    )
    # Load the target model once instead of once per translated sentence.
    sp_target = spm.SentencePieceProcessor(sp_target_model)
    return [sp_target.decode(translation[0]["tokens"]) for translation in translations_tok]
def translate_with_prefix(source_sents, prefix_phrases, model_path, sp_source_model, sp_target_model, beam_size, device="cpu"):
    """Translate sentences while forcing each output to start with a prefix.

    Args:
        source_sents: Sentences to translate; passed to ``tokenize`` as-is
            (the script in this file passes a list of strings).
        prefix_phrases: Target-language prefixes, tokenized with the target
            SentencePiece model and passed as ``target_prefix``.
        model_path: Directory of the CTranslate2 model.
        sp_source_model: Path of the source-side SentencePiece model.
        sp_target_model: Path of the target-side SentencePiece model.
        beam_size: Beam size forwarded to ``translate_batch``.
        device: Device string forwarded to ``ctranslate2.Translator``
            (``"cpu"`` by default; the original code mentions ``"cuda"``
            as the alternative).

    Returns:
        A list with one detokenized string per result of
        ``translate_batch``, built from the hypothesis at index 0.
    """
    source_sents_tok = tokenize(source_sents, sp_source_model)
    # Prefixes are in the target language, hence the target model here.
    prefix_phrases_tok = tokenize(prefix_phrases, sp_target_model)
    translator = ctranslate2.Translator(model_path, device)
    translations_tok = translator.translate_batch(
        source=source_sents_tok,
        target_prefix=prefix_phrases_tok,
        num_hypotheses=10,
        return_alternatives=True,
        beam_size=beam_size,
    )
    # NOTE(review): 10 hypotheses are requested but only index 0 of each
    # result is returned; the return shape is kept for existing callers.
    # Load the target model once instead of once per translated sentence.
    sp_target = spm.SentencePieceProcessor(sp_target_model)
    return [sp_target.decode(translation[0]["tokens"]) for translation in translations_tok]
# Demo: translate one French sentence, first freely and then with a forced
# English prefix, and print both results.
source_sents = ["la crise liée à la covid-19 a creusé les inégalités préexistantes"]
prefix_phrases = ["the covid-19 crisis has intensified"]

# Paths of the CTranslate2 model and the SentencePiece subword models.
model_path = "./un_fren/model"
sp_source_model = "./un_fren/subword/fr.model"
sp_target_model = "./un_fren/subword/en.model"
beam_size = 3

print("• Source:", *source_sents, sep="\n")

original_translation = translate(source_sents, model_path, sp_source_model, sp_target_model, beam_size)
print("• Translation without a prefix:", *original_translation, sep="\n")

hypotheses = translate_with_prefix(source_sents, prefix_phrases, model_path, sp_source_model, sp_target_model, beam_size)
# The original label carried a stray apostrophe ("prefix':"); removed here.
print("• Translation with the prefix:", *hypotheses, sep="\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment