ymoslem · February 23, 2022 08:33
diff --git a/CTranslate2-example-adv.py b/CTranslate2-example-adv.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

 import sentencepiece as spm
 import ctranslate2


 def tokenize(text, sp_source_model):
     """Encode text into SentencePiece subword pieces.

     Args:
         text: Input handed unchanged to ``SentencePieceProcessor.encode``;
             the callers in this file pass a list of sentences.
         sp_source_model: Value handed to the ``SentencePieceProcessor``
             constructor (the script passes a ``.model`` file path). Despite
             the name, ``translate_with_prefix`` also calls this with the
             target-side model to tokenize its prefixes.

     Returns:
         The result of ``encode`` with ``out_type=str`` (string pieces
         rather than ids).
     """
     # A fresh processor is built on every call.
     processor = spm.SentencePieceProcessor(sp_source_model)
     return processor.encode(text, out_type=str)


 def detokenize(text, sp_target_model):
     """Decode SentencePiece subword pieces back into text.

     Args:
         text: Pieces handed unchanged to ``SentencePieceProcessor.decode``;
             the callers in this file pass the ``"tokens"`` entry of a
             translation hypothesis.
         sp_target_model: Value handed to the ``SentencePieceProcessor``
             constructor (the script passes a ``.model`` file path).

     Returns:
         The result of ``decode`` for the given pieces.
     """
     # A fresh processor is built on every call.
     return spm.SentencePieceProcessor(sp_target_model).decode(text)


 def translate(source_sents, model_path, sp_source_model, sp_target_model, beam_size):
     """Translate sentences and return the best hypothesis for each one.

     Args:
         source_sents: Sentences to translate; passed to ``tokenize``.
         model_path: Model location given to ``ctranslate2.Translator``.
         sp_source_model: SentencePiece model used to tokenize the source.
         sp_target_model: SentencePiece model used to decode the output.
         beam_size: Beam size forwarded to ``translate_batch``.

     Returns:
         A list with one decoded string per translation result, built from
         the first hypothesis of each result.
     """
     source_sents_tok = tokenize(source_sents, sp_source_model)
     translator = ctranslate2.Translator(model_path, "cpu")  # or "cuda"
     translations_tok = translator.translate_batch(
         source=source_sents_tok,
         beam_size=beam_size,
         batch_type="tokens",
         # NOTE(review): 1204 may be a typo for 1024 - confirm; kept as-is.
         max_batch_size=1204,
         replace_unknowns=True,
     )
     # Load the target model once instead of once per sentence: going through
     # detokenize() inside the loop rebuilt the processor for every result.
     sp_target = spm.SentencePieceProcessor(sp_target_model)
     return [sp_target.decode(translation[0]["tokens"]) for translation in translations_tok]


 def translate_with_prefix(source_sents, prefix_phrases, model_path, sp_source_model, sp_target_model, beam_size):
     """Translate sentences while forcing each output to start with a prefix.

     Args:
         source_sents: Sentences to translate; tokenized with
             ``sp_source_model``.
         prefix_phrases: Target-side prefixes; tokenized with
             ``sp_target_model`` and passed as ``target_prefix``.
         model_path: Model location given to ``ctranslate2.Translator``.
         sp_source_model: SentencePiece model for the source side.
         sp_target_model: SentencePiece model for the target side (used for
             both the prefixes and the decoded output).
         beam_size: Beam size forwarded to ``translate_batch``.

     Returns:
         A list with one decoded string per translation result. Although 10
         hypotheses are requested with ``return_alternatives=True``, only the
         first hypothesis of each result is returned.
     """
     source_sents_tok = tokenize(source_sents, sp_source_model)
     prefix_phrases_tok = tokenize(prefix_phrases, sp_target_model)
     translator = ctranslate2.Translator(model_path, "cpu")  # or "cuda"
     translations_tok = translator.translate_batch(
         source=source_sents_tok,
         target_prefix=prefix_phrases_tok,
         num_hypotheses=10,
         return_alternatives=True,
         beam_size=beam_size,
     )
     # Load the target model once instead of once per sentence: going through
     # detokenize() inside the loop rebuilt the processor for every result.
     sp_target = spm.SentencePieceProcessor(sp_target_model)
     return [sp_target.decode(translation[0]["tokens"]) for translation in translations_tok]


 # Model location handed to ctranslate2.Translator, plus the SentencePiece
 # models for the source and target sides.
 model_path = "./un_fren/model"
 sp_source_model = "./un_fren/subword/fr.model"
 sp_target_model = "./un_fren/subword/en.model"

 beam_size = 3

 # Input sentence(s) and the target-side prefix each translation must start with.
 source_sents = ["la crise liée à la covid-19 a creusé les inégalités préexistantes"]
 prefix_phrases = ["the covid-19 crisis has intensified"]


 print("• Source:", *source_sents, sep="\n")

 # Unconstrained translation.
 original_translation = translate(
     source_sents=source_sents,
     model_path=model_path,
     sp_source_model=sp_source_model,
     sp_target_model=sp_target_model,
     beam_size=beam_size,
 )
 print("• Translation without a prefix:", *original_translation, sep="\n")

 # Translation constrained to begin with the given prefix.
 hypotheses = translate_with_prefix(
     source_sents=source_sents,
     prefix_phrases=prefix_phrases,
     model_path=model_path,
     sp_source_model=sp_source_model,
     sp_target_model=sp_target_model,
     beam_size=beam_size,
 )
 print("• Translation with the prefix':", *hypotheses, sep="\n")
	#!/usr/bin/env python3
	# -- coding: utf-8 --

	import sentencepiece as spm
	import ctranslate2


	def tokenize(text, sp_source_model):
	    """Encode text into SentencePiece subword pieces.

	    Args:
	        text: Input handed unchanged to ``SentencePieceProcessor.encode``;
	            the callers in this file pass a list of sentences.
	        sp_source_model: Value handed to the ``SentencePieceProcessor``
	            constructor (the script passes a ``.model`` file path). Despite
	            the name, ``translate_with_prefix`` also calls this with the
	            target-side model to tokenize its prefixes.

	    Returns:
	        The result of ``encode`` with ``out_type=str`` (string pieces
	        rather than ids).
	    """
	    # A fresh processor is built on every call.
	    processor = spm.SentencePieceProcessor(sp_source_model)
	    return processor.encode(text, out_type=str)


	def detokenize(text, sp_target_model):
	    """Decode SentencePiece subword pieces back into text.

	    Args:
	        text: Pieces handed unchanged to ``SentencePieceProcessor.decode``;
	            the callers in this file pass the ``"tokens"`` entry of a
	            translation hypothesis.
	        sp_target_model: Value handed to the ``SentencePieceProcessor``
	            constructor (the script passes a ``.model`` file path).

	    Returns:
	        The result of ``decode`` for the given pieces.
	    """
	    # A fresh processor is built on every call.
	    return spm.SentencePieceProcessor(sp_target_model).decode(text)


	def translate(source_sents, model_path, sp_source_model, sp_target_model, beam_size):
	    """Translate sentences and return the best hypothesis for each one.

	    Args:
	        source_sents: Sentences to translate; passed to ``tokenize``.
	        model_path: Model location given to ``ctranslate2.Translator``.
	        sp_source_model: SentencePiece model used to tokenize the source.
	        sp_target_model: SentencePiece model used to decode the output.
	        beam_size: Beam size forwarded to ``translate_batch``.

	    Returns:
	        A list with one decoded string per translation result, built from
	        the first hypothesis of each result.
	    """
	    source_sents_tok = tokenize(source_sents, sp_source_model)
	    translator = ctranslate2.Translator(model_path, "cpu")  # or "cuda"
	    translations_tok = translator.translate_batch(
	        source=source_sents_tok,
	        beam_size=beam_size,
	        batch_type="tokens",
	        # NOTE(review): 1204 may be a typo for 1024 - confirm; kept as-is.
	        max_batch_size=1204,
	        replace_unknowns=True,
	    )
	    # Load the target model once instead of once per sentence: going through
	    # detokenize() inside the loop rebuilt the processor for every result.
	    sp_target = spm.SentencePieceProcessor(sp_target_model)
	    return [sp_target.decode(translation[0]["tokens"]) for translation in translations_tok]


	def translate_with_prefix(source_sents, prefix_phrases, model_path, sp_source_model, sp_target_model, beam_size):
	    """Translate sentences while forcing each output to start with a prefix.

	    Args:
	        source_sents: Sentences to translate; tokenized with
	            ``sp_source_model``.
	        prefix_phrases: Target-side prefixes; tokenized with
	            ``sp_target_model`` and passed as ``target_prefix``.
	        model_path: Model location given to ``ctranslate2.Translator``.
	        sp_source_model: SentencePiece model for the source side.
	        sp_target_model: SentencePiece model for the target side (used for
	            both the prefixes and the decoded output).
	        beam_size: Beam size forwarded to ``translate_batch``.

	    Returns:
	        A list with one decoded string per translation result. Although 10
	        hypotheses are requested with ``return_alternatives=True``, only the
	        first hypothesis of each result is returned.
	    """
	    source_sents_tok = tokenize(source_sents, sp_source_model)
	    prefix_phrases_tok = tokenize(prefix_phrases, sp_target_model)
	    translator = ctranslate2.Translator(model_path, "cpu")  # or "cuda"
	    translations_tok = translator.translate_batch(
	        source=source_sents_tok,
	        target_prefix=prefix_phrases_tok,
	        num_hypotheses=10,
	        return_alternatives=True,
	        beam_size=beam_size,
	    )
	    # Load the target model once instead of once per sentence: going through
	    # detokenize() inside the loop rebuilt the processor for every result.
	    sp_target = spm.SentencePieceProcessor(sp_target_model)
	    return [sp_target.decode(translation[0]["tokens"]) for translation in translations_tok]


	# Model location handed to ctranslate2.Translator, plus the SentencePiece
	# models for the source and target sides.
	model_path = "./un_fren/model"
	sp_source_model = "./un_fren/subword/fr.model"
	sp_target_model = "./un_fren/subword/en.model"

	beam_size = 3

	# Input sentence(s) and the target-side prefix each translation must start with.
	source_sents = ["la crise liée à la covid-19 a creusé les inégalités préexistantes"]
	prefix_phrases = ["the covid-19 crisis has intensified"]


	print("• Source:", *source_sents, sep="\n")

	# Unconstrained translation.
	original_translation = translate(
	    source_sents=source_sents,
	    model_path=model_path,
	    sp_source_model=sp_source_model,
	    sp_target_model=sp_target_model,
	    beam_size=beam_size,
	)
	print("• Translation without a prefix:", *original_translation, sep="\n")

	# Translation constrained to begin with the given prefix.
	hypotheses = translate_with_prefix(
	    source_sents=source_sents,
	    prefix_phrases=prefix_phrases,
	    model_path=model_path,
	    sp_source_model=sp_source_model,
	    sp_target_model=sp_target_model,
	    beam_size=beam_size,
	)
	print("• Translation with the prefix':", *hypotheses, sep="\n")