Example of translating a file with M2M-100 using CTranslate2
# This example uses M2M-100 models converted to the CTranslate2 format.
# Download CTranslate2 models:
# • M2M-100 418M-parameter model: https://bit.ly/33fM1AO
# • M2M-100 1.2B-parameter model: https://bit.ly/3GYiaed

import ctranslate2
import sentencepiece as spm

# [Modify] Set file paths of the source and target
source_file_path = "source_test.en"
target_file_path = "target_test.ja.mt"

# [Modify] Set paths to the CTranslate2 and SentencePiece models
ct_model_path = "m2m100_ct2/"
sp_model_path = "m2m100_ct2/sentencepiece.model"

# [Modify] Set language prefixes of the source and target
src_prefix = "__en__"
tgt_prefix = "__ja__"

# [Modify] Set the device and beam size
device = "cpu"  # or "cuda" for GPU
beam_size = 5

# Load the source SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# Open the source file
with open(source_file_path, "r") as source:
    lines = source.readlines()
    source_sents = [line.strip() for line in lines]

target_prefix = [[tgt_prefix]] * len(source_sents)

# Subword the source sentences
source_sents_subworded = sp.encode(source_sents, out_type=str)
source_sents_subworded = [[src_prefix] + sent for sent in source_sents_subworded]
print("First sentence:", source_sents_subworded[0])

# Translate the source sentences
translator = ctranslate2.Translator(ct_model_path, device=device)
translations = translator.translate_batch(source_sents_subworded,
                                          batch_type="tokens",
                                          max_batch_size=2024,
                                          beam_size=beam_size,
                                          target_prefix=target_prefix)
# Keep the best hypothesis (token list) for each sentence
translations = [translation.hypotheses[0] for translation in translations]

# Desubword the target sentences
translations_desubword = sp.decode(translations)
translations_desubword = [sent[len(tgt_prefix):] for sent in translations_desubword]
print("First translation:", translations_desubword[0])

# Save the translations to a file
with open(target_file_path, "w+", encoding="utf-8") as target:
    for line in translations_desubword:
        target.write(line.strip() + "\n")

print("Done! Target file saved at:", target_file_path)
M2M-100 Multilingual Neural Machine Translation Model
M2M-100 in CTranslate2 format
CTranslate2 is a fast inference engine for Transformer models. It supports models originally trained with OpenNMT-py, OpenNMT-tf, and Fairseq, and is preferred for its high efficiency. It is cross-platform and can run on either CPU or GPU.
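As a small illustration of this flexibility, the same converted model directory can be loaded on either device, optionally with a quantized compute type. The model path below is a placeholder, and compute_type is an optional CTranslate2 setting not used in the gist script:

import ctranslate2

# Load a converted model on the CPU with 8-bit quantized weights
# (placeholder path; adjust to your converted model directory).
cpu_translator = ctranslate2.Translator("m2m100_ct2/", device="cpu", compute_type="int8")

# Or load the same model on the first GPU with 16-bit weights.
gpu_translator = ctranslate2.Translator("m2m100_ct2/", device="cuda", device_index=0, compute_type="float16")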
You can download one of the M2M-100 models, already converted to the CTranslate2 format:
- M2M-100 418M-parameter model: https://bit.ly/33fM1AO
- M2M-100 1.2B-parameter model: https://bit.ly/3GYiaed
How to convert an M2M-100 model to CTranslate2
Alternatively, you can convert an M2M-100 model to the CTranslate2 format yourself as follows:
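The snippet below is a minimal sketch using CTranslate2's Python converter API (the ct2-fairseq-converter command-line tool is the equivalent). The file names are assumptions based on the original fairseq M2M-100 418M release and may differ for the checkpoint you download; adjust the paths accordingly.

from ctranslate2.converters import FairseqConverter
import shutil

# Assumed file names from the original fairseq M2M-100 418M release, placed in
# the current directory: 418M_last_checkpoint.pt, model_dict.128k.txt, spm.128k.model.
converter = FairseqConverter(
    model_path="418M_last_checkpoint.pt",    # original fairseq checkpoint
    data_dir=".",                            # directory containing the dictionary
    fixed_dictionary="model_dict.128k.txt",  # shared dictionary used by M2M-100
)
converter.convert("m2m100_ct2")              # writes the CTranslate2 model here

# Keep the SentencePiece model next to the converted model so that the
# translation script can load it as m2m100_ct2/sentencepiece.model.
shutil.copy("spm.128k.model", "m2m100_ct2/sentencepiece.model")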
Translation with M2M-100 models
You can use the script in this gist to translate a source file using M2M-100, as follows:
1. Make sure you change the paths to the source file (source_file_path), the CTranslate2 model (ct_model_path), and the SentencePiece model (sp_model_path).
2. M2M-100 uses a source language token and a target language token. The latter is used for prefix-constrained decoding, to generate the translation in the specified language. In the script, make sure you adjust src_prefix and tgt_prefix accordingly (see the minimal sketch after this list). The list of supported languages and their language codes can be found here.
3. Now, run the Python script as usual, which should translate the source file and generate the target file at the path specified with target_file_path.
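To make the role of the language prefixes concrete, here is a minimal single-sentence sketch of the same steps as the gist script (the sentence, paths, and language pair are placeholders):

import ctranslate2
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("m2m100_ct2/sentencepiece.model")
translator = ctranslate2.Translator("m2m100_ct2/", device="cpu")

# Tokenize one English sentence and prepend the source language token.
tokens = ["__en__"] + sp.encode("How are you?", out_type=str)

# The target language token is passed as a decoding prefix, which forces
# the first generated token and therefore the output language.
result = translator.translate_batch([tokens], target_prefix=[["__ja__"]])
target_tokens = result[0].hypotheses[0]

# Drop the "__ja__" prefix token before detokenizing.
print(sp.decode(target_tokens[1:]))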
Testing M2M-100 with English-to-Japanese
- Test dataset
- M2M-100 418M-parameter model
- M2M-100 1.2B-parameter model
Using M2M-100 models with a GUI
You can also use M2M-100 models in DesktopTranslator, a local cross-platform machine translation GUI. It also has stand-alone executables for Mac and Windows.