Example of translating a file with M2M-100 using CTranslate2
# This example uses M2M-100 models converted to the CTranslate2 format.
# Download CTranslate2 models:
# • M2M-100 418M-parameter model: https://bit.ly/33fM1AO
# • M2M-100 1.2B-parameter model: https://bit.ly/3GYiaed
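# Note: as an alternative to the pre-converted downloads above, the original
# Hugging Face checkpoints can be converted locally with CTranslate2's
# converter, e.g. (shown for illustration; check the CTranslate2 docs):
#   ct2-transformers-converter --model facebook/m2m100_418M --output_dir m2m100_ct2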

import ctranslate2
import sentencepiece as spm

# [Modify] Set file paths of the source and target
source_file_path = "source_test.en"
target_file_path = "target_test.ja.mt"

# [Modify] Set paths to the CTranslate2 and SentencePiece models
ct_model_path = "m2m100_ct2/"
sp_model_path = "m2m100_ct2/sentencepiece.model"

# [Modify] Set language prefixes of the source and target
src_prefix = "__en__"
tgt_prefix = "__ja__"
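# Note: M2M-100 language tokens follow the "__xx__" pattern with two-letter
# ISO 639-1 codes (e.g. "__de__" for German); both prefixes must match tokens
# in the model's vocabulary.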

# [Modify] Set the device and beam size
device = "cpu"  # or "cuda" for GPU
beam_size = 5

# Load the SentencePiece model
sp = spm.SentencePieceProcessor()
sp.load(sp_model_path)

# Open the source file and read the sentences
with open(source_file_path, "r") as source:
    lines = source.readlines()
    source_sents = [line.strip() for line in lines]
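
# Give every sentence a target prefix so decoding starts with the language token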
target_prefix = [[tgt_prefix]] * len(source_sents)

# Subword the source sentences and prepend the source language token
source_sents_subworded = sp.encode(source_sents, out_type=str)
source_sents_subworded = [[src_prefix] + sent for sent in source_sents_subworded]
print("First sentence:", source_sents_subworded[0])

# Translate the source sentences
# (batch_type="tokens" makes max_batch_size a token budget, not a sentence count)
translator = ctranslate2.Translator(ct_model_path, device=device)
translations = translator.translate_batch(source_sents_subworded,
                                          batch_type="tokens",
                                          max_batch_size=2024,
                                          beam_size=beam_size,
                                          target_prefix=target_prefix)
# Keep the tokens of the best hypothesis for each sentence
translations = [translation.hypotheses[0] for translation in translations]

# Desubword the target sentences and strip the leading target language token
translations_desubword = sp.decode(translations)
translations_desubword = [sent[len(tgt_prefix):] for sent in translations_desubword]
print("First translation:", translations_desubword[0])

# Save the translations to a file
with open(target_file_path, "w+", encoding="utf-8") as target:
    for line in translations_desubword:
        target.write(line.strip() + "\n")

print("Done! Target file saved at:", target_file_path)
Thanks! And for pointing me to NLLB-200 as well, I've only just started to explore the space and didn't come across that one yet. I found your forum post now and will certainly include it in my tests.
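For anyone adapting the script above to NLLB-200: the main changes are the FLORES-200 language codes (e.g. eng_Latn, jpn_Jpan instead of __en__/__ja__) and letting the tokenizer handle the source language token. A rough sketch following the pattern in the CTranslate2 documentation; the model name and output directory here are assumptions:

# Assumes the model was converted first, e.g.:
#   ct2-transformers-converter --model facebook/nllb-200-distilled-600M --output_dir nllb200_ct2
import ctranslate2
import transformers

src_lang, tgt_lang = "eng_Latn", "jpn_Jpan"

tokenizer = transformers.AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M", src_lang=src_lang
)
translator = ctranslate2.Translator("nllb200_ct2/", device="cpu")

source = tokenizer.convert_ids_to_tokens(tokenizer.encode("Hello, world!"))
results = translator.translate_batch([source], target_prefix=[[tgt_lang]])
target = results[0].hypotheses[0][1:]  # drop the target language token
print(tokenizer.decode(tokenizer.convert_tokens_to_ids(target)))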