Skip to content

Instantly share code, notes, and snippets.

View ymoslem's full-sized avatar
👩‍🎓

Yasmin Moslem ymoslem

👩‍🎓
View GitHub Profile
@ymoslem
ymoslem / CTranslate2-mwe.py
Created April 20, 2021 12:47
CTranslate2 MWE
import ctranslate2
def detokenize(result):
    """Join a sequence of token strings into one space-separated string.

    Args:
        result: Iterable of token strings (e.g. a translation hypothesis).

    Returns:
        str: The tokens joined with single spaces ("" for an empty input).
    """
    # str.join consumes any iterable of strings directly; the original
    # identity comprehension ([t for t in result]) added nothing.
    return " ".join(result)
def tokenize(input_sentence):
    """Split *input_sentence* on single spaces and return the token list."""
    # Placeholder whitespace tokenizer; real use would apply a subword model.
    return input_sentence.split(" ")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sentencepiece as spm
import ctranslate2
def tokenize(text, sp_source_model):
    """Tokenize *text* into SentencePiece subword pieces.

    Args:
        text: Source text to tokenize (assumed str — TODO confirm; sp.encode
            also accepts a list of strings).
        sp_source_model: Path to the SentencePiece source-side model file.
    """
    # NOTE(review): loads the model on every call — expensive if invoked per sentence.
    sp = spm.SentencePieceProcessor(sp_source_model)
    # out_type=str yields subword piece strings rather than integer ids.
    tokens = sp.encode(text, out_type=str)
    # NOTE(review): no return statement is visible here — the gist excerpt
    # appears truncated; presumably the original ends with `return tokens`.
@ymoslem
ymoslem / language_detection.py
Last active March 13, 2022 15:53
Runtime test of language detection libraries.
# -*- coding: utf-8 -*-
# pip3 install gdown langdetect fasttext pycld2 py3langid
import gdown
from datetime import datetime
# Download fasttext models
# Compressed (.ftz) fastText language-identification model covering 176 languages.
url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
output = "lid.176.ftz"
# gdown saves the file under `output` in the current working directory;
# quiet=False shows a download progress bar.
gdown.download(url, output, quiet=False)
# Remove duplicate, lines with bad characters, and shuffle
# Find the number of CPUs/cores to add to parallel: nproc --all
# sort -S 50% --parallel=4 dataset.es | uniq -u > dataset.unique.es
# shuf dataset.unique.es > dataset.unique.shuf.es
# !perl -ne '/�/ or print' dataset.unique.shuf.es > dataset.unique.shuf.cleaner.es
import re
import fasttext
# https://webz.io/free-datasets/
# Spanish: https://s3.amazonaws.com/webhose-archive/datasets/645_20170904091816.zip
# Extract text from the JSON files
import os
import json
from sentence_splitter import split_text_into_sentences
from tqdm import tqdm
@ymoslem
ymoslem / CTranslate2-example.py
Last active January 18, 2023 00:23
Example of using CTranslate2 as a translation inference engine
# First convert your OpenNMT-py or OpenNMT-tf model to a CTranslate2 model.
# pip3 install ctranslate2
# • OpenNMT-py:
# ct2-opennmt-py-converter --model_path model.pt --output_dir enja_ctranslate2 --quantization int8
# • OpenNMT-tf:
# ct2-opennmt-tf-converter --model_path model --output_dir enja_ctranslate2 --src_vocab source.vocab --tgt_vocab target.vocab --model_type TransformerBase --quantization int8
import ctranslate2
import sentencepiece as spm
@ymoslem
ymoslem / mBART-example.py
Last active January 5, 2022 00:59
Use mBART pre-trained multilingual model for translation
#!pip install transformers sentencepiece torch -U -q
# Replace "test_source.txt" with your source file.
# Change src_lang, tgt_lang, and lang_code_to_id to the source and target languages you need.
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch
from tqdm import tqdm
# Function to split source lines into chunks to avoid out-of-memory errors
### Loader functions:
# Fixed loader version; setversion() accepts exactly this value and nothing else.
_VERSION = '2.0.0'


def setversion(version):
    """Validate *version* against the fixed loader version.

    Raises:
        ValueError: If *version* differs from the fixed ``_VERSION``.
    """
    if version == _VERSION:
        return
    raise ValueError('Dynamic versioning not available')
def setalphaversions(*alpha_versions):
if alpha_versions != ():
@ymoslem
ymoslem / M2M-100-example.py
Last active August 31, 2025 19:59
Example of translating a file with M2M-100 using CTranslate2
# This example uses M2M-100 models converted to the CTranslate2 format.
# Download CTranslate2 models:
# • M2M-100 418M-parameter model: https://bit.ly/33fM1AO
# • M2M-100 1.2B-parameter model: https://bit.ly/3GYiaed
import ctranslate2
import sentencepiece as spm
import ctranslate2
# Replace with your tokenize function and source tokenization model
def tokenize(input_sentences):
    """Whitespace-tokenize each sentence; returns a list of token lists."""
    # Placeholder tokenizer for the M2M-100 example; real use would apply
    # the source SentencePiece model instead of a plain split.
    return [sentence.split(" ") for sentence in input_sentences]
# Replace with your detokenize function and target tokenization model
def detokenize(outputs):