# pip3 install openai
import openai
import time

OPENAI_API_KEY = "your_api_key_here"
openai.api_key = OPENAI_API_KEY

# The prompt is truncated in the original snippet; closing it with "English:"
# cues the model to produce the translation (an assumption about the format).
prompt = """French: La semaine dernière, quelqu’un m’a fait part de sa gratitude envers notre travail.
English:"""
import streamlit as st
import openai

# Change to your OpenAI API key
OPENAI_API_KEY = "your_OpenAI_API_key_here"
openai.api_key = OPENAI_API_KEY

# Set the page layout to wide
st.set_page_config(page_title="Extract Terms", page_icon=None, layout="wide")
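# Hedged sketch (not the original code) of how such an app might continue:
# a text area plus a legacy Completion call; the model name and the prompt
# wording are assumptions.
st.title("Extract Terms")
source_text = st.text_area("Paste the text to extract terminology from:")

if st.button("Extract") and source_text.strip():
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"Extract the key terms from the following text:\n\n{source_text}\n\nTerms:",
        max_tokens=200,
        temperature=0,
    )
    st.write(response["choices"][0]["text"].strip())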
import ctranslate2

# Replace with your tokenize function and source tokenization model
def tokenize(input_sentences):
    tokens = [input_sentence.split(" ") for input_sentence in input_sentences]
    return tokens

# Replace with your detokenize function and target tokenization model
def detokenize(outputs):
    # Truncated in the original; joining on spaces mirrors tokenize() above.
    translations = [" ".join(output) for output in outputs]
    return translations
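# Hedged usage sketch: the model directory "ct2_model" is a placeholder for
# your converted CTranslate2 model.
translator = ctranslate2.Translator("ct2_model", device="cpu")
results = translator.translate_batch(tokenize(["Hello world !"]))
print(detokenize([result.hypotheses[0] for result in results]))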
# This example uses M2M-100 models converted to the CTranslate2 format.
# Download CTranslate2 models:
# • M2M-100 418M-parameter model: https://bit.ly/33fM1AO
# • M2M-100 1.2B-parameter model: https://bit.ly/3GYiaed

import ctranslate2
import sentencepiece as spm
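# Hedged sketch based on standard CTranslate2 usage for M2M-100; the model
# directory and SentencePiece file names are assumptions. M2M-100 expects the
# source language token prepended and the target language token as a prefix.
sp = spm.SentencePieceProcessor("sentencepiece.model")
translator = ctranslate2.Translator("m2m100_418m", device="cpu")

source = ["__es__"] + sp.encode("¡Hola, mundo!", out_type=str)
results = translator.translate_batch([source], target_prefix=[["__en__"]])
# Drop the leading target language token before detokenizing.
print(sp.decode(results[0].hypotheses[0][1:]))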
### Loader functions:
_VERSION = '2.0.0'

def setversion(version):
    if version != _VERSION:
        raise ValueError('Dynamic versioning not available')

def setalphaversions(*alpha_versions):
    # Truncated in the original; raising here mirrors setversion() above.
    if alpha_versions != ():
        raise ValueError('Alpha versions not available')
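# Usage: only the pinned version is accepted; anything else raises.
setversion('2.0.0')       # OK
# setversion('1.9.0')     # ValueError: Dynamic versioning not available
setalphaversions()        # OK: no alpha versions are supported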
#!pip install transformers sentencepiece torch -U -q

# Replace "test_source.txt" with your source file.
# Change src_lang, tgt_lang, and lang_code_to_id to the source and target languages you need.
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch
from tqdm import tqdm

# Function to split source lines into chunks to avoid out-of-memory errors
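# Hedged continuation (not the original code): the chunk size, language
# codes, and batch handling are assumptions.
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="fr_XX")
model = MBartForConditionalGeneration.from_pretrained(model_name)

with open("test_source.txt", encoding="utf-8") as f:
    source_lines = [line.strip() for line in f]

translations = []
for chunk in tqdm(list(chunks(source_lines, 8))):
    inputs = tokenizer(chunk, return_tensors="pt", padding=True)
    with torch.no_grad():
        generated = model.generate(
            **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
        )
    translations += tokenizer.batch_decode(generated, skip_special_tokens=True)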
# First convert your OpenNMT-py or OpenNMT-tf model to a CTranslate2 model.
# pip3 install ctranslate2
# • OpenNMT-py:
# ct2-opennmt-py-converter --model_path model.pt --output_dir enja_ctranslate2 --quantization int8
# • OpenNMT-tf:
# ct2-opennmt-tf-converter --model_path model --output_dir enja_ctranslate2 --src_vocab source.vocab --tgt_vocab target.vocab --model_type TransformerBase --quantization int8

import ctranslate2
import sentencepiece as spm
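# Hedged translation sketch; the SentencePiece model file names are
# assumptions matching the vocab files mentioned above.
sp_source = spm.SentencePieceProcessor("source.model")
sp_target = spm.SentencePieceProcessor("target.model")
translator = ctranslate2.Translator("enja_ctranslate2", device="cpu")

tokens = sp_source.encode("Hello world!", out_type=str)
results = translator.translate_batch([tokens])
print(sp_target.decode(results[0].hypotheses[0]))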
# https://webz.io/free-datasets/
# Spanish: https://s3.amazonaws.com/webhose-archive/datasets/645_20170904091816.zip

# Extract text from the JSON files
import os
import json
from sentence_splitter import split_text_into_sentences
from tqdm import tqdm
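# Hedged continuation: the extracted directory name and the "text" JSON field
# are assumptions about the dataset layout.
directory = "645_20170904091816"
sentences = []
for file_name in tqdm(os.listdir(directory)):
    if not file_name.endswith(".json"):
        continue
    with open(os.path.join(directory, file_name), encoding="utf-8") as json_file:
        article = json.load(json_file)
    sentences += split_text_into_sentences(text=article.get("text", ""), language="es")

with open("dataset.es", "w", encoding="utf-8") as output_file:
    output_file.write("\n".join(sentences))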
# Remove duplicates and lines with bad characters, then shuffle.
# Find the number of CPUs/cores to pass to --parallel: nproc --all
# sort -S 50% --parallel=4 dataset.es | uniq -u > dataset.unique.es
# shuf dataset.unique.es > dataset.unique.shuf.es
# !perl -ne '/�/ or print' dataset.unique.shuf.es > dataset.unique.shuf.cleaner.es

import re
import fasttext
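# Hedged continuation: language filtering with fastText; the file names and
# the 0.9 confidence threshold are assumptions.
model = fasttext.load_model("lid.176.ftz")

with open("dataset.unique.shuf.cleaner.es", encoding="utf-8") as infile, \
        open("dataset.filtered.es", "w", encoding="utf-8") as outfile:
    for line in infile:
        # Normalize whitespace before classifying the line.
        line = re.sub(r"\s+", " ", line.strip())
        if not line:
            continue
        labels, scores = model.predict(line)
        if labels[0] == "__label__es" and scores[0] > 0.9:
            outfile.write(line + "\n")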
# -*- coding: utf-8 -*-
# pip3 install gdown langdetect fasttext pycld2 py3langid
import gdown
from datetime import datetime

# Download the fastText language-identification model
url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
output = "lid.176.ftz"
gdown.download(url, output, quiet=False)
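# Hedged sketch of what the imports suggest: timing different language
# detectors on the same sentence; the sample text is an assumption.
import fasttext
import langdetect
import py3langid

sample = "La semaine dernière, quelqu'un m'a fait part de sa gratitude."

ft_model = fasttext.load_model("lid.176.ftz")

start = datetime.now()
print("fasttext:", ft_model.predict(sample)[0][0], datetime.now() - start)

start = datetime.now()
print("langdetect:", langdetect.detect(sample), datetime.now() - start)

start = datetime.now()
print("py3langid:", py3langid.classify(sample)[0], datetime.now() - start)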