# pip3 install openai
import openai
import time

OPENAI_API_KEY = "your_api_key_here"
openai.api_key = OPENAI_API_KEY

# The prompt is truncated in the original snippet; closing it with "English:"
# cues the model to produce the translation (an assumption about the format).
prompt = """French: La semaine dernière, quelqu’un m’a fait part de sa gratitude envers notre travail.
English:"""
import streamlit as st
import openai

# Change to your OpenAI API key
OPENAI_API_KEY = "your_OpenAI_API_key_here"
openai.api_key = OPENAI_API_KEY

# Set the page layout to wide
st.set_page_config(page_title="Extract Terms", page_icon=None, layout="wide")
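# Hedged sketch (not the original code) of how such an app might continue:
# a text area plus a legacy Completion call; the model name and the prompt
# wording are assumptions.
st.title("Extract Terms")
source_text = st.text_area("Paste the text to extract terminology from:")

if st.button("Extract") and source_text.strip():
    response = openai.Completion.create(
        model="text-davinci-003",
        prompt=f"Extract the key terms from the following text:\n\n{source_text}\n\nTerms:",
        max_tokens=200,
        temperature=0,
    )
    st.write(response["choices"][0]["text"].strip())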
import ctranslate2

# Replace with your tokenize function and source tokenization model
def tokenize(input_sentences):
    tokens = [input_sentence.split(" ") for input_sentence in input_sentences]
    return tokens

# Replace with your detokenize function and target tokenization model
def detokenize(outputs):
    # Truncated in the original; joining on spaces mirrors tokenize() above.
    translations = [" ".join(output) for output in outputs]
    return translations
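# Hedged usage sketch: the model directory "ct2_model" is a placeholder for
# your converted CTranslate2 model.
translator = ctranslate2.Translator("ct2_model", device="cpu")
results = translator.translate_batch(tokenize(["Hello world !"]))
print(detokenize([result.hypotheses[0] for result in results]))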
# This example uses M2M-100 models converted to the CTranslate2 format.
# Download CTranslate2 models:
# • M2M-100 418M-parameter model: https://bit.ly/33fM1AO
# • M2M-100 1.2B-parameter model: https://bit.ly/3GYiaed

import ctranslate2
import sentencepiece as spm
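# Hedged sketch based on standard CTranslate2 usage for M2M-100; the model
# directory and SentencePiece file names are assumptions. M2M-100 expects the
# source language token prepended and the target language token as a prefix.
sp = spm.SentencePieceProcessor("sentencepiece.model")
translator = ctranslate2.Translator("m2m100_418m", device="cpu")

source = ["__es__"] + sp.encode("¡Hola, mundo!", out_type=str)
results = translator.translate_batch([source], target_prefix=[["__en__"]])
# Drop the leading target language token before detokenizing.
print(sp.decode(results[0].hypotheses[0][1:]))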
### Loader functions:
_VERSION = '2.0.0'

def setversion(version):
    if version != _VERSION:
        raise ValueError('Dynamic versioning not available')

def setalphaversions(*alpha_versions):
    # Truncated in the original; raising here mirrors setversion() above.
    if alpha_versions != ():
        raise ValueError('Alpha versions not available')
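# Usage: only the pinned version is accepted; anything else raises.
setversion('2.0.0')       # OK
# setversion('1.9.0')     # ValueError: Dynamic versioning not available
setalphaversions()        # OK: no alpha versions are supported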
#!pip install transformers sentencepiece torch -U -q

# Replace "test_source.txt" with your source file.
# Change src_lang, tgt_lang, and lang_code_to_id to the source and target languages you need.
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import torch
from tqdm import tqdm

# Function to split source lines into chunks to avoid out-of-memory errors
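# Hedged continuation (not the original code): the chunk size, language
# codes, and batch handling are assumptions.
def chunks(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

model_name = "facebook/mbart-large-50-many-to-many-mmt"
tokenizer = MBart50TokenizerFast.from_pretrained(model_name, src_lang="fr_XX")
model = MBartForConditionalGeneration.from_pretrained(model_name)

with open("test_source.txt", encoding="utf-8") as f:
    source_lines = [line.strip() for line in f]

translations = []
for chunk in tqdm(list(chunks(source_lines, 8))):
    inputs = tokenizer(chunk, return_tensors="pt", padding=True)
    with torch.no_grad():
        generated = model.generate(
            **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]
        )
    translations += tokenizer.batch_decode(generated, skip_special_tokens=True)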
# First convert your OpenNMT-py or OpenNMT-tf model to a CTranslate2 model.
# pip3 install ctranslate2
# • OpenNMT-py:
# ct2-opennmt-py-converter --model_path model.pt --output_dir enja_ctranslate2 --quantization int8
# • OpenNMT-tf:
# ct2-opennmt-tf-converter --model_path model --output_dir enja_ctranslate2 --src_vocab source.vocab --tgt_vocab target.vocab --model_type TransformerBase --quantization int8

import ctranslate2
import sentencepiece as spm
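# Hedged translation sketch; the SentencePiece model file names are
# assumptions matching the vocab files mentioned above.
sp_source = spm.SentencePieceProcessor("source.model")
sp_target = spm.SentencePieceProcessor("target.model")
translator = ctranslate2.Translator("enja_ctranslate2", device="cpu")

tokens = sp_source.encode("Hello world!", out_type=str)
results = translator.translate_batch([tokens])
print(sp_target.decode(results[0].hypotheses[0]))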
# https://webz.io/free-datasets/
# Spanish: https://s3.amazonaws.com/webhose-archive/datasets/645_20170904091816.zip

# Extract text from the JSON files
import os
import json
from sentence_splitter import split_text_into_sentences
from tqdm import tqdm
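# Hedged continuation: the extracted directory name and the "text" JSON field
# are assumptions about the dataset layout.
directory = "645_20170904091816"
sentences = []
for file_name in tqdm(os.listdir(directory)):
    if not file_name.endswith(".json"):
        continue
    with open(os.path.join(directory, file_name), encoding="utf-8") as json_file:
        article = json.load(json_file)
    sentences += split_text_into_sentences(text=article.get("text", ""), language="es")

with open("dataset.es", "w", encoding="utf-8") as output_file:
    output_file.write("\n".join(sentences))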
# Remove duplicates and lines with bad characters, then shuffle.
# Find the number of CPUs/cores to pass to --parallel: nproc --all
# sort -S 50% --parallel=4 dataset.es | uniq -u > dataset.unique.es
# shuf dataset.unique.es > dataset.unique.shuf.es
# !perl -ne '/�/ or print' dataset.unique.shuf.es > dataset.unique.shuf.cleaner.es

import re
import fasttext
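# Hedged continuation: language filtering with fastText; the file names and
# the 0.9 confidence threshold are assumptions.
model = fasttext.load_model("lid.176.ftz")

with open("dataset.unique.shuf.cleaner.es", encoding="utf-8") as infile, \
        open("dataset.filtered.es", "w", encoding="utf-8") as outfile:
    for line in infile:
        # Normalize whitespace before classifying the line.
        line = re.sub(r"\s+", " ", line.strip())
        if not line:
            continue
        labels, scores = model.predict(line)
        if labels[0] == "__label__es" and scores[0] > 0.9:
            outfile.write(line + "\n")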
# -*- coding: utf-8 -*-
# pip3 install gdown langdetect fasttext pycld2 py3langid
import gdown
from datetime import datetime

# Download the fastText language-identification model
url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.ftz"
output = "lid.176.ftz"
gdown.download(url, output, quiet=False)
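# Hedged sketch of what the imports suggest: timing different language
# detectors on the same sentence; the sample text is an assumption.
import fasttext
import langdetect
import py3langid

sample = "La semaine dernière, quelqu'un m'a fait part de sa gratitude."

ft_model = fasttext.load_model("lid.176.ftz")

start = datetime.now()
print("fasttext:", ft_model.predict(sample)[0][0], datetime.now() - start)

start = datetime.now()
print("langdetect:", langdetect.detect(sample), datetime.now() - start)

start = datetime.now()
print("py3langid:", py3langid.classify(sample)[0], datetime.now() - start)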