Skip to content

Instantly share code, notes, and snippets.

View eustlb's full-sized avatar

eustlb

  • Hugging Face
  • Paris, France
View GitHub Profile
@eustlb
eustlb / benchmark_transformers_whisper.py
Last active November 22, 2024 10:36
Benchmark WER and RTFx for Transformers Whisper.
TRANSFORMERS_SRC_PATH = "/admin/home/eustache_lebihan/dev/benchmark-whisper/transformers-fix/src"
import sys
sys.path.insert(0, TRANSFORMERS_SRC_PATH)
import wandb
from tqdm import tqdm
import evaluate
import os
import torch
@eustlb
eustlb / benchmark_openai_whisper.py
Last active November 21, 2024 22:40
Benchmark WER and RTFx for OpenAI Whisper.
OPENAI_SRC_PATH = "/admin/home/eustache_lebihan/dev/benchmark-whisper/whisper"
import sys
sys.path.insert(0, OPENAI_SRC_PATH)
import wandb
from tqdm import tqdm
import evaluate
import os
import torch
OPENAI_SRC_PATH = "/admin/home/eustache_lebihan/dev/benchmark-whisper/whisper-myfork"
import sys
sys.path.insert(0, OPENAI_SRC_PATH)
import wandb
from tqdm import tqdm
import evaluate
import os
@eustlb
eustlb / benchmark_moonshine_tiny.py
Last active January 21, 2025 13:40
Benchmark Moonshine / Whisper for varying batch sizes (FLEURS test set)
import torch
import evaluate
from transformers.models.whisper.english_normalizer import EnglishTextNormalizer
from transformers import MoonshineForConditionalGeneration, AutoProcessor, WhisperProcessor
from datasets import load_dataset, Audio
from tqdm import tqdm
import json
wer_metric = evaluate.load("wer")
device = "cuda:0" if torch.cuda.is_available() else "cpu"
@eustlb
eustlb / benchmark_moonshine_tiny_artif.py
Last active January 21, 2025 16:13
Benchmark Moonshine / Whisper for varying audio lengths and an artificial number of generated tokens.
import torch
from transformers import MoonshineForConditionalGeneration, AutoProcessor
from tqdm import tqdm
import json
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float32
attn_implementation = "sdpa"
model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny", attn_implementation=attn_implementation).to(device, torch_dtype)
@eustlb
eustlb / .gitignore
Last active February 25, 2025 15:07
Benchmark sequential vs. batched inference - StyleTTS 2
traces-*/
@eustlb
eustlb / hub_to_s3.py
Created February 26, 2025 15:55
HF Hub to S3 bucket
from typing import IO
from datatrove.io import get_datafolder
from datatrove.executor import SlurmPipelineExecutor
from datatrove.pipeline.readers import ParquetReader
from datatrove.pipeline.writers import ParquetWriter
from datatrove.utils.typeshelper import StatHints
class ParquetReaderInMemory(ParquetReader):
from datasets import load_dataset, Audio
from transformers import (
CsmForConditionalGeneration,
TrainingArguments,
CsmProcessor,
Trainer
)
processor = CsmProcessor.from_pretrained("eustlb/csm-1b")
model = CsmForConditionalGeneration.from_pretrained("eustlb/csm-1b")
@eustlb
eustlb / infer_whisper.py
Last active March 31, 2025 12:44
infer whisper
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from datasets import load_dataset
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "openai/whisper-large-v3-turbo"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
@eustlb
eustlb / csm_test_1b_model_integration_generate.py
Last active April 23, 2025 08:42
Reproducer for CSM Transformers integration
# TEST GREEDY FLOAT 32
# make sure to clone git@github.com:eustlb/csm.git and check out the compare-trfms branch
import sys
sys.path.insert(0, "./csm")
from generator import load_csm_1b, Segment
from datasets import load_dataset, Audio
from huggingface_hub import hf_hub_download
import torch