This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #run: uv pip install nemo_toolkit[asr] | |
| from nemo.collections.asr.modules import AudioToMelSpectrogramPreprocessor | |
| from transformers import ParakeetFeatureExtractor, ParakeetProcessor | |
| from datasets import load_dataset, Audio | |
| import torch | |
| import numpy as np | |
| torch.use_deterministic_algorithms(True) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from datasets import load_dataset, Audio | |
| from transformers import VoxtralForConditionalGeneration, VoxtralProcessor | |
| import os | |
| import torch | |
| from whisper.normalizers import EnglishTextNormalizer | |
| import jiwer | |
| os.environ["CUDA_VISIBLE_DEVICES"] = "0" | |
| torch_device = "cuda" if torch.cuda.is_available() else "cpu" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| STATE_DICT_MAPPING = { | |
| # Subsampling layer | |
| r"encoder\.pre_encode\.": r"encoder.subsampling.", | |
| # Subsampling specific mappings | |
| r"encoder\.subsampling\.conv\.": r"encoder.subsampling.layers.", | |
| r"encoder\.subsampling\.out\.": r"encoder.subsampling.linear.", | |
| # # Positional encoding (skip pe buffer) | |
| # r"encoder\.pos_enc\.pe$": None, # Skip buffer | |
| r"encoder\.pos_enc\.": r"encoder.encode_positions.", | |
| # Conformer layers - attention (NeMo already uses self_attn) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # To install NeMo, run: | |
| # uv pip install git+https://github.com/NVIDIA/NeMo.git@b97e42b3dd1c9bcdf37c81c63220744af474c9c0 | |
| from nemo.collections.asr.models import ASRModel | |
| import torch | |
| import os | |
| from datasets import load_dataset | |
| import soundfile as sf | |
| TMP_DIR = "./tmp" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # To install NeMo, run: | |
| # uv pip install git+https://github.com/NVIDIA/NeMo.git@b97e42b3dd1c9bcdf37c81c63220744af474c9c0 | |
| from nemo.collections.asr.models import ASRModel | |
| import torch | |
| import os | |
| from datasets import load_dataset | |
| import soundfile as sf | |
| TMP_DIR = "./tmp" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import torch.nn as nn | |
| from torch.nn.utils.parametrizations import weight_norm | |
| from torch.nn.utils.parametrize import remove_parametrizations | |
| # Define dtypes to test | |
| dtypes_to_test = [torch.float64, torch.float32, torch.float16] | |
| for dtype in dtypes_to_test: | |
| print(f"\nTesting with dtype: {dtype}") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import torch | |
| import torch.nn as nn | |
| from torch.nn.utils.parametrizations import weight_norm | |
| from torch.nn.utils.parametrize import remove_parametrizations | |
| # Check if CUDA is available | |
| device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') | |
| print(f"Using device: {device}") | |
| # 1. Create conv layer and move to device |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from mistral_common.protocol.instruct.messages import TextChunk, AudioChunk, UserMessage, AssistantMessage, RawAudio | |
| from mistral_common.audio import Audio | |
| from huggingface_hub import hf_hub_download | |
| from openai import OpenAI | |
| # Modify OpenAI's API key and API base to use vLLM's API server. | |
| openai_api_key = "EMPTY" | |
| openai_api_base = "http://0.0.0.0:8000/v1" | |
| client = OpenAI( | |
| api_key=openai_api_key, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from datasets import load_dataset, Audio | |
| from transformers import VoxtralForConditionalGeneration, VoxtralProcessor | |
| import os | |
| import torch | |
| from whisper.normalizers import EnglishTextNormalizer | |
| import jiwer | |
| os.environ["CUDA_VISIBLE_DEVICES"] = "0" | |
| torch_device = "cuda" if torch.cuda.is_available() else "cpu" # "cpu" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from datasets import load_dataset, Audio | |
| from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration | |
| import os | |
| import torch | |
| from whisper.normalizers import EnglishTextNormalizer | |
| import jiwer | |
| os.environ["CUDA_VISIBLE_DEVICES"] = "3" | |
| torch_device = "cuda" if torch.cuda.is_available() else "cpu" # "cpu" |