@bigsnarfdude
Created May 19, 2025 02:57
p2.py
from nemo.collections.asr import models as nemo_asr
import numpy as np
import librosa
import soundfile as sf
import os

def transcribe_with_chunking(audio_path, asr_model, chunk_duration=30, overlap_duration=2):
    """
    Transcribe audio file by breaking it into overlapping chunks

    Parameters:
    - audio_path: Path to the audio file
    - asr_model: Loaded NVIDIA Parakeet TDT model
    - chunk_duration: Duration of each chunk in seconds
    - overlap_duration: Overlap between chunks in seconds

    Returns:
    - Full transcription text
    """
    # Load audio file
    audio, sr = librosa.load(audio_path, sr=16000)  # Parakeet expects 16kHz

    # Calculate chunk and overlap sizes in samples
    chunk_size = int(chunk_duration * sr)
    overlap_size = int(overlap_duration * sr)

    # Calculate the number of chunks
    total_samples = len(audio)
    num_chunks = max(1, int(np.ceil((total_samples - overlap_size) / (chunk_size - overlap_size))))

    full_transcription = []

    # Process each chunk
    for i in range(num_chunks):
        # Calculate start and end positions for this chunk
        start = i * (chunk_size - overlap_size)
        end = min(start + chunk_size, total_samples)

        # Extract the chunk
        chunk = audio[start:end]

        # Save chunk temporarily
        temp_path = f"temp_chunk_{i}.wav"
        sf.write(temp_path, chunk, sr)

        # Transcribe the chunk
        print(f"Transcribing chunk {i+1}/{num_chunks}...")
        result = asr_model.transcribe([temp_path])
        full_transcription.append(result[0].text)

        # Clean up temp file
        os.remove(temp_path)

    # Join all transcriptions
    return " ".join(full_transcription)

# Load the NVIDIA Parakeet TDT model
print("Loading NVIDIA Parakeet TDT model...")
asr_model = nemo_asr.ASRModel.from_pretrained(model_name="nvidia/parakeet-tdt-0.6b-v2")

# Optional: enable local attention for better memory efficiency with long audio.
# This is recommended for longer files to reduce memory usage.
asr_model.change_attention_model("rel_pos_local_attn", [128, 128])  # Local attention
asr_model.change_subsampling_conv_chunking_factor(1)  # Auto select

# Transcribe with chunking
print("Starting transcription...")
transcription = transcribe_with_chunking("video.mp4", asr_model)
print(transcription)
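
Note that the script hands video.mp4 directly to librosa.load, which only decodes video containers when an ffmpeg-backed audioread backend is available. A minimal alternative sketch is to extract the audio track to a 16 kHz mono WAV first and transcribe that instead (this assumes ffmpeg is installed and on the PATH; the "audio.wav" filename is a placeholder):

import subprocess

# Pull the audio track out of the video as a 16 kHz mono WAV
# (assumes ffmpeg is available; "audio.wav" is a placeholder name).
subprocess.run(
    ["ffmpeg", "-y", "-i", "video.mp4", "-vn", "-ac", "1", "-ar", "16000", "audio.wav"],
    check=True,
)

transcription = transcribe_with_chunking("audio.wav", asr_model)
print(transcription)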