Created
May 19, 2025 02:57
-
-
Save bigsnarfdude/90e22245addc6ed50d7fa86128b051a4 to your computer and use it in GitHub Desktop.
p2.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import tempfile

import librosa
import numpy as np
import soundfile as sf
from nemo.collections.asr import models as nemo_asr
def transcribe_with_chunking(audio_path, asr_model, chunk_duration=30, overlap_duration=2):
    """Transcribe an audio file by breaking it into overlapping chunks.

    The audio is loaded and resampled to 16 kHz, then cut into windows of
    ``chunk_duration`` seconds. Each window starts
    ``chunk_duration - overlap_duration`` seconds after the previous one.
    Every window is written to a temporary WAV file, handed to
    ``asr_model.transcribe`` and deleted again.

    NOTE: the per-chunk texts are joined with single spaces as-is, so words
    that fall inside an overlap region may appear twice in the result.

    Args:
        audio_path: Path to the audio file.
        asr_model: Loaded NVIDIA Parakeet TDT model; must provide
            ``transcribe(list_of_paths)`` whose first result has a ``.text``
            attribute.
        chunk_duration: Duration of each chunk in seconds (must be > 0).
        overlap_duration: Overlap between consecutive chunks in seconds
            (must be >= 0 and smaller than ``chunk_duration``).

    Returns:
        The full transcription text, or ``""`` when the file has no samples.

    Raises:
        ValueError: If the chunk/overlap durations cannot produce a forward
            moving window.
    """
    sample_rate = 16000  # Parakeet expects 16kHz

    # Validate before the (potentially slow) audio load so bad arguments fail
    # fast instead of surfacing later as a ZeroDivisionError or, worse, as a
    # silently truncated transcription.
    if chunk_duration <= 0:
        raise ValueError(f"chunk_duration must be positive, got {chunk_duration!r}")
    if overlap_duration < 0:
        raise ValueError(f"overlap_duration must be non-negative, got {overlap_duration!r}")

    # Window geometry in samples.
    chunk_size = int(chunk_duration * sample_rate)
    overlap_size = int(overlap_duration * sample_rate)
    step = chunk_size - overlap_size
    if step <= 0:
        raise ValueError(
            "overlap_duration must be smaller than chunk_duration "
            f"(got chunk_duration={chunk_duration!r}, overlap_duration={overlap_duration!r})"
        )

    # Load audio file
    audio, sr = librosa.load(audio_path, sr=sample_rate)
    total_samples = len(audio)
    if total_samples == 0:
        # Nothing to transcribe; avoid feeding an empty WAV to the model.
        return ""

    # Same chunk starts as ceil((total - overlap) / step), with at least one
    # chunk, but computed in exact integer arithmetic.
    chunk_starts = range(0, max(total_samples - overlap_size, 1), step)
    num_chunks = len(chunk_starts)
    full_transcription = []

    # A private temporary directory avoids name collisions with other runs and
    # guarantees cleanup even if writing or transcription fails.
    with tempfile.TemporaryDirectory() as temp_dir:
        for i, start in enumerate(chunk_starts):
            end = min(start + chunk_size, total_samples)
            temp_path = os.path.join(temp_dir, f"temp_chunk_{i}.wav")
            sf.write(temp_path, audio[start:end], sr)
            try:
                print(f"Transcribing chunk {i+1}/{num_chunks}...")
                result = asr_model.transcribe([temp_path])
                full_transcription.append(result[0].text)
            finally:
                # Remove each chunk right away so disk usage stays bounded to
                # a single chunk for long recordings.
                os.remove(temp_path)

    # Join all transcriptions
    return " ".join(full_transcription)
# --- Model setup -----------------------------------------------------------
# Fetch the pretrained NVIDIA Parakeet TDT checkpoint.
print("Loading NVIDIA Parakeet TDT model...")
asr_model = nemo_asr.ASRModel.from_pretrained(
    model_name="nvidia/parakeet-tdt-0.6b-v2",
)

# Optional: switch to local attention, recommended for longer files to
# reduce memory usage.
asr_model.change_attention_model(
    "rel_pos_local_attn",
    [128, 128],
)
asr_model.change_subsampling_conv_chunking_factor(1)  # 1 = auto select

# --- Transcription ---------------------------------------------------------
# Run the chunked transcription on the input file and print the text.
print("Starting transcription...")
transcription = transcribe_with_chunking("video.mp4", asr_model)
print(transcription)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment