@twobob
Created January 26, 2025 23:49
ideas for audio pipelines
# Phase 1: Data Preprocessing & Label Refinement

# 1. Scraping Audio from YouTube
from yt_dlp import YoutubeDL

# Download audio from YouTube and extract it as mp3 (the post-processor requires ffmpeg)
def download_audio(url, output_dir="audio"):
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': f'{output_dir}/%(title)s.%(ext)s',
    }
    with YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

# Example usage
download_audio("https://www.youtube.com/watch?v=example_video_id")
# 2. Transcription with Whisper ASR
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torchaudio

# Load Whisper model
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")

# Transcribe audio (Whisper expects 16 kHz mono input)
def transcribe_audio(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    waveform = waveform.mean(dim=0)  # mix down to mono
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
    predicted_ids = model.generate(inputs.input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]

# Example usage
transcription = transcribe_audio("audio/example.mp3")
print(transcription)
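
# The snippet above only covers Whisper's single 30-second window; for full-length
# YouTube audio, one option is the transformers ASR pipeline, which chunks long
# files automatically. A minimal sketch, reusing the downloaded mp3 from above:
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium",
    chunk_length_s=30,
)
print(asr("audio/example.mp3")["text"])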
# 3. fastText for Text Classification
import fasttext

# Train a fastText classifier
# (data_path must be a text file with one "__label__<label> <text>" line per sample)
def train_fasttext_classifier(data_path, model_path="classifier.bin"):
    model = fasttext.train_supervised(input=data_path, epoch=25, lr=1.0, wordNgrams=2)
    model.save_model(model_path)
    return model

# Example usage
train_fasttext_classifier("transcriptions.txt")

# Classify a new transcript
def classify_transcript(model_path, text):
    model = fasttext.load_model(model_path)
    return model.predict(text)

# Example usage
label, confidence = classify_transcript("classifier.bin", "This is a technical lecture about AI.")
print(f"Label: {label}, Confidence: {confidence}")
# Phase 2: Model Pre-training

# 1. Preprocessing Audio with Librosa
import librosa
import numpy as np

# Extract MFCC features
def extract_mfcc(file_path, n_mfcc=13):
    y, sr = librosa.load(file_path, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    return mfcc

# Example usage
mfcc_features = extract_mfcc("audio/example.mp3")
print(mfcc_features.shape)
# 2. Pre-training Wav2Vec 2.0
from transformers import Wav2Vec2ForPreTraining, Wav2Vec2FeatureExtractor
import torch

# Load Wav2Vec 2.0 (the pre-training checkpoint ships a feature extractor,
# not a full processor, since there is no tokenizer at this stage)
model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large")

# Run a forward pass on one file (Wav2Vec 2.0 expects 16 kHz mono audio).
# A real pre-training run additionally needs masked time indices, sampled
# negatives and an optimizer loop; this only shows the data flow.
def preprocess_and_forward(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    waveform = waveform.mean(dim=0)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    inputs = feature_extractor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
    outputs = model(**inputs)
    return outputs

# Example usage
outputs = preprocess_and_forward("audio/example.mp3")
# Phase 3: Fine-tuning & Evaluation

# 1. Fine-tuning Wav2Vec 2.0 for ASR
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch

# Load model and processor
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

# Compute the CTC loss for one (audio, transcription) pair; a full fine-tuning
# loop back-propagates this loss and steps an optimizer (see below)
def fine_tune_asr(file_path, transcription):
    waveform, sample_rate = torchaudio.load(file_path)
    waveform = waveform.mean(dim=0)
    if sample_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
    # Normalise the target text to this checkpoint's upper-case vocabulary
    labels = processor.tokenizer(transcription.upper(), return_tensors="pt").input_ids
    outputs = model(**inputs, labels=labels)
    return outputs.loss

# Example usage
loss = fine_tune_asr("audio/example.mp3", "This is a transcription.")
print(f"Loss: {loss}")
# 2. Evaluation on Validation Set
from sklearn.metrics import accuracy_score

# Evaluate the CTC model by greedy decoding and exact-match accuracy
# (exact match is a blunt metric for ASR; see the WER sketch below)
def evaluate_asr(model, processor, file_paths, transcriptions):
    predictions = []
    for file_path in file_paths:
        waveform, sample_rate = torchaudio.load(file_path)
        waveform = waveform.mean(dim=0)
        if sample_rate != 16000:
            waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
        with torch.no_grad():
            logits = model(**inputs).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        predictions.append(processor.batch_decode(predicted_ids)[0])
    accuracy = accuracy_score(transcriptions, predictions)
    return accuracy

# Example usage
file_paths = ["audio/example1.mp3", "audio/example2.mp3"]
transcriptions = ["This is the first transcription.", "This is the second transcription."]
accuracy = evaluate_asr(model, processor, file_paths, transcriptions)
print(f"Accuracy: {accuracy}")
# Hardware Stack Considerations

# 1. Distributed Training with PyTorch
import os
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP

# Initialize distributed training; expects the usual rendezvous env vars
# (RANK, WORLD_SIZE, LOCAL_RANK) to be set by the launcher
def init_distributed():
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h").cuda(local_rank)
    model = DDP(model, device_ids=[local_rank])
    return model

# Example usage
model = init_distributed()
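
# The nccl backend assumes one process per GPU; torchrun provides those
# processes and sets the LOCAL_RANK variable used above, e.g. on a 4-GPU node:
#   torchrun --nproc_per_node=4 train_asr.py
# (train_asr.py is a placeholder name for whatever script wraps this code)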
# Points of Potential Disagreement or Further Refinement

# 1. Fusion of Textual and Acoustic Features
from sklearn.ensemble import RandomForestClassifier

# Combine time-averaged MFCC features and a fastText sentence embedding
def combine_features(mfcc_features, fasttext_embeddings):
    combined = np.hstack([mfcc_features.mean(axis=1), fasttext_embeddings])
    return combined

# Train a classifier on combined features
def train_combined_classifier(features, labels):
    clf = RandomForestClassifier()
    clf.fit(features, labels)
    return clf

# Example usage (the embedding comes from the fastText model trained in
# Phase 1, not from the Wav2Vec 2.0 model loaded above)
ft_model = fasttext.load_model("classifier.bin")
mfcc_features = extract_mfcc("audio/example.mp3")
fasttext_embeddings = ft_model.get_sentence_vector("This is a transcription.")
combined_features = combine_features(mfcc_features, fasttext_embeddings)
clf = train_combined_classifier([combined_features], ["relevant"])