Skip to content

Instantly share code, notes, and snippets.

@titipata
Last active January 9, 2025 15:34
Show Gist options
  • Save titipata/c0dc0763c5e4cf6119f22737cdf4dbce to your computer and use it in GitHub Desktop.
Save titipata/c0dc0763c5e4cf6119f22737cdf4dbce to your computer and use it in GitHub Desktop.

Chunks audio files

Original code from Surachet.

import librosa
import soundfile as sf
from pathlib import Path
from pydub import AudioSegment

def convert_mp4_to_wav(input_path, output_path=None):
    """Convert the audio track of an MP4 (or other media file) to a WAV file.

    Parameters:
        input_path (str | os.PathLike): Path to the input media file.
        output_path (str | os.PathLike | None): Destination WAV path. When
            omitted, the input path with its final extension replaced by
            ``.wav`` is used.

    Returns:
        str: Path of the WAV file that was written.
    """
    if output_path is None:
        # Only the file name's suffix is replaced. Splitting the whole string
        # on "." would mangle paths whose directories contain dots, e.g.
        # "../../clip" with no extension.
        output_path = Path(input_path).with_suffix('.wav')
    output_path = str(output_path)

    # Decode with librosa's defaults and write the samples back out as WAV.
    # NOTE(review): per the librosa docs the defaults resample to 22050 Hz
    # mono -- confirm that is acceptable for downstream transcription.
    y, sr = librosa.load(input_path)
    sf.write(output_path, y, sr)
    print(f"Converted {input_path} to {output_path}")
    return output_path

def split_audio(audio_path, output_folder, duration=300, overlap=30):
    """
    Split an audio file into overlapping WAV segments.

    Consecutive segments start ``duration - overlap`` seconds apart, so each
    segment repeats the last ``overlap`` seconds of the previous one. The last
    segment is truncated at the end of the audio. Segments are written as
    ``segment_01.wav``, ``segment_02.wav``, ... inside ``output_folder``.

    Parameters:
        audio_path (str): Path to input audio file
        output_folder (str): Directory to save segments; created (including
            missing parent directories) if it does not exist
        duration (int | float): Duration of each segment in seconds (default: 300)
        overlap (int | float): Overlap interval between segments in seconds (default: 30)

    Returns:
        list[pathlib.Path]: Paths of the exported segments, in order.

    Raises:
        ValueError: If ``duration`` is not positive, or ``overlap`` is not
            smaller than ``duration`` (the window would never advance).
    """
    # Work in whole milliseconds; range() below requires integers.
    duration_ms = int(round(duration * 1000))
    overlap_ms = int(round(overlap * 1000))
    step_ms = duration_ms - overlap_ms

    if duration_ms <= 0:
        raise ValueError(f"duration must be positive, got {duration}")
    if step_ms <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than duration ({duration})"
        )

    output_dir = Path(output_folder)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Loading: {audio_path}")
    audio = AudioSegment.from_file(audio_path)
    total_ms = len(audio)  # compared against millisecond values below

    # A new start is only useful while audio remains beyond the previous
    # segment's end (start + overlap_ms). The first segment is always kept so
    # that clips shorter than the overlap are not silently dropped, and starts
    # never reach the end of the audio (empty audio yields no segments).
    stop_ms = min(total_ms, max(total_ms - overlap_ms, 1))

    saved_paths = []
    for i, start in enumerate(range(0, stop_ms, step_ms)):
        end = min(start + duration_ms, total_ms)
        path = output_dir / f"segment_{i+1:02d}.wav"
        # Export each slice immediately instead of holding every segment in
        # memory until the end.
        audio[start:end].export(path, format="wav")
        print(f"Segment {i+1} saved: {path}")
        saved_paths.append(path)

    print("Audio splitting completed.")
    return saved_paths

Usage

# Example: extract the audio track from a video, then chunk it.
input_file = "../../example_file.mp4"
convert_mp4_to_wav(input_file)  # writes ../../example_file.wav next to the MP4
# Point at the WAV where it was actually written (same folder as the input).
split_audio("../../example_file.wav", "segments")  # segment into chunks

Gemini Transcription

import json
import os
from glob import glob
from pathlib import Path

import google.generativeai as genai
import pandas as pd
from tqdm.auto import tqdm

# Prefer a key supplied through the GOOGLE_API_KEY environment variable so the
# secret does not have to live in source control; the "API_KEY" literal is
# kept as the placeholder fallback for anyone editing this file directly.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY", "API_KEY"))
# Module-level model handle used by transcribe_audio_folder below.
model = genai.GenerativeModel(model_name="gemini-1.5-flash")
prompt = "Please transcribe the given recorded interview speech."


def transcribe_audio_folder(folder_path, prompt="Please transcribe the given recorded interview speech."):
    """
    Transcribe all WAV files in a folder using Google's Generative AI.

    Files are processed in sorted filename order so that numbered segments
    (``segment_01.wav``, ``segment_02.wav``, ...) appear in sequence in the
    result. A failure on one file is recorded in its row and does not stop
    the remaining files from being processed.

    Args:
        folder_path (str): Path to folder containing WAV files
        prompt (str): Prompt sent along with each audio file
            (default: "Please transcribe the given recorded interview speech.")

    Returns:
        pd.DataFrame: One row per file with columns ``audio_file`` (file name)
            and ``text`` (the transcription, or ``"ERROR: ..."`` on failure)

    Raises:
        ValueError: If ``folder_path`` is not an existing directory or it
            contains no ``*.wav`` files.
    """
    # Ensure folder path is valid (a regular file is not a usable folder)
    folder = Path(folder_path)
    if not folder.is_dir():
        raise ValueError(f"Folder not found: {folder_path}")

    # glob() order is filesystem-dependent; sort so rows follow segment order
    audio_files = sorted(folder.glob("*.wav"))
    if not audio_files:
        raise ValueError(f"No WAV files found in {folder_path}")

    transcriptions = []

    for audio_file in tqdm(audio_files, desc="Transcribing audio files"):
        audio_upload = None
        try:
            # Upload the audio, then ask the model to transcribe it
            audio_upload = genai.upload_file(str(audio_file))
            response = model.generate_content([prompt, audio_upload])
            # Drop Markdown code fences the model may wrap around its answer
            transcription = response.text.replace("```json", "").replace("```", "").strip()
            transcriptions.append({
                "audio_file": audio_file.name,
                "text": transcription
            })
        except Exception as e:
            # Deliberate best-effort: keep going and record the failure
            print(f"Error processing {audio_file.name}: {e}")
            transcriptions.append({
                "audio_file": audio_file.name,
                "text": f"ERROR: {e}"
            })
        finally:
            # Remove the uploaded copy so files do not pile up on the service;
            # a cleanup failure must not discard the transcription result.
            if audio_upload is not None:
                try:
                    genai.delete_file(audio_upload.name)
                except Exception as cleanup_error:
                    print(f"Could not delete upload for {audio_file.name}: {cleanup_error}")

    # Convert to DataFrame
    return pd.DataFrame(transcriptions)

# Example usage: transcribe every WAV segment in a folder and save the
# resulting table (one row per file) as a CSV without the index column.
segments_folder = "path/to/segments"
output_csv = "transcriptions.csv"
df = transcribe_audio_folder(segments_folder)
df.to_csv(output_csv, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment