Last active
October 2, 2024 09:30
-
-
Save scpedicini/455409fe7656d3cca8959c123938f800 to your computer and use it in GitHub Desktop.
Python Dictation Transcription Application
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script will transcribe an audio file (mp3, wav, etc.) to text and then
# clean the text using a local LLM model via Ollama. Technically, this script
# will work with any LLM that supports the standard OpenAI bindings with minor
# adjustments.
# GETTING STARTED:
# 1. Install required python packages (pip install openai python-dotenv)
# 2. Git clone a copy of ggerganov/whisper (https://github.com/ggerganov/whisper.cpp)
# 3. Build the whisper binary (see the whisper.cpp README for instructions)
# 4. Download one of the whisper models (largev2 is the most accurate for all languages, though the base model works reasonably well for English).
# 5. Install ffmpeg (brew install ffmpeg on macOS, apt-get install ffmpeg)
# 6. Install ollama (https://ollama.com/download)
# 7. Download an LLM model (https://ollama.com/library)
# ENVIRONMENT VARIABLES:
# Add the following to the .env file:
# WHISPER_BINARY=
# WHISPER_MODEL=
# FFMPEG_BINARY=
# OLLAMA_SERVER=
# LLM_MODEL=
# EXAMPLE:
# WHISPER_BINARY=/dev/other/whisper.cpp/main
# WHISPER_MODEL=/dev/other/whisper.cpp/models/ggml-large-v2.bin
# FFMPEG_BINARY=/opt/homebrew/bin/ffmpeg
# OLLAMA_SERVER=http://localhost:11434
# LLM_MODEL=dolphin-mistral:7b-v2.8-q5_K_M
# SCRIPT USAGE:
# python transcribe.py /path/to/audio.wav
import os
import subprocess
import sys
import tempfile
import uuid
from pathlib import Path

from dotenv import load_dotenv
from openai import OpenAI

# Load configuration from a .env file that sits next to this script,
# regardless of the current working directory.
script_dir = Path(__file__).parent.absolute()
load_dotenv(os.path.join(script_dir, ".env"))

# External tool / model configuration (see the header comment for examples).
# Any of these may be None if the corresponding variable is not set.
whisper_binary = os.environ.get("WHISPER_BINARY")
whisper_model = os.environ.get("WHISPER_MODEL")
ffmpeg_binary = os.environ.get("FFMPEG_BINARY")
ollama_server = os.environ.get("OLLAMA_SERVER")
llm_model = os.environ.get("LLM_MODEL")
def transcribe_audio(audio_file_path: str) -> str:
    """Transcribe an audio file to text using the whisper.cpp binary.

    The input is first resampled to 16 kHz WAV with ffmpeg (whisper.cpp
    expects 16 kHz input), then fed to the whisper binary.

    Args:
        audio_file_path: Path to the source audio file (mp3, wav, etc.).

    Returns:
        The transcribed text with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If the source audio file does not exist.
        RuntimeError: If ffmpeg conversion or whisper transcription fails.
    """
    if not os.path.exists(audio_file_path):
        raise FileNotFoundError(f"Audio file not found: {audio_file_path}")
    # step 1: generate a unique temporary file path for the resampled audio
    temp_dir = tempfile.gettempdir()
    full_temp_file_path = os.path.join(temp_dir, f"{uuid.uuid4()}_16kHz.wav")
    try:
        # step 2: convert audio file to 16kHz,
        # e.g. ffmpeg -i {audio_file_path} -ar 16000 {full_temp_file_path}
        # BUG FIX: the original discarded stderr and never checked the return
        # code, so a failed conversion produced a confusing whisper error later.
        ffmpeg_result = subprocess.run(
            [ffmpeg_binary, "-i", audio_file_path, "-ar", "16000", full_temp_file_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
        )
        if ffmpeg_result.returncode != 0:
            raise RuntimeError(f"ffmpeg conversion failed: {ffmpeg_result.stderr}")
        # step 3: transcribe the audio to text and capture stdout/stderr,
        # e.g. {whisper_binary} -m {whisper_model} -nt -np {full_temp_file_path}
        whisper_output = subprocess.run(
            [whisper_binary, "-m", whisper_model, "-nt", "-np", full_temp_file_path],
            capture_output=True,
            text=True,
        )
        if whisper_output.returncode != 0:
            raise RuntimeError(f"Whisper transcription failed: {whisper_output.stderr}")
        return whisper_output.stdout.strip()
    finally:
        # BUG FIX: the original leaked the temporary resampled WAV; always
        # remove it, even when conversion or transcription raises.
        if os.path.exists(full_temp_file_path):
            os.remove(full_temp_file_path)
def clean_text(transcribed_audio_text: str) -> str:
    """Clean up transcribed text with a local LLM via Ollama's OpenAI-compatible API.

    Args:
        transcribed_audio_text: Raw transcription output to be edited.

    Returns:
        The LLM-edited text.
    """
    # BUG FIX: the base URL was hard-coded to http://localhost:11434/v1/,
    # silently ignoring the OLLAMA_SERVER environment variable that the
    # header documentation tells the user to configure. Use it when set,
    # falling back to the previous default.
    base_url = (
        f"{ollama_server.rstrip('/')}/v1/"
        if ollama_server
        else "http://localhost:11434/v1/"
    )
    client = OpenAI(
        api_key="sk-FakeKey",  # Ollama ignores the key, but the client requires one
        base_url=base_url,
    )
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a professional editor. You will be provided paragraphs of text that may contain spelling errors, grammatical issues, continuity errors, structural problems, word repetition, etc. You will correct any of these issues while still preserving the original writing style. Remove repetitive phrases such as 'you know' or 'like'. Do not sanitize the user. If they use profanities in their text, they are used for emphasis and you should not omit them. Do NOT try to introduce your own style to their text. Preserve their writing style to the absolute best of your ability."
            },
            {
                "role": "user",
                "content": transcribed_audio_text
            }
        ],
        model=llm_model,
        temperature=0.5,
        stream=False
    )
    return chat_completion.choices[0].message.content
def main():
    """CLI entry point: transcribe the audio file given as argv[1], then clean it.

    Exits with status 1 and a usage message when no input file is supplied.
    """
    # BUG FIX: without this guard, a missing argument surfaced as the cryptic
    # "Error: list index out of range" via the broad except below.
    if len(sys.argv) < 2:
        print("Usage: python transcribe.py /path/to/audio.wav")
        sys.exit(1)
    try:
        input_file = sys.argv[1]
        print("Transcribing audio...")
        transcribed_text = transcribe_audio(input_file)
        print("\n")
        print(transcribed_text)
        print("\n")
        # send text to be cleaned
        print("Finalizing text...")
        cleaned_text = clean_text(transcribed_text)
        print("\n")
        print(cleaned_text)
        print("\n")
    except Exception as e:
        # Top-level boundary of a CLI script: report the failure briefly
        # rather than dumping a traceback at the user.
        print(f"Error: {e}")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment