Skip to content

Instantly share code, notes, and snippets.

@scpedicini
Last active October 2, 2024 09:30
Show Gist options
  • Save scpedicini/455409fe7656d3cca8959c123938f800 to your computer and use it in GitHub Desktop.
Python Dictation Transcription Application
# This script will transcribe an audio file (mp3, wav, etc.) to text and then clean the text using a local LLM model via Ollama. Technically, this script will work with any LLM that supports the standard OpenAI bindings with minor adjustments.
# GETTING STARTED:
# 1. Install required python packages (pip install openai python-dotenv)
# 2. Git clone a copy of ggerganov/whisper (https://github.com/ggerganov/whisper.cpp)
# 3. Build the whisper binary (see the whisper.cpp README for instructions)
# 4. Download one of the whisper models (large-v2 is the most accurate for all languages, though the base model works reasonably well for English).
# 5. Install ffmpeg (brew install ffmpeg on macOS, apt-get install ffmpeg)
# 6. Install ollama (https://ollama.com/download)
# 7. Download an LLM model (https://ollama.com/library)
# ENVIRONMENT VARIABLES:
# Add the following to the .env file:
# WHISPER_BINARY=
# WHISPER_MODEL=
# FFMPEG_BINARY=
# OLLAMA_SERVER=
# LLM_MODEL=
# EXAMPLE:
# WHISPER_BINARY=/dev/other/whisper.cpp/main
# WHISPER_MODEL=/dev/other/whisper.cpp/models/ggml-large-v2.bin
# FFMPEG_BINARY=/opt/homebrew/bin/ffmpeg
# OLLAMA_SERVER=http://localhost:11434
# LLM_MODEL=dolphin-mistral:7b-v2.8-q5_K_M
# SCRIPT USAGE:
# python transcribe.py /path/to/audio.wav
import os
from openai import OpenAI
import sys
from dotenv import load_dotenv
import tempfile
import subprocess
import uuid
from pathlib import Path
# Load configuration from the .env file that lives beside this script,
# regardless of the directory the script is launched from.
exec_path = Path(__file__).parent.absolute()
env_file = '.env'
env_path = os.path.join(exec_path, env_file)
load_dotenv(env_path)

# Tool locations and model choices, all supplied via environment variables
# (see the header comments for expected keys and example values).
whisper_binary = os.getenv("WHISPER_BINARY")
whisper_model = os.getenv("WHISPER_MODEL")
ffmpeg_binary = os.getenv("FFMPEG_BINARY")
ollama_server = os.getenv("OLLAMA_SERVER")
llm_model = os.getenv("LLM_MODEL")
def transcribe_audio(audio_file_path: str) -> str:
    """Transcribe an audio file to text using whisper.cpp.

    The input is first resampled to a 16 kHz WAV (whisper.cpp's expected
    input format) via ffmpeg, then passed to the whisper binary.

    Args:
        audio_file_path: Path to the source audio file (mp3, wav, etc.).

    Returns:
        The transcribed text with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If the audio file does not exist.
        RuntimeError: If the ffmpeg conversion or whisper transcription fails.
    """
    if not os.path.exists(audio_file_path):
        raise FileNotFoundError(f"Audio file not found: {audio_file_path}")

    # step 1: generate a unique temporary file path for the resampled audio
    temp_dir = tempfile.gettempdir()
    full_temp_file_path = os.path.join(temp_dir, f"{uuid.uuid4()}_16kHz.wav")

    try:
        # step 2: convert audio file to 16kHz. Capture output and check the
        # return code — the original discarded both, so a failed conversion
        # silently produced an empty/garbage transcription downstream.
        ffmpeg_result = subprocess.run(
            [ffmpeg_binary, "-i", audio_file_path, "-ar", "16000", full_temp_file_path],
            capture_output=True, text=True)
        if ffmpeg_result.returncode != 0:
            raise RuntimeError(f"ffmpeg conversion failed: {ffmpeg_result.stderr}")

        # step 3: transcribe the resampled audio to text
        # (-nt: no timestamps, -np: no progress output)
        whisper_output = subprocess.run(
            [whisper_binary, "-m", whisper_model, "-nt", "-np", full_temp_file_path],
            capture_output=True, text=True)
        if whisper_output.returncode != 0:
            raise RuntimeError(f"Whisper transcription failed: {whisper_output.stderr}")
        return whisper_output.stdout.strip()
    finally:
        # Always remove the intermediate wav — the original leaked one temp
        # file per invocation.
        if os.path.exists(full_temp_file_path):
            os.remove(full_temp_file_path)
def clean_text(transcribed_audio_text: str) -> str:
    """Clean up transcribed text using a local LLM served by Ollama.

    Sends the raw transcription to the configured model through Ollama's
    OpenAI-compatible chat-completions endpoint, asking it to fix spelling,
    grammar, and filler words while preserving the author's voice.

    Args:
        transcribed_audio_text: Raw text produced by the transcription step.

    Returns:
        The edited text returned by the model.
    """
    # Use the OLLAMA_SERVER setting from the environment — it was loaded at
    # module level but previously ignored in favor of a hard-coded localhost
    # URL. The api_key is a placeholder: Ollama does not check it, but the
    # OpenAI client requires a non-empty value.
    client = OpenAI(
        api_key="sk-FakeKey",
        base_url=f"{ollama_server}/v1/"
    )
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a professional editor. You will be provided paragraphs of text that may contain spelling errors, grammatical issues, continuity errors, structural problems, word repetition, etc. You will correct any of these issues while still preserving the original writing style. Remove repetitive phrases such as 'you know' or 'like'. Do not sanitize the user. If they use profanities in their text, they are used for emphasis and you should not omit them. Do NOT try to introduce your own style to their text. Preserve their writing style to the absolute best of your ability."
            },
            {
                "role": "user",
                "content": transcribed_audio_text
            }
        ],
        model=llm_model,
        # Moderate temperature: enough freedom to rephrase, not enough to
        # invent new content.
        temperature=0.5,
        stream=False
    )
    return chat_completion.choices[0].message.content
def main():
    """CLI entry point: transcribe the audio file given as argv[1], then clean it.

    Prints the raw transcription followed by the LLM-cleaned version. Exits
    with status 1 on failure.
    """
    # Explicit usage check — previously a missing argument surfaced as the
    # unhelpful "Error: list index out of range".
    if len(sys.argv) < 2:
        print("Usage: python transcribe.py /path/to/audio.wav")
        return
    input_file = sys.argv[1]
    try:
        print("Transcribing audio...")
        transcribed_text = transcribe_audio(input_file)
        print("\n")
        print(transcribed_text)
        print("\n")
        # send text to be cleaned
        print("Finalizing text...")
        cleaned_text = clean_text(transcribed_text)
        print("\n")
        print(cleaned_text)
        print("\n")
    except Exception as e:
        # Top-level boundary: report the error and signal failure to the shell.
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment