Last active
October 2, 2024 09:30
-
-
Save scpedicini/455409fe7656d3cca8959c123938f800 to your computer and use it in GitHub Desktop.
Python Dictation Transcription Application
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This script will transcribe an audio file (mp3, wav, etc.) to text and then
# clean the text using a local LLM model via Ollama. Technically, this script
# will work with any LLM that supports the standard OpenAI bindings with minor
# adjustments.
# GETTING STARTED:
# 1. Install required python packages (pip install openai python-dotenv)
# 2. Git clone a copy of ggerganov/whisper (https://github.com/ggerganov/whisper.cpp)
# 3. Build the whisper binary (see the whisper.cpp README for instructions)
# 4. Download one of the whisper models (largev2 is the most accurate for all languages, though the base model works reasonably well for English).
# 5. Install ffmpeg (brew install ffmpeg on macOS, apt-get install ffmpeg)
# 6. Install ollama (https://ollama.com/download)
# 7. Download an LLM model (https://ollama.com/library)
# ENVIRONMENT VARIABLES:
# Add the following to the .env file:
# WHISPER_BINARY=
# WHISPER_MODEL=
# FFMPEG_BINARY=
# OLLAMA_SERVER=
# LLM_MODEL=
# EXAMPLE:
# WHISPER_BINARY=/dev/other/whisper.cpp/main
# WHISPER_MODEL=/dev/other/whisper.cpp/models/ggml-large-v2.bin
# FFMPEG_BINARY=/opt/homebrew/bin/ffmpeg
# OLLAMA_SERVER=http://localhost:11434
# LLM_MODEL=dolphin-mistral:7b-v2.8-q5_K_M
# SCRIPT USAGE:
# python transcribe.py /path/to/audio.wav
import os
import subprocess
import sys
import tempfile
import uuid
from pathlib import Path

from dotenv import load_dotenv
from openai import OpenAI

# Load configuration from a .env file that sits next to this script,
# regardless of the current working directory.
script_dir = Path(__file__).parent.absolute()
load_dotenv(os.path.join(script_dir, ".env"))

# External tool / model configuration (see the header comment for examples).
# Any of these may be None if the corresponding variable is not set.
whisper_binary = os.environ.get("WHISPER_BINARY")
whisper_model = os.environ.get("WHISPER_MODEL")
ffmpeg_binary = os.environ.get("FFMPEG_BINARY")
ollama_server = os.environ.get("OLLAMA_SERVER")
llm_model = os.environ.get("LLM_MODEL")
def transcribe_audio(audio_file_path: str) -> str:
    """Transcribe an audio file to text using the whisper.cpp binary.

    The input is first resampled to 16 kHz WAV with ffmpeg (whisper.cpp
    expects 16 kHz input), then fed to the whisper binary.

    Args:
        audio_file_path: Path to the source audio file (mp3, wav, etc.).

    Returns:
        The transcribed text with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: If the source audio file does not exist.
        RuntimeError: If ffmpeg conversion or whisper transcription fails.
    """
    if not os.path.exists(audio_file_path):
        raise FileNotFoundError(f"Audio file not found: {audio_file_path}")
    # step 1: generate a unique temporary file path for the resampled audio
    temp_dir = tempfile.gettempdir()
    full_temp_file_path = os.path.join(temp_dir, f"{uuid.uuid4()}_16kHz.wav")
    try:
        # step 2: convert audio file to 16kHz,
        # e.g. ffmpeg -i {audio_file_path} -ar 16000 {full_temp_file_path}
        # BUG FIX: the original discarded stderr and never checked the return
        # code, so a failed conversion produced a confusing whisper error later.
        ffmpeg_result = subprocess.run(
            [ffmpeg_binary, "-i", audio_file_path, "-ar", "16000", full_temp_file_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
        )
        if ffmpeg_result.returncode != 0:
            raise RuntimeError(f"ffmpeg conversion failed: {ffmpeg_result.stderr}")
        # step 3: transcribe the audio to text and capture stdout/stderr,
        # e.g. {whisper_binary} -m {whisper_model} -nt -np {full_temp_file_path}
        whisper_output = subprocess.run(
            [whisper_binary, "-m", whisper_model, "-nt", "-np", full_temp_file_path],
            capture_output=True,
            text=True,
        )
        if whisper_output.returncode != 0:
            raise RuntimeError(f"Whisper transcription failed: {whisper_output.stderr}")
        return whisper_output.stdout.strip()
    finally:
        # BUG FIX: the original leaked the temporary resampled WAV; always
        # remove it, even when conversion or transcription raises.
        if os.path.exists(full_temp_file_path):
            os.remove(full_temp_file_path)
def clean_text(transcribed_audio_text: str) -> str:
    """Clean up transcribed text with a local LLM via Ollama's OpenAI-compatible API.

    Args:
        transcribed_audio_text: Raw transcription output to be edited.

    Returns:
        The LLM-edited text.
    """
    # BUG FIX: the base URL was hard-coded to http://localhost:11434/v1/,
    # silently ignoring the OLLAMA_SERVER environment variable that the
    # header documentation tells the user to configure. Use it when set,
    # falling back to the previous default.
    base_url = (
        f"{ollama_server.rstrip('/')}/v1/"
        if ollama_server
        else "http://localhost:11434/v1/"
    )
    client = OpenAI(
        api_key="sk-FakeKey",  # Ollama ignores the key, but the client requires one
        base_url=base_url,
    )
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": "You are a professional editor. You will be provided paragraphs of text that may contain spelling errors, grammatical issues, continuity errors, structural problems, word repetition, etc. You will correct any of these issues while still preserving the original writing style. Remove repetitive phrases such as 'you know' or 'like'. Do not sanitize the user. If they use profanities in their text, they are used for emphasis and you should not omit them. Do NOT try to introduce your own style to their text. Preserve their writing style to the absolute best of your ability."
            },
            {
                "role": "user",
                "content": transcribed_audio_text
            }
        ],
        model=llm_model,
        temperature=0.5,
        stream=False
    )
    return chat_completion.choices[0].message.content
def main():
    """CLI entry point: transcribe the audio file given as argv[1], then clean it.

    Exits with status 1 and a usage message when no input file is supplied.
    """
    # BUG FIX: without this guard, a missing argument surfaced as the cryptic
    # "Error: list index out of range" via the broad except below.
    if len(sys.argv) < 2:
        print("Usage: python transcribe.py /path/to/audio.wav")
        sys.exit(1)
    try:
        input_file = sys.argv[1]
        print("Transcribing audio...")
        transcribed_text = transcribe_audio(input_file)
        print("\n")
        print(transcribed_text)
        print("\n")
        # send text to be cleaned
        print("Finalizing text...")
        cleaned_text = clean_text(transcribed_text)
        print("\n")
        print(cleaned_text)
        print("\n")
    except Exception as e:
        # Top-level boundary of a CLI script: report the failure briefly
        # rather than dumping a traceback at the user.
        print(f"Error: {e}")
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment