Created
July 13, 2025 22:47
-
-
Save tivrfoa/a62366cb0c0322c81cef29163f3ad080 to your computer and use it in GitHub Desktop.
Transcribing Audio Files using Python speech_recognition and ffmpeg, by ChatGPT
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import speech_recognition as sr | |
import sys | |
import os | |
import subprocess | |
import tempfile | |
def convert_to_wav(input_file):
    """Convert input audio to a temporary WAV file using ffmpeg.

    The output is mono, 16 kHz. The temporary file is created with
    ``delete=False`` so it outlives this function; the caller owns it and
    is responsible for removing it once done.

    Args:
        input_file: Path of the audio file to convert.

    Returns:
        Path of the newly created temporary ``.wav`` file.

    Raises:
        SystemExit: With exit code 1 if ffmpeg cannot be started or exits
            with a non-zero status. The temporary file is removed first so
            a failed conversion leaves nothing behind.
    """
    # Only the reserved name is needed; ffmpeg writes the content itself,
    # so the handle is closed right away.
    tmp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp_wav.close()

    command = [
        "ffmpeg", "-y",   # overwrite the (empty) file reserved above
        "-i", input_file,
        "-ac", "1",       # mono
        "-ar", "16000",   # 16kHz sample rate
        tmp_wav.name,
    ]

    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except (subprocess.CalledProcessError, OSError):
        # CalledProcessError: ffmpeg ran but failed.
        # OSError: ffmpeg could not be launched at all (e.g. not installed).
        # Either way the reserved temp file is useless, so do not leak it.
        try:
            os.remove(tmp_wav.name)
        except OSError:
            pass  # best effort: the failure is reported below regardless
        print("β ffmpeg failed to convert the audio file.")
        sys.exit(1)

    return tmp_wav.name
# Script entry: transcribe the audio file named by the first CLI argument.
if len(sys.argv) < 2:
    print("Usage: python3 speech1.py <audio_file>")
    sys.exit(1)

input_path = sys.argv[1]
file_ext = os.path.splitext(input_path)[1].lower()

# Anything that is not already a .wav file is converted to a temporary WAV
# first; that temporary file must be deleted when the script is done.
cleanup_needed = file_ext != ".wav"
if cleanup_needed:
    print(f"π Converting {file_ext} to .wav...")
    input_path = convert_to_wav(input_path)

try:
    recognizer = sr.Recognizer()
    with sr.AudioFile(input_path) as source:
        audio = recognizer.record(source)

    print("π Transcribing...")
    try:
        print(recognizer.recognize_google(audio))
    except sr.UnknownValueError:
        print("π Could not understand the audio.")
    except sr.RequestError as e:
        print(f"π API error: {e}")
finally:
    # Clean up temp file. Done in ``finally`` so the converted WAV is removed
    # even when reading or transcribing the audio raises unexpectedly.
    if cleanup_needed:
        os.remove(input_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It works with many languages; pass the language code via the `language` argument, e.g.:
recognizer.recognize_google(audio, language="zh-CN")
https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages