Created
July 17, 2023 01:12
-
-
Save arctic-hen7/5580d5452f77d4e6b206caf90f8d73c6 to your computer and use it in GitHub Desktop.
A Python script that records audio from the microphone into a temporary file until the user presses Enter, and that then transcribes that audio using Whisper. A plug-and-play way of adding voice non-real-time voice command to Python apps!
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import whisper | |
import pyaudio | |
import wave | |
import sys | |
import tempfile | |
from ctypes import * | |
# Load the Whisper model once | |
model = whisper.load_model("base.en") | |
# Records audio directly from the microphone and then transcribes it to text using Whisper, returning that transcription. | |
def transcribe_directly(): | |
# Create a temporary file to store the recorded audio (this will be deleted once we've finished transcription) | |
temp_file = tempfile.NamedTemporaryFile(suffix=".wav") | |
sample_rate = 16000 | |
bits_per_sample = 16 | |
chunk_size = 1024 | |
audio_format = pyaudio.paInt16 | |
channels = 1 | |
def callback(in_data, frame_count, time_info, status): | |
wav_file.writeframes(in_data) | |
return None, pyaudio.paContinue | |
# Open the wave file for writing | |
wav_file = wave.open(temp_file.name, 'wb') | |
wav_file.setnchannels(channels) | |
wav_file.setsampwidth(bits_per_sample // 8) | |
wav_file.setframerate(sample_rate) | |
# Suppress ALSA warnings (https://stackoverflow.com/a/13453192) | |
ERROR_HANDLER_FUNC = CFUNCTYPE(None, c_char_p, c_int, c_char_p, c_int, c_char_p) | |
def py_error_handler(filename, line, function, err, fmt): | |
return | |
c_error_handler = ERROR_HANDLER_FUNC(py_error_handler) | |
asound = cdll.LoadLibrary('libasound.so') | |
asound.snd_lib_error_set_handler(c_error_handler) | |
# Initialize PyAudio | |
audio = pyaudio.PyAudio() | |
# Start recording audio | |
stream = audio.open(format=audio_format, | |
channels=channels, | |
rate=sample_rate, | |
input=True, | |
frames_per_buffer=chunk_size, | |
stream_callback=callback) | |
input("Press Enter to stop recording...") | |
# Stop and close the audio stream | |
stream.stop_stream() | |
stream.close() | |
audio.terminate() | |
# Close the wave file | |
wav_file.close() | |
# And transcribe the audio to text (suppressing warnings about running on a CPU) | |
result = model.transcribe(temp_file.name, fp16=False) | |
temp_file.close() | |
return result["text"].strip() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment