Created
December 30, 2023 04:44
-
-
Save aminnj/dda66cdb159e94a478c9719f64846d83 to your computer and use it in GitHub Desktop.
STT on Mac using Speech framework
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from datetime import datetime, timedelta | |
from Speech import SFSpeechRecognizer, SFSpeechAudioBufferRecognitionRequest | |
from AppKit import NSRunLoop | |
from PyObjCTools import AppHelper | |
import AVFoundation | |
# https://developer.apple.com/documentation/speech/recognizing_speech_in_live_audio
# https://github.com/SKaplanOfficial/PyXA/blob/11c7d4db4623b91415bd962d41ff3747a2808163/PyXA/Additions/Speech.py#L85
# Install the PyObjC interrupt handler at import time so the script can be
# stopped with Ctrl-C.
AppHelper.installMachInterrupt()
class STT:
    """Live speech-to-text using the macOS Speech framework via PyObjC.

    Constructing an instance requests speech-recognition authorization,
    starts an ``AVAudioEngine`` and installs a tap on its input node.
    Calling :meth:`run` creates the recognizer and the recognition request
    and then spins the current run loop so that partial transcriptions are
    delivered to ``live_panel``.

    Args:
        live_panel: Optional object exposing ``update(text)``. Each
            transcription received is pushed to it. When ``None``,
            transcriptions are received but not displayed.
    """

    def __init__(self, live_panel=None):
        # These are created by _setup(). They are defined up front because
        # the audio tap installed below can call _process_buffer_callback
        # before run() / _setup() has executed.
        self.recognizer = None
        self.recognition_request = None
        self.recognition_task = None
        self.live_panel = live_panel

        SFSpeechRecognizer.requestAuthorization_(None)

        self.audio_session = AVFoundation.AVAudioSession.sharedInstance()
        self.audio_engine = AVFoundation.AVAudioEngine.alloc().init()
        self.input_node = self.audio_engine.inputNode()
        self.recording_format = self.input_node.outputFormatForBus_(0)

        self.audio_engine.prepare()
        # NOTE(review): the result of starting the engine is not inspected,
        # so a failure to start goes unreported here.
        self.audio_engine.startAndReturnError_(None)
        self.input_node.installTapOnBus_bufferSize_format_block_(
            0, 1024, self.recording_format, self._process_buffer_callback
        )

    def _process_buffer_callback(self, buffer, timing):
        """Forward one tapped audio buffer to the active recognition request.

        Buffers that arrive before ``_setup()`` has created a request are
        dropped instead of raising ``AttributeError``.
        """
        request = self.recognition_request
        if request is None:
            return
        request.appendAudioPCMBuffer_(buffer)

    def _process_speech_detection(self, result, error):
        """Handle a recognition callback.

        Prints the error and returns when ``error`` is set. Otherwise the
        formatted string of the best transcription is sent to
        ``live_panel`` (if one was supplied).
        """
        if error is not None:
            print(f"Error! {error}")
            return
        if result is None:
            # Nothing to display; avoid dereferencing a missing result.
            return
        best = result.bestTranscription()
        if self.live_panel is not None:
            self.live_panel.update(best.formattedString())

    def _setup(self):
        """Create the recognizer, the streaming request and the task."""
        self.recognizer = SFSpeechRecognizer.alloc().init()

        request = SFSpeechAudioBufferRecognitionRequest.alloc().init()
        request.setShouldReportPartialResults_(True)
        request.setAddsPunctuation_(True)
        # Publish the request only once it is fully configured, since the
        # audio tap starts appending buffers as soon as it is visible.
        self.recognition_request = request

        self.recognition_task = (
            self.recognizer.recognitionTaskWithRequest_resultHandler_(
                request, self._process_speech_detection
            )
        )

    def run(self, duration=30):
        """Start recognition and run the current run loop.

        Args:
            duration: How long to keep the run loop running, in seconds.
                Defaults to 30, the previously hard-coded value.
        """
        self._setup()
        deadline = datetime.now() + timedelta(seconds=duration)
        NSRunLoop.currentRunLoop().runUntilDate_(deadline)
if __name__ == "__main__":
    from rich.live import Live

    # Use the live display as a context manager so it is stopped even when
    # constructing STT or running recognition raises (for example on
    # Ctrl-C), instead of relying on a trailing live.stop() call.
    with Live() as live:
        stt = STT(live_panel=live)
        stt.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment