Skip to content

Instantly share code, notes, and snippets.

@aminnj
Created December 30, 2023 04:44
Show Gist options
  • Save aminnj/dda66cdb159e94a478c9719f64846d83 to your computer and use it in GitHub Desktop.
Save aminnj/dda66cdb159e94a478c9719f64846d83 to your computer and use it in GitHub Desktop.
STT on Mac using Speech framework
from datetime import datetime, timedelta
from Speech import SFSpeechRecognizer, SFSpeechAudioBufferRecognitionRequest
from AppKit import NSRunLoop
from PyObjCTools import AppHelper
import AVFoundation
# https://developer.apple.com/documentation/speech/recognizing_speech_in_live_audio
# https://github.com/SKaplanOfficial/PyXA/blob/11c7d4db4623b91415bd962d41ff3747a2808163/PyXA/Additions/Speech.py#L85
# Install PyObjC's interrupt handler at import time so the script can be
# stopped with Ctrl-C (the run loop started in STT.run() blocks otherwise).
AppHelper.installMachInterrupt()
class STT(object):
    """Live speech-to-text built on Apple's Speech framework via PyObjC.

    Audio buffers delivered by a tap on the ``AVAudioEngine`` input node are
    appended to an ``SFSpeechAudioBufferRecognitionRequest``; every
    transcription reported back by the recognizer is forwarded to
    ``live_panel.update``.
    """

    def __init__(self, live_panel=None):
        """Start the audio engine and install the input tap.

        Args:
            live_panel: Optional object exposing ``update(text)``; the script
                entry point passes a ``rich.live.Live``. When ``None``,
                transcriptions are not displayed.
        """
        self.live_panel = live_panel
        # The recognition objects are created by _setup(). They are
        # initialised here because the tap below can deliver buffers before
        # run() has called _setup().
        self.recognizer = None
        self.recognition_request = None
        self.recognition_task = None

        # NOTE(review): no completion handler is passed, so the outcome of the
        # authorization request is not observed here.
        SFSpeechRecognizer.requestAuthorization_(None)
        self.audio_session = AVFoundation.AVAudioSession.sharedInstance()
        self.audio_engine = AVFoundation.AVAudioEngine.alloc().init()
        self.input_node = self.audio_engine.inputNode()
        self.recording_format = self.input_node.outputFormatForBus_(0)
        self.audio_engine.prepare()
        self.audio_engine.startAndReturnError_(None)
        self.input_node.installTapOnBus_bufferSize_format_block_(
            0, 1024, self.recording_format, self._process_buffer_callback
        )

    def _process_buffer_callback(self, buffer, timing):
        """Feed one tapped audio buffer to the active recognition request.

        Buffers that arrive before ``_setup()`` has created the request are
        dropped instead of raising ``AttributeError``.
        """
        request = self.recognition_request
        if request is None:
            return
        request.appendAudioPCMBuffer_(buffer)

    def _process_speech_detection(self, result, error):
        """Handle a recognizer callback: report errors or show the transcript.

        Args:
            result: Recognition result exposing ``bestTranscription()``, or
                ``None`` when nothing was produced.
            error: Error object from the recognizer, or ``None`` on success.
        """
        if error is not None:
            print(f"Error! {error}")
            return
        if result is None or self.live_panel is None:
            return
        best = result.bestTranscription()
        self.live_panel.update(best.formattedString())

    def _setup(self):
        """Create the recognizer, the buffer request, and the recognition task."""
        self.recognizer = SFSpeechRecognizer.alloc().init()
        request = SFSpeechAudioBufferRecognitionRequest.alloc().init()
        request.setShouldReportPartialResults_(True)
        request.setAddsPunctuation_(True)
        # Publish the request only once it is fully configured; from this
        # point the tap callback starts appending buffers to it.
        self.recognition_request = request
        self.recognition_task = (
            self.recognizer.recognitionTaskWithRequest_resultHandler_(
                request, self._process_speech_detection
            )
        )

    def run(self, duration=30):
        """Recognize speech while running the current run loop.

        Args:
            duration: Number of seconds to keep the run loop running
                (default 30, the previously hard-coded value).
        """
        self._setup()
        deadline = datetime.now() + timedelta(seconds=duration)
        NSRunLoop.currentRunLoop().runUntilDate_(deadline)
if __name__ == "__main__":
    from rich.live import Live

    # Use Live as a context manager so the live display is stopped even when
    # constructing STT or running it raises.
    with Live() as live:
        stt = STT(live_panel=live)
        stt.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment