Created
February 25, 2024 02:35
-
-
Save max-arnold/02c381bdf9f787aff83c009659ebf659 to your computer and use it in GitHub Desktop.
Continuous voice recognition in Emacs using Yandex Speechkit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Based on https://cloud.yandex.ru/ru/docs/speechkit/stt/api/microphone-streaming | |
import pyaudio | |
import wave | |
import argparse | |
import grpc | |
from datetime import datetime | |
import time | |
import queue | |
import sys | |
import yandex.cloud.ai.stt.v3.stt_pb2 as stt_pb2 | |
import yandex.cloud.ai.stt.v3.stt_service_pb2_grpc as stt_service_pb2_grpc | |
# Audio capture parameters. These must stay in sync with the RawAudio
# options sent to the SpeechKit API in reqiter() (LINEAR16_PCM, mono, 48 kHz).
FORMAT = pyaudio.paInt16  # 16-bit signed PCM samples
CHANNELS = 1  # mono capture
RATE = 48000  # sample rate in Hz
CHUNK = 4096  # frames per buffer delivered to the stream callback
RECORD_SECONDS = 60  # length of one streaming session before reconnecting
def save_frames(audio, frames):
    """Write captured raw PCM chunks to a timestamped WAV file.

    audio:  the pyaudio.PyAudio instance (queried for the sample width).
    frames: list of raw byte chunks as delivered by the input stream.

    Does nothing when *frames* is empty.
    """
    # BUG FIX: the guard used to read `if frames and False:` which made the
    # whole function a permanent no-op (dead code). Drop the `and False` so
    # recordings are actually written.
    if not frames:
        return
    now = datetime.now().strftime('%Y%m%d%H%M%S')
    # Context manager guarantees the file is closed even if a write fails.
    with wave.open(f"{now}.wav", 'wb') as wave_file:
        wave_file.setnchannels(CHANNELS)
        wave_file.setsampwidth(audio.get_sample_size(FORMAT))
        wave_file.setframerate(RATE)
        wave_file.writeframes(b''.join(frames))
def reqiter(audio_queue, frames, lang):
    """Yield StreamingRequest messages for the SpeechKit recognizer.

    The first request carries the session options; subsequent requests carry
    raw microphone chunks taken from *audio_queue* for roughly
    RECORD_SECONDS.  Each chunk is also appended to *frames* as a side
    effect so the caller can persist the audio afterwards.

    lang is a two-letter hint ('ru'/'en'); anything else falls back to en-US.
    """
    lang_code = {'ru': 'ru-RU', 'en': 'en-US'}.get(lang, 'en-US')
    session_options = stt_pb2.StreamingOptions(
        recognition_model=stt_pb2.RecognitionModelOptions(
            audio_format=stt_pb2.AudioFormatOptions(
                raw_audio=stt_pb2.RawAudio(
                    audio_encoding=stt_pb2.RawAudio.LINEAR16_PCM,
                    sample_rate_hertz=RATE,
                    audio_channel_count=1,
                )
            ),
            text_normalization=stt_pb2.TextNormalizationOptions(
                text_normalization=stt_pb2.TextNormalizationOptions.TEXT_NORMALIZATION_ENABLED,
                profanity_filter=True,
                literature_text=False,
            ),
            language_restriction=stt_pb2.LanguageRestrictionOptions(
                restriction_type=stt_pb2.LanguageRestrictionOptions.WHITELIST,
                language_code=[lang_code],
            ),
            audio_processing_type=stt_pb2.RecognitionModelOptions.REAL_TIME,
        )
    )
    yield stt_pb2.StreamingRequest(session_options=session_options)

    sys.stderr.write("Recording...\n")
    deadline = time.monotonic() + RECORD_SECONDS
    while time.monotonic() < deadline:
        chunk = audio_queue.get()
        frames.append(chunk)
        yield stt_pb2.StreamingRequest(chunk=stt_pb2.AudioChunk(data=chunk))
def run(token, lang):
    """Stream microphone audio to Yandex SpeechKit and print recognition
    events as JSON lines on stdout.

    token: API key placed in the `authorization` metadata header.
    lang:  two-letter language hint ('ru'/'en'), forwarded to reqiter().

    Loops forever, opening a fresh gRPC channel per RECORD_SECONDS-long
    session, until interrupted with Ctrl-C.
    """
    audio = pyaudio.PyAudio()
    sys.stderr.write(str(audio.get_default_input_device_info()) + "\n")
    audio_queue = queue.Queue()

    def callback(input_data, frame_count, time_info, status_flag):
        # Runs on PyAudio's capture thread; hand the chunk to the consumer
        # without ever blocking the audio pipeline.
        audio_queue.put_nowait(input_data)
        return (input_data, pyaudio.paContinue)

    stream = audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
        stream_callback=callback,
    )
    # BUG FIX: initialize before the `try` so the `finally` below cannot hit
    # a NameError if an interrupt lands before the first loop iteration.
    frames = []
    try:
        while True:
            frames = []
            cred = grpc.ssl_channel_credentials()
            channel = grpc.secure_channel('stt.api.cloud.yandex.net:443', cred)
            try:
                stub = stt_service_pb2_grpc.RecognizerStub(channel)
                it = stub.RecognizeStreaming(reqiter(audio_queue, frames, lang), metadata=(
                    ('authorization', f'Api-Key {token}'),
                ))
                try:
                    for r in it:
                        event_type, alternatives = r.WhichOneof('Event'), None
                        if event_type == 'partial' and len(r.partial.alternatives) > 0:
                            alternatives = [a.text for a in r.partial.alternatives]
                        if event_type == 'final':
                            alternatives = [a.text for a in r.final.alternatives]
                        if event_type == 'final_refinement':
                            alternatives = [a.text for a in r.final_refinement.normalized_text.alternatives]
                        if alternatives and alternatives[0]:
                            print(f'{{"event": "{event_type}", "text": "{alternatives[0]}"}}')
                        if event_type == "eou_update":
                            print('{"event": "eou"}')
                except grpc.RpcError as err:
                    # BUG FIX: catch the public grpc.RpcError and use its
                    # code()/details() accessors instead of the private
                    # grpc._channel._Rendezvous / err._state internals.
                    sys.stderr.write(f'Error code {err.code()}, message: {err.details()}' + "\n")
                    raise
            finally:
                # BUG FIX: close the channel each session; the original
                # leaked one channel per reconnect.
                channel.close()
            save_frames(audio, frames)
    except KeyboardInterrupt:
        pass
    finally:
        save_frames(audio, frames)
        stream.stop_stream()
        stream.close()
        audio.terminate()
if __name__ == '__main__':
    # CLI entry point: the API credential is mandatory, the language hint
    # is optional (reqiter falls back to en-US).
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--token', required=True, help='API key or IAM token')
    arg_parser.add_argument('-l', dest="lang", help='Language')
    cli_args = arg_parser.parse_args()
    run(cli_args.token, cli_args.lang)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Wrapper: run the listener with the project's virtualenv Python.
# "XXX" is a placeholder -- substitute your real SpeechKit API key.
# NOTE(review): $@ is left UNQUOTED on purpose here -- the Emacs caller
# passes "-l ru" as a single process argument and relies on shell word
# splitting to break it into two; confirm the caller before quoting this.
~/.virtualenvs/speechkit-listener/bin/python listen.py --token XXX $@
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Based on https://sachachua.com/blog/2023/12/live-speech-with-deepgram/ | |
;; sqlite3 ~/Library/Application\ Support/com.apple.TCC/TCC.db | |
;; insert into access values ('kTCCServiceMicrophone', 'org.gnu.Emacs', 0, 2, 4, 1, null, null, null, 'UNUSED', null, null, 1704007344); | |
(defvar my/speechkit-buffer "*Speech*"
  "Name of the default buffer where recognized speech is inserted.")
(defvar my/speechkit-current-buffer nil
  "Buffer currently receiving captions (set by `my/speechkit-start').")
(defvar my/speechkit-process nil
  "The live listener subprocess, or nil when not running.")
(defvar my/speechkit-stdout-buffer "*Speech JSON*"
  "Buffer accumulating raw JSON lines emitted by the listener.")
(defvar my/speechkit-stderr-buffer "*Speech stderr*"
  "Buffer receiving the listener's stderr output.")
(defvar my/speechkit-auto-scroll t
  "Non-nil to keep the speech window scrolled to the latest text.")
(defvar my/speechkit--change-group nil
  "Internal change-group handle; reserved for undo grouping.")
(defun my/speechkit-start (&optional arg)
  "Turn on live captions.
With prefix ARG, insert captions into the current buffer instead of
the dedicated `my/speechkit-buffer'.  The recognition language is
derived from the active input method (cyrillic => Russian)."
  (interactive "P")
  ;; BUG FIX: pass "-l" and the language code as two separate arguments.
  ;; The old single "-l ru" string only worked because run-listen.sh left
  ;; $@ unquoted and shell word splitting broke it apart.
  (let ((lang (if (and current-input-method
                       (string-match "cyrillic" current-input-method))
                  "ru"
                "en")))
    (with-current-buffer (if arg (current-buffer) (get-buffer-create my/speechkit-buffer))
      (setq my/speechkit-current-buffer (current-buffer))
      (unless (process-live-p my/speechkit-process)
        (let ((default-directory "~/play/speechkit"))
          ;; Start each session with a clean JSON accumulation buffer so the
          ;; filter never sees stale half-lines from a previous run.
          (with-current-buffer (get-buffer-create my/speechkit-stdout-buffer)
            (erase-buffer))
          (setq my/speechkit-process
                (make-process
                 :command `("bash" "run-listen.sh" "-l" ,lang)
                 :name "speech"
                 :filter #'my/speechkit-json-filter
                 :sentinel #'my/speechkit-process-sentinel
                 :buffer my/speechkit-stdout-buffer
                 :stderr my/speechkit-stderr-buffer))))
      (display-buffer (current-buffer)))))
(defun my/speechkit-stop ()
  "Turn off live captions by interrupting the listener process, if alive."
  (interactive)
  (when (process-live-p my/speechkit-process)
    (interrupt-process my/speechkit-process)))
(defun my/speechkit-process-sentinel (proc event)
  "Stop captioning once PROC reports a \"finished\" EVENT string."
  (if (string-match "finished" event)
      (my/speechkit-stop)))
(defun my/speechkit-json-filter (proc string)
  "Process filter: accumulate STRING from PROC and consume complete lines.
Output is buffered in PROC's buffer; every newline-terminated line is
parsed as a JSON object and forwarded to
`my/speechkit-display-in-speech-buffer', then removed from the buffer.
Partial lines stay in the buffer until the next chunk arrives."
  (when (buffer-live-p (process-buffer proc))
    (with-current-buffer (process-buffer proc)
      (let* ((proc-mark (process-mark proc))
             ;; Remember whether point was at the insertion mark so we can
             ;; keep following the output (standard filter idiom).
             (moving (= (point) proc-mark)))
        ;; insert the output
        (save-excursion
          (goto-char proc-mark)
          (insert string)
          (set-marker proc-mark (point)))
        (if moving (goto-char proc-mark))
        ;; process and remove all complete lines of JSON (lines are complete if ending with \n)
        (let ((pos (point-min)))
          ;; Loop while the text starting at `pos' forms a full line, i.e.
          ;; `end-of-line' lands just before a newline character.
          (while (progn (goto-char pos)
                        (end-of-line)
                        (equal (following-char) ?\n))
            (let* ((end (point))
                   (line (buffer-substring pos end)))
              ;; Delete the line plus its trailing newline before parsing,
              ;; so a parse error cannot reprocess the same line forever.
              (delete-region pos (+ end 1))
              (my/speechkit-display-in-speech-buffer (json-parse-string line :object-type 'alist)))))))))
(defun my/speechkit-display-in-speech-buffer (json-object)
  "Render one recognition event (JSON-OBJECT, an alist) in the speech buffer.
\"partial\", \"final\" and \"final_refinement\" events replace the current
line with the recognized text; \"eou\" (end of utterance) starts a new line."
  (with-current-buffer my/speechkit-current-buffer
    (let-alist json-object
      (when (equal .event "eou")
        (end-of-line)
        (unless (bolp)
          (insert "\n")))
      ;; partial and final behave identically: rewrite the current line.
      (when (member .event '("partial" "final"))
        (my/speechkit--clear-line)
        (insert .text))
      (when (equal .event "final_refinement")
        (my/speechkit--clear-line)
        (insert .text "\n")
        (end-of-line))
      ;; BUG FIX: only move the window point when this buffer is actually
      ;; displayed -- (set-window-point nil ...) would act on the selected
      ;; window, which may be showing an unrelated buffer.
      (let ((win (get-buffer-window (current-buffer))))
        (when win
          (set-window-point win (point)))))))

(defun my/speechkit--clear-line ()
  "Delete the current line's text without touching the kill ring.
BUG FIX: the previous `kill-line' clobbered the user's kill ring on
every partial recognition update."
  (beginning-of-line)
  (unless (eolp)
    (delete-region (point) (line-end-position))))
(defun my/speechkit-toggle-listen (&optional arg)
  "Start or stop live captions depending on whether the listener runs.
ARG is forwarded to `my/speechkit-start' (prefix = caption into the
current buffer)."
  (interactive "P")
  (cond
   ((process-live-p my/speechkit-process)
    (my/speechkit-stop)
    (message "SpeechKit stopped"))
   (t
    (my/speechkit-start arg)
    (message "SpeechKit started"))))
(provide 'speechkit)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment