Skip to content

Instantly share code, notes, and snippets.

@max-arnold
Created February 25, 2024 02:35
Show Gist options
  • Save max-arnold/02c381bdf9f787aff83c009659ebf659 to your computer and use it in GitHub Desktop.
Save max-arnold/02c381bdf9f787aff83c009659ebf659 to your computer and use it in GitHub Desktop.
Continuous voice recognition in Emacs using Yandex Speechkit
# Based on https://cloud.yandex.ru/ru/docs/speechkit/stt/api/microphone-streaming
import pyaudio
import wave
import argparse
import grpc
from datetime import datetime
import time
import queue
import sys
import yandex.cloud.ai.stt.v3.stt_pb2 as stt_pb2
import yandex.cloud.ai.stt.v3.stt_service_pb2_grpc as stt_service_pb2_grpc
# Microphone capture parameters; RATE must match the sample_rate_hertz
# advertised to the SpeechKit API in reqiter().
FORMAT = pyaudio.paInt16  # 16-bit signed PCM samples
CHANNELS = 1  # mono capture
RATE = 48000  # sample rate, Hz
CHUNK = 4096  # frames per buffer delivered by the PyAudio callback
RECORD_SECONDS = 60  # max length of one streaming session before reconnect
def save_frames(audio, frames, enabled=False):
    """Write captured audio frames to a timestamped WAV file.

    The original code hard-disabled this debug feature with
    ``if frames and False``; the switch is now an explicit parameter that
    defaults to off, so existing two-argument callers keep the old
    (no-op) behavior.

    audio   -- pyaudio.PyAudio instance, used only for the sample width.
    frames  -- list of raw PCM byte chunks; nothing is written if empty.
    enabled -- pass True to actually save the recording.
    """
    if not (frames and enabled):
        return
    now = datetime.now().strftime('%Y%m%d%H%M%S')
    # Context manager guarantees the file is closed even if a write fails.
    with wave.open(f"{now}.wav", 'wb') as wave_file:
        wave_file.setnchannels(CHANNELS)
        wave_file.setsampwidth(audio.get_sample_size(FORMAT))
        wave_file.setframerate(RATE)
        wave_file.writeframes(b''.join(frames))
def reqiter(audio_queue, frames, lang):
    """Generate the SpeechKit streaming request sequence.

    Yields one session-options request, then audio-chunk requests pulled
    from audio_queue for up to RECORD_SECONDS. Each chunk is also appended
    to the caller-owned `frames` list as a side effect.
    """
    language = {'ru': 'ru-RU', 'en': 'en-US'}.get(lang, 'en-US')

    pcm_format = stt_pb2.AudioFormatOptions(
        raw_audio=stt_pb2.RawAudio(
            audio_encoding=stt_pb2.RawAudio.LINEAR16_PCM,
            sample_rate_hertz=RATE,
            audio_channel_count=1,
        )
    )
    normalization = stt_pb2.TextNormalizationOptions(
        text_normalization=stt_pb2.TextNormalizationOptions.TEXT_NORMALIZATION_ENABLED,
        profanity_filter=True,
        literature_text=False,
    )
    restriction = stt_pb2.LanguageRestrictionOptions(
        restriction_type=stt_pb2.LanguageRestrictionOptions.WHITELIST,
        language_code=[language],
    )
    session_options = stt_pb2.StreamingOptions(
        recognition_model=stt_pb2.RecognitionModelOptions(
            audio_format=pcm_format,
            text_normalization=normalization,
            language_restriction=restriction,
            audio_processing_type=stt_pb2.RecognitionModelOptions.REAL_TIME,
        )
    )

    # First message configures the session; subsequent messages carry audio.
    yield stt_pb2.StreamingRequest(session_options=session_options)

    sys.stderr.write("Recording...\n")
    deadline = time.monotonic() + RECORD_SECONDS
    while time.monotonic() < deadline:
        chunk = audio_queue.get()
        frames.append(chunk)
        yield stt_pb2.StreamingRequest(chunk=stt_pb2.AudioChunk(data=chunk))
def run(token, lang):
    """Capture microphone audio and stream it to Yandex SpeechKit until Ctrl-C.

    token -- API key sent in the 'authorization' metadata header.
    lang  -- language hint ('ru' or 'en'); anything else falls back to en-US
             inside reqiter().

    Prints one JSON object per recognition event to stdout; diagnostics go
    to stderr. Reconnects with a fresh gRPC channel after each
    RECORD_SECONDS-long session.
    """
    audio = pyaudio.PyAudio()
    sys.stderr.write(str(audio.get_default_input_device_info()) + "\n")
    audio_queue = queue.Queue()

    def callback(input_data, frame_count, time_info, status_flag):
        # Runs on PyAudio's capture thread: just hand the chunk to the
        # consumer (reqiter) without blocking.
        audio_queue.put_nowait(input_data)
        return (input_data, pyaudio.paContinue)

    stream = audio.open(
        format=FORMAT,
        channels=CHANNELS,
        rate=RATE,
        input=True,
        frames_per_buffer=CHUNK,
        stream_callback=callback,
    )
    # Bind before the try so the finally clause can't raise NameError when
    # a KeyboardInterrupt lands before the first loop iteration.
    frames = []
    try:
        while True:
            frames = []
            cred = grpc.ssl_channel_credentials()
            channel = grpc.secure_channel('stt.api.cloud.yandex.net:443', cred)
            try:
                stub = stt_service_pb2_grpc.RecognizerStub(channel)
                it = stub.RecognizeStreaming(reqiter(audio_queue, frames, lang), metadata=(
                    ('authorization', f'Api-Key {token}'),
                ))
                try:
                    for r in it:
                        event_type, alternatives = r.WhichOneof('Event'), None
                        if event_type == 'partial' and len(r.partial.alternatives) > 0:
                            alternatives = [a.text for a in r.partial.alternatives]
                        if event_type == 'final':
                            alternatives = [a.text for a in r.final.alternatives]
                        if event_type == 'final_refinement':
                            alternatives = [a.text for a in r.final_refinement.normalized_text.alternatives]
                        if alternatives and alternatives[0]:
                            print(f'{{"event": "{event_type}", "text": "{alternatives[0]}"}}')
                        if event_type == "eou_update":
                            print('{"event": "eou"}')
                except grpc.RpcError as err:
                    # Public API instead of the private grpc._channel._Rendezvous;
                    # streaming errors implement code()/details() via grpc.Call.
                    sys.stderr.write(f'Error code {err.code()}, message: {err.details()}' + "\n")
                    raise
            finally:
                # Don't leak one channel per reconnect iteration.
                channel.close()
            save_frames(audio, frames)
    except KeyboardInterrupt:
        pass
    finally:
        save_frames(audio, frames)
        stream.stop_stream()
        stream.close()
        audio.terminate()
if __name__ == '__main__':
    # CLI entry point: forward the API key and language to the capture loop.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--token', required=True, help='API key or IAM token')
    arg_parser.add_argument('-l', dest="lang", help='Language')
    cli_args = arg_parser.parse_args()
    run(cli_args.token, cli_args.lang)
#!/bin/bash
# Launch the Python listener from its virtualenv, forwarding extra CLI
# arguments (the language flag) to listen.py. Replace XXX with a real API key.
# NOTE(review): $@ is left UNQUOTED on purpose — the Emacs caller passes
# "-l ru" as a single process argument and relies on shell word splitting to
# separate the flag from its value; quoting it as "$@" would break that.
~/.virtualenvs/speechkit-listener/bin/python listen.py --token XXX $@
;; Based on https://sachachua.com/blog/2023/12/live-speech-with-deepgram/
;; sqlite3 ~/Library/Application\ Support/com.apple.TCC/TCC.db
;; insert into access values ('kTCCServiceMicrophone', 'org.gnu.Emacs', 0, 2, 4, 1, null, null, null, 'UNUSED', null, null, 1704007344);
;; Name of the buffer where recognized speech text is displayed.
(defvar my/speechkit-buffer "*Speech*")
;; Buffer object currently receiving transcription output (set on start).
(defvar my/speechkit-current-buffer nil)
;; The listener subprocess object, or nil when not running.
(defvar my/speechkit-process nil)
;; Raw JSON lines from the listener's stdout accumulate here.
(defvar my/speechkit-stdout-buffer "*Speech JSON*")
;; Listener diagnostics (stderr) are routed here.
(defvar my/speechkit-stderr-buffer "*Speech stderr*")
;; NOTE(review): not referenced anywhere in this file chunk — confirm use elsewhere.
(defvar my/speechkit-auto-scroll t)
;; NOTE(review): not referenced anywhere in this file chunk — confirm use elsewhere.
(defvar my/speechkit--change-group nil)
(defun my/speechkit-start (&optional arg)
  "Turn on live captions."
  (interactive "P")
  ;; Pick the recognition language from the active input method: a Cyrillic
  ;; input method implies Russian, anything else defaults to English.
  ;; NOTE(review): "-l ru" is passed as ONE process argument; run-listen.sh
  ;; relies on unquoted $@ word splitting to separate flag and value.
  (let ((lang (if (and current-input-method (string-match "cyrillic" current-input-method)) "-l ru" "-l en")))
    ;; With a prefix ARG, transcribe into the current buffer; otherwise use
    ;; the dedicated *Speech* buffer.
    (with-current-buffer (if arg (current-buffer) (get-buffer-create my/speechkit-buffer))
      (setq my/speechkit-current-buffer (current-buffer))
      ;; Only spawn a listener if one is not already running.
      (unless (process-live-p my/speechkit-process)
        (let ((default-directory "~/play/speechkit"))
          ;; Discard leftover JSON from a previous session.
          (with-current-buffer (get-buffer-create my/speechkit-stdout-buffer)
            (erase-buffer))
          (setq my/speechkit-process
                (make-process
                 :command `("bash" "run-listen.sh" ,lang)
                 :name "speech"
                 :filter 'my/speechkit-json-filter
                 :sentinel #'my/speechkit-process-sentinel
                 :buffer my/speechkit-stdout-buffer
                 :stderr my/speechkit-stderr-buffer))))
      (display-buffer (current-buffer)))))
(defun my/speechkit-stop ()
  "Interrupt the speech listener process if it is still running."
  (interactive)
  (when (process-live-p my/speechkit-process)
    (interrupt-process my/speechkit-process)))
(defun my/speechkit-process-sentinel (proc event)
  "Stop the listener cleanly once PROC reports a \"finished\" EVENT."
  (if (string-match "finished" event)
      (my/speechkit-stop)))
(defun my/speechkit-json-filter (proc string)
  "Process filter for PROC: accumulate STRING and dispatch complete JSON lines.

Inserts STRING at the process mark, then repeatedly removes each
newline-terminated line from the top of the buffer, parses it as JSON
and hands the alist to `my/speechkit-display-in-speech-buffer'.
Partial trailing lines stay in the buffer until the next call."
  (when (buffer-live-p (process-buffer proc))
    (with-current-buffer (process-buffer proc)
      (let* ((proc-mark (process-mark proc))
             ;; Remember whether point was at the mark so we can keep it
             ;; following the output after insertion.
             (moving (= (point) proc-mark)))
        ;; insert the output
        (save-excursion
          (goto-char proc-mark)
          (insert string)
          (set-marker proc-mark (point)))
        (if moving (goto-char proc-mark))
        ;; process and remove all complete lines of JSON (lines are complete if ending with \n)
        (let ((pos (point-min)))
          (while (progn (goto-char pos)
                        (end-of-line)
                        (equal (following-char) ?\n))
            (let* ((end (point))
                   (line (buffer-substring pos end)))
              ;; Consume the line plus its trailing newline before parsing.
              (delete-region pos (+ end 1))
              (my/speechkit-display-in-speech-buffer (json-parse-string line :object-type 'alist)))))))))
(defun my/speechkit-display-in-speech-buffer (json-object)
  "Render one recognition event from JSON-OBJECT into the speech buffer.

Partial and final hypotheses overwrite the current line in place; a
final refinement commits the normalized text and opens a new line; an
end-of-utterance event just ensures the next hypothesis starts on a
fresh line.

Fixes over the original: drops the unused `at-end' binding, merges the
duplicated partial/final branches, and replaces `kill-line' with
`delete-region' so transcription never clobbers the user's kill ring."
  (with-current-buffer my/speechkit-current-buffer
    (let-alist json-object
      (when (equal .event "eou")
        (end-of-line)
        (unless (bolp)
          (insert "\n")))
      (when (member .event '("partial" "final"))
        ;; Replace the in-progress line with the newest hypothesis.
        (beginning-of-line)
        (delete-region (point) (line-end-position))
        (insert .text))
      (when (equal .event "final_refinement")
        ;; Normalized text: commit the line and start a new one.
        (beginning-of-line)
        (delete-region (point) (line-end-position))
        (insert .text "\n")
        (end-of-line))
      ;; Keep the displayed window's point following the inserted text.
      (set-window-point (get-buffer-window (current-buffer)) (point)))))
(defun my/speechkit-toggle-listen (&optional arg)
  "Toggle the speech listener; ARG is forwarded to `my/speechkit-start'."
  (interactive "P")
  (cond ((process-live-p my/speechkit-process)
         (my/speechkit-stop)
         (message "SpeechKit stopped"))
        (t
         (my/speechkit-start arg)
         (message "SpeechKit started"))))
(provide 'speechkit)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment