# Gist page header (captured from GitHub; not part of the program):
#   Owner: @YourFriendCaspian
#   Forked from srli/stt_py3.py; created July 2, 2018 04:52
#   Gist: YourFriendCaspian/187501213e55025a63da1b95d7fb6d1f
#
# Speech to text with PocketSphinx for Python3
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
import os
import pyaudio
import wave
import audioop
from collections import deque
import time
import math
"""
Written by Sophie Li, 2016
http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
"""
class SpeechDetector:
    """Record phrases from the microphone and decode them with PocketSphinx.

    Audio is read from the default input device in fixed-size chunks. A
    sliding window of per-chunk intensities decides when a phrase starts
    (some chunk in the window is louder than ``THRESHOLD``) and when it ends
    (the whole window is quiet again). Each finished phrase is written to a
    temporary WAV file, decoded, printed, and the file is removed.

    NOTE(review): in the captured source the text between the threshold check
    at the end of ``setup_mic`` and the loudness test inside ``run``'s main
    loop was lost (everything between a ``<`` and a later ``>`` was stripped).
    The tail of ``setup_mic``, the whole of ``save_speech`` and
    ``decode_phrase``, and the preamble of ``run`` are therefore reconstructed
    from their call sites and from the otherwise-unused ``wave``/``time``
    imports. Confirm them against the upstream gist before relying on them.
    """

    def __init__(self, model_dir="pocketsphinx/model"):
        """Set the recording parameters and build the PocketSphinx decoder.

        Args:
            model_dir: Directory holding the PocketSphinx models. It must
                contain ``en-us/en-us``, ``en-us/en-us.lm.bin`` and
                ``en-us/cmudict-en-us.dict``. Adjust it to wherever the
                pocketsphinx folder lives on your machine.
        """
        # Microphone stream config.
        self.CHUNK = 1024  # Amount of audio requested from the mic per read
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000

        # Silence limit in seconds: the maximum stretch of silence tolerated
        # inside a phrase. Once it passes, recording stops and the phrase is
        # decoded.
        self.SILENCE_LIMIT = 1

        # Seconds of audio captured *before* noise was detected that get
        # prepended to the phrase, so its beginning is not chopped off.
        self.PREV_AUDIO = 0.5

        # Intensity above which a chunk counts as sound. Overwritten by
        # setup_mic() once the microphone has been sampled.
        self.THRESHOLD = 4500

        # Not read anywhere in the visible code; kept for compatibility.
        self.num_phrases = -1

        # Create a decoder for the US-English acoustic model, language model
        # and pronunciation dictionary.
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(model_dir, 'en-us/en-us'))
        config.set_string('-lm', os.path.join(model_dir, 'en-us/en-us.lm.bin'))
        config.set_string('-dict',
                          os.path.join(model_dir, 'en-us/cmudict-en-us.dict'))

        # Decoder object used for every phrase.
        self.decoder = Decoder(config)

    @staticmethod
    def _intensity(chunk):
        """Return the intensity measure used for one chunk of raw audio."""
        # NOTE(review): a sample width of 4 is passed although FORMAT is
        # paInt16 (2 bytes per sample). It is kept as-is because the
        # thresholds are presumably tuned to this measure -- confirm before
        # changing. `audioop` was removed from the standard library in
        # Python 3.13, so this needs an older interpreter or a backport.
        return math.sqrt(abs(audioop.avg(chunk, 4)))

    def _open_input_stream(self, p):
        """Open an input stream on *p* using this detector's mic settings."""
        return p.open(format=self.FORMAT,
                      channels=self.CHANNELS,
                      rate=self.RATE,
                      input=True,
                      frames_per_buffer=self.CHUNK)

    def setup_mic(self, num_samples=50):
        """Sample the microphone and calibrate ``self.THRESHOLD``.

        Reads ``num_samples`` chunks and averages the loudest 20% of their
        intensities. Run it while silent (or while talking) to see what
        intensities your microphone produces.

        Args:
            num_samples: Number of chunks to read from the microphone.
        """
        print("Getting intensity values from mic.")
        p = pyaudio.PyAudio()
        try:
            stream = self._open_input_stream(p)
            try:
                values = [self._intensity(stream.read(self.CHUNK))
                          for _ in range(num_samples)]
            finally:
                stream.close()
        finally:
            # Release the audio device even if reading failed.
            p.terminate()

        values.sort(reverse=True)
        # At least one sample, so a small num_samples cannot divide by zero.
        top = max(1, int(num_samples * 0.2))
        r = sum(values[:top]) / top
        print(" Finished ")
        print(" Average audio intensity is %s " % r)

        # NOTE(review): only `if r` survives in the captured source; the
        # comparison and both branches below are reconstructed -- confirm.
        if r < 3000:
            self.THRESHOLD = 3500
        else:
            self.THRESHOLD = r + 100

    def save_speech(self, data, p):
        """Write recorded chunks to a WAV file and return its filename.

        NOTE(review): reconstructed from the call in run(); confirm.

        Args:
            data: Sequence of raw audio chunks (bytes), in playback order.
            p: The PyAudio instance, used to look up the sample width.

        Returns:
            Name of the WAV file created in the current working directory.
        """
        filename = 'output_' + str(int(time.time())) + '.wav'
        with wave.open(filename, 'wb') as wf:
            wf.setnchannels(self.CHANNELS)
            wf.setsampwidth(p.get_sample_size(self.FORMAT))
            wf.setframerate(self.RATE)
            wf.writeframes(b''.join(data))
        return filename

    def decode_phrase(self, wav_file):
        """Decode a recorded file and return the recognised words.

        NOTE(review): reconstructed from the call in run(); confirm.

        Args:
            wav_file: Path of the audio file to feed to the decoder.

        Returns:
            List of the words in the decoder's segmentation of the phrase.
        """
        self.decoder.start_utt()
        with open(wav_file, 'rb') as audio:
            while True:
                buf = audio.read(1024)
                if not buf:
                    break
                self.decoder.process_raw(buf, False, False)
        self.decoder.end_utt()
        return [seg.word for seg in self.decoder.seg()]

    def run(self):
        """Listen to the microphone forever, decoding each detected phrase.

        Calibrates the threshold first, then loops until interrupted with
        Ctrl-C. The audio stream and device are released on the way out.
        """
        self.setup_mic()

        p = pyaudio.PyAudio()
        stream = self._open_input_stream(p)

        audio2send = []
        rel = self.RATE / self.CHUNK  # chunks per second
        # Intensities of the most recent SILENCE_LIMIT seconds.
        slid_win = deque(maxlen=int(self.SILENCE_LIMIT * rel))
        # Audio from just before noise was detected, prepended to the phrase.
        prev_audio = deque(maxlen=int(self.PREV_AUDIO * rel))
        started = False
        print("Listening …")

        try:
            while True:
                cur_data = stream.read(self.CHUNK)
                slid_win.append(self._intensity(cur_data))

                if any(x > self.THRESHOLD for x in slid_win):
                    # Something in the window is loud: we are in a phrase.
                    if not started:
                        print("Starting recording of phrase")
                        started = True
                    audio2send.append(cur_data)
                elif started:
                    # The whole window went quiet: the phrase is over.
                    print("Finished recording, decoding phrase")
                    filename = self.save_speech(
                        list(prev_audio) + audio2send, p)
                    try:
                        r = self.decode_phrase(filename)
                        print("DETECTED: %s" % r)
                    finally:
                        # Removes temp audio file, even if decoding failed.
                        os.remove(filename)

                    # Reset all
                    started = False
                    slid_win = deque(maxlen=int(self.SILENCE_LIMIT * rel))
                    prev_audio = deque(maxlen=int(self.PREV_AUDIO * rel))
                    audio2send = []
                    print("Listening …")
                else:
                    # Idle: keep a short history for the next phrase start.
                    prev_audio.append(cur_data)
        except KeyboardInterrupt:
            # Ctrl-C is the only way out of the loop; treat it as a normal
            # stop rather than an error.
            pass
        finally:
            print("* Done listening")
            stream.close()
            p.terminate()
if __name__ == "__main__":
    # Start listening only when executed as a script, not when imported.
    sd = SpeechDetector()
    sd.run()
# (GitHub page footer) Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.