Created June 25, 2018 14:47
Speech to text with PocketSphinx for Python3
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
import os
import pyaudio
import wave
import audioop
from collections import deque
import time
import math

"""
Written by Sophie Li, 2016
http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
"""

class SpeechDetector:
    def __init__(self):
        # Microphone stream config.
        self.CHUNK = 1024  # CHUNKS of bytes to read each time from mic
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.RATE = 16000
        self.SILENCE_LIMIT = 1  # Silence limit in seconds. The max amount of seconds where
                                # only silence is recorded. When this time passes the
                                # recording finishes and the file is decoded
        self.PREV_AUDIO = 0.5  # Previous audio (in seconds) to prepend. When noise
                               # is detected, how much of previously recorded audio is
                               # prepended. This helps to prevent chopping the beginning
                               # of the phrase.
        self.THRESHOLD = 4500
        self.num_phrases = -1
        # These will need to be modified according to where the pocketsphinx folder is
        MODELDIR = "pocketsphinx/model"
        DATADIR = "pocketsphinx/test/data"
        # Create a decoder with certain model
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(MODELDIR, 'en-us/en-us'))
        config.set_string('-lm', os.path.join(MODELDIR, 'en-us/en-us.lm.bin'))
        config.set_string('-dict', os.path.join(MODELDIR, 'en-us/cmudict-en-us.dict'))
        # Creates decoder object for streaming data.
        self.decoder = Decoder(config)

    def setup_mic(self, num_samples=50):
        """ Gets average audio intensity of your mic sound. You can use it to get
            average intensities while you're talking and/or silent. The average
            is the average of the largest 20% of intensities recorded.
        """
        print ("Getting intensity values from mic.")
        p = pyaudio.PyAudio()
        stream = p.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)
        values = [math.sqrt(abs(audioop.avg(stream.read(self.CHUNK), 4)))
                  for x in range(num_samples)]
        values = sorted(values, reverse=True)
        r = sum(values[:int(num_samples * 0.2)]) / int(num_samples * 0.2)
        print (" Finished ")
        print (" Average audio intensity is %s " % r)
        stream.close()
        p.terminate()
        if r < 3000:
            self.THRESHOLD = 3500
        else:
            self.THRESHOLD = r + 100

    def save_speech(self, data, p):
        """
        Saves mic data to temporary WAV file. Returns filename of saved
        file
        """
        filename = 'output_'+str(int(time.time()))
        # writes data to WAV file
        data = b''.join(data)
        wf = wave.open(filename + '.wav', 'wb')
        wf.setnchannels(1)
        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wf.setframerate(16000)
        wf.writeframes(data)
        wf.close()
        return filename + '.wav'

    def decode_phrase(self, wav_file):
        self.decoder.start_utt()
        stream = open(wav_file, "rb")
        while True:
            buf = stream.read(1024)
            if buf:
                self.decoder.process_raw(buf, False, False)
            else:
                break
        self.decoder.end_utt()
        words = []
        [words.append(seg.word) for seg in self.decoder.seg()]
        return words

    def run(self):
        """
        Listens to Microphone, extracts phrases from it and calls pocketsphinx
        to decode the sound
        """
        self.setup_mic()
        # Open stream
        p = pyaudio.PyAudio()
        stream = p.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)
        print ("* Mic set up and listening. ")
        audio2send = []
        cur_data = ''  # current chunk of audio data
        rel = self.RATE/self.CHUNK
        slid_win = deque(maxlen=int(self.SILENCE_LIMIT * rel))
        # Prepend audio from 0.5 seconds before noise was detected
        prev_audio = deque(maxlen=int(self.PREV_AUDIO * rel))
        started = False
        while True:
            cur_data = stream.read(self.CHUNK)
            slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
            if sum([x > self.THRESHOLD for x in slid_win]) > 0:
                if started == False:
                    print ("Starting recording of phrase")
                    started = True
                audio2send.append(cur_data)
            elif started:
                print ("Finished recording, decoding phrase")
                filename = self.save_speech(list(prev_audio) + audio2send, p)
                r = self.decode_phrase(filename)
                print ("DETECTED: %s" % r)
                # Removes temp audio file
                os.remove(filename)
                # Reset all
                started = False
                slid_win = deque(maxlen=int(self.SILENCE_LIMIT * rel))
                prev_audio = deque(maxlen=int(0.5 * rel))
                audio2send = []
                print ("Listening ...")
            else:
                prev_audio.append(cur_data)
        print ("* Done listening")
        stream.close()
        p.terminate()

if __name__ == "__main__":
    sd = SpeechDetector()
    sd.run()
This version will work with Python 3 on a Raspberry Pi running Raspbian Stretch, with a USB mic as ALSA audio device 2.

Install the dependencies first:
sudo apt-get install swig libpulse-dev
sudo pip3 install pocketsphinx
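The device index (2 here) depends entirely on your ALSA setup. A quick way to confirm which PyAudio index your USB mic actually has is to list the capture-capable devices first. The helper below is not part of the original gist, just a minimal sketch using the standard pyaudio device-query calls:

# find_mic_index.py - helper (not in the original gist) to locate the PyAudio
# index of a USB microphone before setting MY_MIC_ALSA_DEV_INDEX in the script below.
import pyaudio

p = pyaudio.PyAudio()
for i in range(p.get_device_count()):
    info = p.get_device_info_by_index(i)
    if info.get('maxInputChannels', 0) > 0:  # only capture-capable devices
        print(i, info['name'], int(info['defaultSampleRate']), "Hz")
p.terminate()

Whatever index it prints for your mic is the value to pass as input_dev to SpeechDetector at the bottom of the full script: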
#!/usr/bin/env python3
from pocketsphinx.pocketsphinx import *
from sphinxbase.sphinxbase import *
import os
import pyaudio
import wave
import audioop
from collections import deque
import time
import math
import sys
import traceback  # used by the top-level exception handler below
"""
Written by Sophie Li, 2016
http://blog.justsophie.com/python-speech-to-text-with-pocketsphinx/
"""
class SpeechDetector:
    def __init__(self,input_dev=0):
        # Microphone stream config.
        # self.CHUNK = 1024  # CHUNKS of bytes to read each time from mic
        self.CHUNK = 3072  # CHUNKS of bytes to read each time from mic
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = 1
        self.CAPTURE_DEV_INDEX = input_dev
        # Get sample rate from capture device
        p = pyaudio.PyAudio()
        self.RATE = int(p.get_device_info_by_index(self.CAPTURE_DEV_INDEX)['defaultSampleRate'])
        print("\n\n*** Mic default sample rate:",self.RATE)
        self.RATE = 16000  # Just use 16k for pocketsphinx
        self.SILENCE_LIMIT = 1  # Silence limit in seconds. The max amount of seconds where
                           # only silence is recorded. When this time passes the
                           # recording finishes and the file is decoded
        self.PREV_AUDIO = 0.5  # Previous audio (in seconds) to prepend. When noise
                          # is detected, how much of previously recorded audio is
                          # prepended. This helps to prevent chopping the beginning
                          # of the phrase.
        # self.THRESHOLD = 4500
        self.THRESHOLD = 2500
        self.num_phrases = -1
        # These will need to be modified according to where the pocketsphinx folder is
        # MODELDIR = "../../tools/pocketsphinx/model"
        # DATADIR = "../../tools/pocketsphinx/test/data"
        # Create a decoder with certain model
        # config = Decoder.default_config()
        # config.set_string('-hmm', os.path.join(MODELDIR, 'en-us/en-us'))
        # config.set_string('-lm', os.path.join(MODELDIR, 'en-us/en-us.lm.bin'))
        # config.set_string('-dict', os.path.join(MODELDIR, 'en-us/cmudict-en-us.dict'))
        MODELDIR = "/usr/local/lib/python3.5/dist-packages/pocketsphinx/model"
        DATADIR = "/usr/local/lib/python3.5/dist-packages/pocketsphinx/data"
        # Create a decoder with en-us model
        config = Decoder.default_config()
        config.set_string('-hmm', os.path.join(MODELDIR, 'en-us'))
        config.set_string('-lm', os.path.join(MODELDIR, 'en-us.lm.bin'))
        config.set_string('-dict', os.path.join(MODELDIR, 'cmudict-en-us.dict'))
        config.set_string('-logfn', 'justsophie2.out')
        config.set_string('-samprate', str(int(self.RATE)))
        # Creates decoder object for streaming data.
        self.decoder = Decoder(config)
    def setup_mic(self, num_samples=50):
        """ Gets average audio intensity of your mic sound. You can use it to get
            average intensities while you're talking and/or silent. The average
            is the average of the largest 20% of intensities recorded.
        """
        print("Getting intensity values from mic.")
        p = pyaudio.PyAudio()
        stream = p.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)
        values = [math.sqrt(abs(audioop.avg(stream.read(self.CHUNK), 4)))
                  for x in range(num_samples)]
        values = sorted(values, reverse=True)
        r = sum(values[:int(num_samples * 0.2)]) / int(num_samples * 0.2)
        print(" Finished ")
        print(" Average audio intensity is ", r)
        stream.close()
        p.terminate()
        if r < 3000:
            # self.THRESHOLD = 3500
            self.THRESHOLD = int(r)
        else:
            self.THRESHOLD = int(r + 100)
        print("Audio Threshold set to :",self.THRESHOLD)
    def save_speech(self, data, p):
        """
        Saves mic data to temporary WAV file. Returns filename of saved
        file
        """
        filename = 'output_'+str(int(time.time()))
        # writes data to WAV file
        data = b''.join(data)
        wf = wave.open(filename + '.wav', 'wb')
        wf.setnchannels(1)
        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wf.setframerate(self.RATE)  # ALAN - changed to value from setup_mic
        wf.writeframes(data)
        wf.close()
        return filename + '.wav'
    def decode_phrase(self, wav_file):
        self.decoder.start_utt()
        stream = open(wav_file, "rb")
        while True:
          buf = stream.read(1024)
          if buf:
            self.decoder.process_raw(buf, False, False)
          else:
            break
        self.decoder.end_utt()
        words = []
        [words.append(seg.word) for seg in self.decoder.seg()]
        return words
    def run(self):
        """
        Listens to Microphone, extracts phrases from it and calls pocketsphinx
        to decode the sound
        """
        self.setup_mic()
        #Open stream
        p = pyaudio.PyAudio()
        stream = p.open(format=self.FORMAT,
                        channels=self.CHANNELS,
                        rate=self.RATE,
                        input=True,
                        frames_per_buffer=self.CHUNK)
        print("* Mic set up and listening. ")
        audio2send = []
        # cur_data = ''  # current chunk of audio data
        cur_data = ""  # current chunk of audio data
        rel = self.RATE/self.CHUNK
        #slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
        slid_win = deque(maxlen=int(self.SILENCE_LIMIT * rel))
        #Prepend audio from 0.5 seconds before noise was detected
        # prev_audio = deque(maxlen=self.PREV_AUDIO * rel)
        prev_audio = deque(maxlen=int(self.PREV_AUDIO * rel))
        started = False
        while True:
            # cur_data = stream.read(self.CHUNK)
            cur_data = stream.read(self.CHUNK,exception_on_overflow=False)
            slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))
            if sum([x > self.THRESHOLD for x in slid_win]) > 0:
                if started == False:
                    print("Starting recording of phrase")
                    started = True
                audio2send.append(cur_data)
            elif started:
                print("Finished recording, decoding phrase")
                filename = self.save_speech(list(prev_audio) + audio2send, p)
                r = self.decode_phrase(filename)
                print("DETECTED: ", r)
                # Removes temp audio file
                os.remove(filename)
                # Reset all
                started = False
                # slid_win = deque(maxlen=self.SILENCE_LIMIT * rel)
                # prev_audio = deque(maxlen=0.5 * rel)
                slid_win = deque(maxlen=int(self.SILENCE_LIMIT * rel))
                prev_audio = deque(maxlen=int(0.5 * rel))
                audio2send = []
                print("\n\n*** Listening ...")
            else:
                prev_audio.append(cur_data)
        print("* Done listening")
        stream.close()
        p.terminate()
if __name__ == "__main__":
    MY_MIC_ALSA_DEV_INDEX = 2
    sd = SpeechDetector(input_dev=MY_MIC_ALSA_DEV_INDEX)
    try:
        sd.run()
    except (KeyboardInterrupt):
        print('\nGoodbye.')
        sys.exit()
    except Exception as e:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        traceback.print_exception(exc_type, exc_value, exc_traceback,
                                  limit=2,
                                  file=sys.stdout)
        sys.exit()
    
  