-
-
Save tyspa1/9162330 to your computer and use it in GitHub Desktop.
Use Google Speech API with Python using Pyaudio and flac for Windows
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pyaudio | |
import wave | |
import audioop | |
from collections import deque | |
import os | |
import urllib2 | |
import urllib | |
import time | |
import math | |
# Language code sent to the recognition service.
LANG_CODE = 'en-US'

# Recognition endpoint, with the language code filled in.
GOOGLE_SPEECH_URL = (
    'https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&pfilter=2&lang=%s&maxresults=6'
    % LANG_CODE
)

# Path to flac.exe, used for the WAV-to-FLAC conversion.
# NOTE(review): this is a Windows path; on Linux this would presumably be
# the `flac` command instead.
FLAC_CONV = 'c:\\FLAC\\flac.exe '

# Microphone stream configuration.
# Amount of audio requested from the mic on every read; also passed to the
# stream as frames_per_buffer.
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000

# Intensity threshold that separates silence from sound: a measured
# intensity lower than THRESHOLD is treated as silence.
THRESHOLD = 2500

# Silence limit in seconds: the maximum amount of time during which only
# silence is recorded. Once it has passed, the recording of the current
# phrase finishes and the audio is delivered.
SILENCE_LIMIT = 1

# Seconds of previously recorded audio to prepend once sound is detected.
# This helps to avoid chopping off the beginning of the phrase.
PREV_AUDIO = 0.5
def audio_int(num_samples=50):
    """Measure the average audio intensity picked up by the microphone.

    Reads ``num_samples`` chunks from the mic, computes one intensity value
    per chunk and averages the 20% largest of them. Run it while talking
    and while silent to pick a sensible value for THRESHOLD.

    Args:
        num_samples: number of chunks (of CHUNK size) to read from the mic.

    Returns:
        The average of the largest 20% of the measured intensities, as a
        float. At least one value is always averaged, so small values of
        ``num_samples`` do not divide by zero; 0.0 is returned when no
        chunk was read at all.
    """
    print("Getting intensity values from mic.")
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            # Same per-chunk intensity measure that listen_for_speech
            # compares against its threshold.
            values = [math.sqrt(abs(audioop.avg(stream.read(CHUNK), 4)))
                      for _ in range(num_samples)]
        finally:
            # Release the stream even if a read fails.
            stream.close()
    finally:
        p.terminate()

    values.sort(reverse=True)
    # int(num_samples * 0.2) is 0 for fewer than 5 samples; clamp to 1 so
    # the division below is always defined.
    top_count = max(1, int(num_samples * 0.2))
    r = sum(values[:top_count]) / float(top_count)
    print(" Finished ")
    # Two spaces before the value reproduce the output of the former
    # comma-separated print statement.
    print(" Average audio intensity is  %s" % (r,))
    return r
def listen_for_speech(threshold=THRESHOLD, num_phrases=1):
    """Listen to the microphone and transcribe the phrases it hears.

    A "phrase" is sound surrounded by silence: recording starts once a
    chunk's intensity exceeds ``threshold`` and ends after the sliding
    window covering SILENCE_LIMIT seconds holds no value above it. Each
    finished phrase is written to a temporary WAV file with save_speech,
    sent to the speech-to-text service with stt_google_wav, and the
    temporary file is removed afterwards.

    Args:
        threshold: intensity above which a chunk counts as sound rather
            than silence.
        num_phrases: how many phrases to process before returning; -1 keeps
            listening indefinitely and prints each response instead of
            collecting it.

    Returns:
        A list with one entry per processed phrase, each entry being the
        value returned by stt_google_wav for that phrase.
    """
    # Whole chunks read per second; floor division keeps this an integer.
    chunks_per_sec = RATE // CHUNK
    # deque sizes must be whole numbers of chunks.
    silence_len = int(SILENCE_LIMIT * chunks_per_sec)
    prev_len = int(PREV_AUDIO * chunks_per_sec)

    p = pyaudio.PyAudio()
    stream = None
    response = []
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        print("* Listening mic. ")

        audio2send = []
        # Intensities of the most recent SILENCE_LIMIT seconds of audio.
        slid_win = deque(maxlen=silence_len)
        # Audio from just before sound was detected; prepended to the
        # phrase so that its beginning is not cut off.
        prev_audio = deque(maxlen=prev_len)
        started = False
        n = num_phrases

        while num_phrases == -1 or n > 0:
            cur_data = stream.read(CHUNK)
            # NOTE(review): samples are averaged with width 4 although
            # FORMAT is paInt16; THRESHOLD is presumably calibrated against
            # this measure, so it is left as is.
            slid_win.append(math.sqrt(abs(audioop.avg(cur_data, 4))))

            if any(x > threshold for x in slid_win):
                # Sound somewhere in the window: keep recording.
                started = True
                audio2send.append(cur_data)
            elif started:
                # The silence limit was reached: finish capture and deliver.
                filename = save_speech(list(prev_audio) + audio2send, p)
                try:
                    r = stt_google_wav(filename)
                finally:
                    # Remove the temp file even if the request fails.
                    # Comment this out to review the recording.
                    os.remove(filename)

                if num_phrases == -1:
                    print("Response %s" % (r,))
                else:
                    response.append(r)

                # Reset all state for the next phrase.
                started = False
                slid_win = deque(maxlen=silence_len)
                prev_audio = deque(maxlen=prev_len)
                audio2send = []
                n -= 1
            else:
                # Still silent: remember the chunk in case speech follows.
                prev_audio.append(cur_data)

        print("* Done recording")
    finally:
        # Release the audio device on every exit path.
        if stream is not None:
            stream.close()
        p.terminate()
    return response
def save_speech(data, p):
    """Save recorded mic data to a temporary WAV file.

    The WAV header is written from the same CHANNELS, FORMAT and RATE
    settings the microphone stream is opened with, so the file stays
    consistent with the recording if those settings change.

    Args:
        data: sequence of raw audio chunks, in recording order.
        p: PyAudio instance, used to look up the sample width of FORMAT.

    Returns:
        The name of the written file, ``output_<unix time>.wav``, relative
        to the current working directory.
    """
    filename = 'output_' + str(int(time.time())) + '.wav'
    # Join before opening so that bad input does not leave an empty file.
    frames = b''.join(data)
    wf = wave.open(filename, 'wb')
    try:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(p.get_sample_size(FORMAT))
        wf.setframerate(RATE)
        wf.writeframes(frames)
    finally:
        # Close the file even if writing the frames fails.
        wf.close()
    return filename
def stt_google_wav(audio_fname):
    """Send an audio file to Google's speech-to-text service.

    Files whose name does not end in ``.flac`` are first converted with the
    FLAC converter configured in FLAC_CONV; the converted file is removed
    again once the request has been handled.

    Args:
        audio_fname: path of the audio file to transcribe.

    Returns:
        The ``hypotheses`` entry of the service's JSON response, or None
        when the request fails or the response cannot be parsed.

    Raises:
        OSError: if the FLAC converter cannot be started.
        IOError: if the (converted) FLAC file cannot be read.
    """
    import json
    import subprocess

    filename = audio_fname
    del_flac = False
    # Decide by extension, not by substring, so names that merely contain
    # "flac" (e.g. "flac_test.wav") are still converted.
    if not filename.lower().endswith('.flac'):
        del_flac = True
        # Pass the file name as its own argument instead of building a
        # shell command line, so spaces or shell metacharacters in it are
        # harmless.
        subprocess.call(FLAC_CONV.split() + [filename])
        # The converter is expected to write its output next to the input
        # with a .flac extension; splitext copes with dots elsewhere in
        # the path.
        filename = os.path.splitext(filename)[0] + '.flac'

    with open(filename, 'rb') as f:
        flac_cont = f.read()

    # Headers. A common Chromium (Linux) User-Agent.
    # NOTE(review): the sample rate is hard-coded here; it agrees with RATE
    # only while RATE stays 16000.
    hrs = {"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.63 Safari/535.7",
           'Content-type': 'audio/x-flac; rate=16000'}
    req = urllib2.Request(GOOGLE_SPEECH_URL, data=flac_cont, headers=hrs)

    try:
        conn = urllib2.urlopen(req)
        try:
            response = conn.read()
        finally:
            conn.close()
        # The body is parsed with json.loads rather than eval so that text
        # received from the network is never executed as code.
        res = json.loads(response)['hypotheses']
    except Exception:
        print("Couldn't parse service response")
        res = None
    finally:
        if del_flac:
            os.remove(filename)  # Remove temp file
    return res
# Script entry point: record one phrase and print its best transcription.
# Alternatives: call stt_google_wav('hello.flac') to transcribe an existing
# audio file, or audio_int() to measure your mic levels.
if __name__ == '__main__':
    data = listen_for_speech()  # listen to mic.
    try:
        # First phrase, first hypothesis.
        print(data[0][0]['utterance'])
    except (IndexError, KeyError, TypeError):
        # No phrase, no hypotheses, a failed request (None), or a
        # hypothesis without an 'utterance' entry.
        print("No Input recieved.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment