Last active
December 28, 2023 15:14
-
-
Save lukehoban/0ee5c1bef438dc5bd7cb to your computer and use it in GitHub Desktop.
Project Oxford Speech APIs Node.js Sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fs = require('fs'); | |
var util = require('util'); | |
var request = require('request'); | |
// ==== Demo driver ====
// Round-trips a sentence through the service: text -> speech (test.wav) -> text.
//
// NOTE(review): a real API key should not live in source control. The secret
// can be supplied via the OXFORD_CLIENT_SECRET environment variable; the
// literal below is kept only as a backward-compatible fallback.
var clientId = 'test-app'; // Can be anything
var clientSecret = process.env.OXFORD_CLIENT_SECRET || 'f6f0bfec08274b8790520a9079b808af'; // API key from Azure marketplace
var str = 'This is a cool demo to call Microsoft text to speach service in Node.js.';

console.log('Converting from text -> speech -> text.');
console.log('Input text: "' + str + '"');

getAccessToken(clientId, clientSecret, function(err, accessToken) {
  if(err) return console.log(err);
  console.log('Got access token: ' + accessToken);
  textToSpeech(str, 'test.wav', accessToken, function(err) {
    if(err) return console.log(err);
    console.log('Wrote out: ' + 'test.wav');
    speechToText('test.wav', accessToken, function(err, res) {
      if(err) return console.log(err);
      // The parsed response may carry no results; guard before indexing so
      // we report the payload instead of throwing a TypeError from a callback.
      var results = res && res.results;
      if(!results || !results.length) {
        return console.log('No recognition results returned: ' + JSON.stringify(res));
      }
      console.log('Confidence ' + results[0].confidence + ' for: "' + results[0].lexical + '"');
    });
  });
});
// ==== Helpers ==== | |
/**
 * Requests an access token from the token endpoint using the
 * client-credentials grant.
 *
 * @param {string} clientId - Client identifier sent as `client_id`.
 * @param {string} clientSecret - API key sent as `client_secret`.
 * @param {function(?Error, string=)} callback - Called exactly once with
 *   `(err)` on failure or `(null, accessToken)` on success.
 */
function getAccessToken(clientId, clientSecret, callback) {
  request.post({
    url: 'https://oxford-speech.cloudapp.net/token/issueToken',
    // The `form` option URL-encodes its values itself, so raw values are
    // passed here; pre-encoding them would double-encode special characters.
    form: {
      'grant_type': 'client_credentials',
      'client_id': clientId,
      'client_secret': clientSecret,
      'scope': 'https://speech.platform.bing.com'
    }
  }, function(err, resp, body) {
    if(err) return callback(err);

    // Only the parsing lives inside the try block. Invoking `callback` inside
    // it would catch exceptions thrown by the consumer and call it twice.
    var accessToken;
    try {
      accessToken = JSON.parse(body).access_token;
    } catch(e) {
      return callback(e);
    }

    if(!accessToken) {
      var status = resp && resp.statusCode ? ' (HTTP ' + resp.statusCode + ')' : '';
      var failure = new Error('Token request failed' + status + ': ' + body);
      failure.body = body; // keep the raw response available to callers
      return callback(failure);
    }
    callback(null, accessToken);
  });
}
/**
 * Synthesizes `text` to speech and writes the returned audio to `filename`.
 *
 * @param {string} text - Plain text to speak; it is XML-escaped before being
 *   embedded in the SSML document.
 * @param {string} filename - Path the audio response body is written to.
 * @param {string} accessToken - Bearer token sent in the Authorization header.
 * @param {function(?Error)} callback - Called with `(err)` on failure or
 *   `(null)` once the file has been written.
 */
function textToSpeech(text, filename, accessToken, callback) {
  var ssmlTemplate = "<speak version='1.0' xml:lang='en-us'><voice xml:lang='%s' xml:gender='%s' name='%s'>%s</voice></speak>";

  // Escape XML metacharacters so input such as "Tom & Jerry" or "<b>" cannot
  // break, or inject markup into, the SSML document.
  var escapedText = String(text)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;');

  request.post({
    // HTTPS so the bearer token is not sent in clear text.
    url: 'https://speech.platform.bing.com/synthesize',
    body: util.format(ssmlTemplate, 'en-US', 'Female', 'Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)', escapedText),
    encoding: null, // receive the response body as a Buffer, not a decoded string
    headers: {
      'Authorization': 'Bearer ' + accessToken,
      'Content-Type' : 'application/ssml+xml',
      'X-Microsoft-OutputFormat' : 'riff-16khz-16bit-mono-pcm',
      'X-Search-AppId': '07D3234E49CE426DAA29772419F436CA',
      'X-Search-ClientID': '1ECFAE91408841A480F00935DC390960'
    }
  }, function(err, resp, body) {
    if(err) return callback(err);

    // Do not write an error payload to disk as if it were audio.
    var status = resp && resp.statusCode;
    if(status && (status < 200 || status >= 300)) {
      return callback(new Error('Speech synthesis failed (HTTP ' + status + '): ' + String(body)));
    }

    fs.writeFile(filename, body, 'binary', function (err) {
      if (err) return callback(err);
      callback(null);
    });
  });
}
/**
 * Sends the audio file at `filename` to the recognition endpoint and hands
 * the parsed JSON response to `callback`.
 *
 * @param {string} filename - Path of the audio file to upload.
 * @param {string} accessToken - Bearer token sent in the Authorization header.
 * @param {function(?Error, Object=)} callback - Called exactly once with
 *   `(err)` on failure or `(null, parsedResponse)` on success.
 */
function speechToText(filename, accessToken, callback) {
  fs.readFile(filename, function(err, waveData) {
    if(err) return callback(err);
    request.post({
      url: 'https://speech.platform.bing.com/recognize/query',
      qs: {
        'scenarios': 'ulm',
        'appid': 'D4D52672-91D7-4C74-8AD8-42B1D98141A5', // This magic value is required
        'locale': 'en-US',
        'device.os': 'wp7',
        'version': '3.0',
        'format': 'json',
        'requestid': '1d4b6030-9099-11e0-91e4-0800200c9a66', // can be anything
        'instanceid': '1d4b6030-9099-11e0-91e4-0800200c9a66' // can be anything
      },
      body: waveData,
      headers: {
        'Authorization': 'Bearer ' + accessToken,
        'Content-Type': 'audio/wav; samplerate=16000',
        'Content-Length' : waveData.length
      }
    }, function(err, resp, body) {
      if(err) return callback(err);

      // Report HTTP failures as errors instead of as recognition results.
      var status = resp && resp.statusCode;
      if(status && (status < 200 || status >= 300)) {
        return callback(new Error('Speech recognition failed (HTTP ' + status + '): ' + String(body)));
      }

      // Only the parsing lives inside the try block. Invoking `callback`
      // inside it would catch consumer exceptions and call it a second time.
      var parsed;
      try {
        parsed = JSON.parse(body);
      } catch(e) {
        return callback(e);
      }
      callback(null, parsed);
    });
  });
}
@lukehoban From where did you get 'X-Search-AppId': '07D3234E49CE426DAA29772419F436CA' and 'X-Search-ClientID': '1ECFAE91408841A480F00935DC390960'?
Hi,
The voice recognition process works fine, but only if my sentence contains at least 5 or 6 syllables. Is there any limitation that prevents shorter sentences from being recognized?
Thanks
From where did you get 'X-Search-AppId': '' and 'X-Search-ClientID': ''?
@miparnisari Hello, where did you get 'X-Search-AppId': '' and 'X-Search-ClientID': '' from?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@myadhdoutlet did you fix the issue with the static? I had the same issue and I fixed it adding
encoding: null
to the request. This way you get the response as a Buffer. See https://github.com/palmerabollo/bingspeech-api-client/blob/master/src/client.ts#L102