Google Cloud TTS Streaming
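A Node.js/TypeScript example that streams text into Google Cloud Text-to-Speech's bidirectional streamingSynthesize API and writes the returned LINEAR16 audio chunks to a WAV file (output.wav).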
const textToSpeech = require('@google-cloud/text-to-speech');
const fs = require('fs');

async function streamTextToSpeech(texts: string[]) {
  const client = new textToSpeech.TextToSpeechClient();
  const ttsStream = client.streamingSynthesize();

  // Write the response to a file; replace with your desired output stream.
  const writeStream = fs.createWriteStream('output.wav');

  // The audio data is headerless LINEAR16 audio with a sample rate of 24000.
  // https://github.com/googleapis/google-cloud-node/blob/main/packages/google-cloud-texttospeech/protos/google/cloud/texttospeech/v1/cloud_tts.proto#L352
  const sampleRate = 24000;
  const numChannels = 1; // Mono audio
  const byteRate = sampleRate * numChannels * 2; // 16-bit samples = 2 bytes each
  // dataSize is 0 here because the total audio length is unknown while streaming.
  const header = createWavHeader(sampleRate, numChannels, byteRate, 0);
  writeStream.write(header);
  // Handle the TTS response stream.
  ttsStream.on('data', (response: any) => {
    if (response.audioContent) {
      writeStream.write(response.audioContent);
    }
  });

  ttsStream.on('error', (err: any) => {
    console.error('Error during Text-to-Speech:', err);
    writeStream.end();
  });

  ttsStream.on('end', () => {
    console.log('Finished streaming Text-to-Speech');
    writeStream.end();
  });
  // Note: Only Journey voices support streaming for now.
  ttsStream.write({streamingConfig: {voice: {name: 'en-US-Journey-O', languageCode: 'en-US', ssmlGender: 'NEUTRAL'}}});
  // Stream the texts to the TTS stream; replace with your actual streaming texts.
  for (const text of texts) {
    ttsStream.write({input: {text: text}});
  }

  ttsStream.end();
}
function writeString(view: DataView, offset: number, str: string) {
  for (let i = 0; i < str.length; i++) {
    view.setUint8(offset + i, str.charCodeAt(i));
  }
}

function createWavHeader(sampleRate: number, numChannels: number, byteRate: number, dataSize: number): Uint8Array {
  const header = new ArrayBuffer(44);
  const view = new DataView(header);

  // RIFF chunk descriptor
  writeString(view, 0, 'RIFF');
  view.setUint32(4, 36 + dataSize, true); // File size - 8
  writeString(view, 8, 'WAVE');

  // fmt sub-chunk
  writeString(view, 12, 'fmt ');
  view.setUint32(16, 16, true); // Subchunk1Size (16 for PCM)
  view.setUint16(20, 1, true); // AudioFormat (1 for PCM)
  view.setUint16(22, numChannels, true); // NumChannels
  view.setUint32(24, sampleRate, true); // SampleRate
  view.setUint32(28, byteRate, true); // ByteRate (SampleRate * NumChannels * BitsPerSample/8)
  view.setUint16(32, numChannels * 2, true); // BlockAlign (NumChannels * BitsPerSample/8)
  view.setUint16(34, 16, true); // BitsPerSample

  // data sub-chunk
  writeString(view, 36, 'data');
  view.setUint32(40, dataSize, true); // Subchunk2Size
  return new Uint8Array(header);
}
// Example usage
streamTextToSpeech(['Hello,', ' world!']).then(() => {
  console.log('Success');
});
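Because the total audio length is unknown while streaming, the WAV header above is written with both size fields set to 0. Many players ignore this, but if you need a well-formed file, the sketch below shows one way to patch the sizes after the stream finishes. It is not part of the original gist; finalizeWavHeader is a hypothetical helper built only on Node's built-in fs module.

const fs = require('fs'); // drop this require if pasting into the gist above

// Patch the RIFF and data chunk sizes in an already-written WAV file.
// The 44-byte header produced by createWavHeader() carries 0 for both sizes,
// so we overwrite the 32-bit fields at offsets 4 and 40 once the file is complete.
function finalizeWavHeader(filePath: string): void {
  const fileSize = fs.statSync(filePath).size;
  const dataSize = Math.max(fileSize - 44, 0); // audio bytes after the header

  const riffSize = Buffer.alloc(4);
  riffSize.writeUInt32LE(fileSize - 8, 0);
  const chunkSize = Buffer.alloc(4);
  chunkSize.writeUInt32LE(dataSize, 0);

  const fd = fs.openSync(filePath, 'r+');
  try {
    fs.writeSync(fd, riffSize, 0, 4, 4);   // offset 4: overall RIFF chunk size
    fs.writeSync(fd, chunkSize, 0, 4, 40); // offset 40: data sub-chunk size
  } finally {
    fs.closeSync(fd);
  }
}

// Example usage: run once the write stream has flushed everything to disk, e.g.
// writeStream.on('finish', () => finalizeWavHeader('output.wav'));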
very helpful, thank you so much.