Skip to content

Instantly share code, notes, and snippets.

Created August 27, 2024 01:56
Show Gist options
  • Save qtangs/adda94fa7c1a273b3dc1010e705550c3 to your computer and use it in GitHub Desktop.
Save qtangs/adda94fa7c1a273b3dc1010e705550c3 to your computer and use it in GitHub Desktop.
Google Cloud TTS Streaming
const textToSpeech = require('@google-cloud/text-to-speech');
const fs = require('fs');
/**
 * Streams text fragments through Google Cloud Text-to-Speech and writes the
 * synthesized audio to `output.wav` in the current working directory.
 *
 * The returned promise resolves as soon as every request has been queued on
 * the synthesis stream. Audio chunks keep arriving, and the output file is
 * closed, asynchronously after that, once the service ends the stream.
 * Stream errors are logged rather than thrown.
 *
 * @param texts Text fragments, sent in order as one continuous utterance.
 */
async function streamTextToSpeech(texts: string[]) {
  const client = new textToSpeech.TextToSpeechClient();
  const ttsStream = client.streamingSynthesize();

  // Write the response to a file, replace with your desired output stream
  const writeStream = fs.createWriteStream('output.wav');

  // The audio data is headerless LINEAR16 audio with a sample rate of 24000.
  const sampleRate = 24000;
  const numChannels = 1; // Mono audio
  const byteRate = sampleRate * numChannels * 2; // 2 bytes per 16-bit sample

  // The total audio length is unknown while streaming, so the header declares
  // a data size of 0. It must be written before any audio chunk.
  const header = createWavHeader(sampleRate, numChannels, byteRate, 0);
  writeStream.write(header);

  // Handle the TTS response stream
  ttsStream.on('data', (response: { audioContent?: Uint8Array | string | null }) => {
    if (response.audioContent) {
      writeStream.write(response.audioContent);
    }
  });

  ttsStream.on('error', (err: unknown) => {
    console.error('Error during Text-to-Speech:', err);
    // Close the file so the descriptor is not leaked on failure.
    writeStream.end();
  });

  ttsStream.on('end', () => {
    console.log('Finished streaming Text-to-Speech');
    writeStream.end();
  });

  // The first request on the stream carries the configuration only.
  // Note: Only Journey voices support streaming for now.
  ttsStream.write({streamingConfig: {voice: { name: 'en-us-Journey-O', languageCode: 'en-US', ssmlGender: "NEUTRAL" }}});

  // Stream the texts to TTS stream, replace with actual streaming texts
  for (const text of texts) {
    ttsStream.write({input: {text: text}});
  }

  // Half-close the request side so the service can finish and emit 'end'.
  ttsStream.end();
}
/**
 * Writes `str` into `view` one byte per UTF-16 code unit, starting at `offset`.
 *
 * Intended for short ASCII tags such as RIFF chunk identifiers: `setUint8`
 * keeps only the low 8 bits, so code units above 0xFF are truncated.
 *
 * @param view   Destination view; must have at least `offset + str.length` bytes.
 * @param offset Byte offset in `view` at which the first character is stored.
 * @param str    Text to store.
 * @throws RangeError if the write would run past the end of `view`.
 */
function writeString(view: DataView, offset: number, str: string) {
  for (let i = 0; i < str.length; i++) {
    view.setUint8(offset + i, str.charCodeAt(i));
  }
}
/**
 * Builds a 44-byte RIFF/WAVE header describing 16-bit PCM audio.
 *
 * All multi-byte fields are written little-endian. The sample width is fixed
 * at 16 bits, so `byteRate` should equal `sampleRate * numChannels * 2`.
 *
 * @param sampleRate  Samples per second, per channel.
 * @param numChannels Number of interleaved channels (1 = mono).
 * @param byteRate    Bytes of audio per second.
 * @param dataSize    Size in bytes of the PCM payload that follows the header.
 * @returns The header bytes, ready to be written ahead of the raw PCM data.
 */
function createWavHeader(sampleRate: number, numChannels: number, byteRate: number, dataSize: number): Uint8Array {
  const bitsPerSample = 16;
  const header = new ArrayBuffer(44);
  const view = new DataView(header);

  // RIFF chunk descriptor
  writeString(view, 0, 'RIFF');
  view.setUint32(4, 36 + dataSize, true); // File size - 8
  writeString(view, 8, 'WAVE');

  // fmt sub-chunk
  writeString(view, 12, 'fmt ');
  view.setUint32(16, 16, true); // Subchunk1Size (16 for PCM)
  view.setUint16(20, 1, true); // AudioFormat (1 for PCM)
  view.setUint16(22, numChannels, true); // NumChannels
  view.setUint32(24, sampleRate, true); // SampleRate
  view.setUint32(28, byteRate, true); // ByteRate (SampleRate * NumChannels * BitsPerSample/8)
  view.setUint16(32, numChannels * (bitsPerSample / 8), true); // BlockAlign (NumChannels * BitsPerSample/8)
  view.setUint16(34, bitsPerSample, true); // BitsPerSample

  // data sub-chunk
  writeString(view, 36, 'data');
  view.setUint32(40, dataSize, true); // Subchunk2Size

  return new Uint8Array(header);
}
// Example usage: send two text fragments through the streaming synthesizer.
// NOTE(review): the body of this `.then` callback and its closing `});` are
// missing from this copy of the file; restore them from the original gist.
streamTextToSpeech(['Hello,', ' world!']).then((data) => {
Copy link

very helpful, thank you so much.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment