Transcription Server
import speech, { SpeechClient } from '@google-cloud/speech';
import { google } from '@google-cloud/speech/build/protos/protos';
import * as pumpify from 'pumpify';
import Long from 'long'; // protobuf int64 fields may arrive as Long values
import chalk from 'chalk';
import { Socket } from 'socket.io';

// Lazily-created client, shared across stream restarts.
let speechClient: SpeechClient | null = null;
class SpeechToTextUtils {
  recognizeStream!: pumpify | null;
  resultEndTime = 0;
  isFinalEndTime = 0;
  finalRequestEndTime = 0;
  bridgingOffset = 0;
  // Google ends a streaming request after ~5 minutes; restart just before that.
  streamingLimit = 290000; // ms
  restartCounter = 0;
  lastTranscriptWasFinal = false;
  audioInput: DataView[] = [];
  lastAudioInput: DataView[] = [];
  newStream = true;
  socket!: Socket;
  request!: google.cloud.speech.v1.IStreamingRecognitionConfig | undefined;
  restartTimeout: NodeJS.Timeout | undefined;

  set _socket(value: Socket) {
    this.socket = value;
  }

  set _request(value: google.cloud.speech.v1.IStreamingRecognitionConfig) {
    this.request = value;
  }
  startRecognitionStream() {
    this.audioInput = [];
    if (!speechClient) {
      speechClient = new speech.SpeechClient(); // Creates a client
    }
    this.recognizeStream = speechClient
      .streamingRecognize(this.request)
      .on('error', (err) => {
        console.error('Error while processing audio: ' + err);
        this.socket.emit('googleCloudStreamError', err);
        this.stopRecognitionStream();
      })
      .on('data', this.speechCallback.bind(this));

    // Schedule a restart before Google's streaming limit is reached.
    this.restartTimeout = setTimeout(
      this.restartStream.bind(this),
      this.streamingLimit
    );
  }
  speechCallback(stream: google.cloud.speech.v1.StreamingRecognizeResponse) {
    // Guard against empty responses. Use explicit null checks so a legitimate
    // value of 0 for seconds/nanos is not rejected, and do NOT require
    // isFinal here, or the interim-result branch below could never run.
    if (
      stream.results &&
      stream.results[0] &&
      stream.results[0].resultEndTime &&
      stream.results[0].resultEndTime.nanos != null &&
      stream.results[0].resultEndTime.seconds != null &&
      stream.results[0].alternatives
    ) {
      // Convert the API's result end time from seconds + nanoseconds to
      // milliseconds. The protobuf int64 `seconds` field may be delivered
      // as a string, a Long, or a plain number depending on the runtime.
      let seconds: number;
      if (typeof stream.results[0].resultEndTime.seconds === 'string')
        seconds = parseInt(stream.results[0].resultEndTime.seconds, 10);
      else if (Long.isLong(stream.results[0].resultEndTime.seconds))
        seconds = stream.results[0].resultEndTime.seconds.toNumber();
      else seconds = stream.results[0].resultEndTime.seconds;
      this.resultEndTime =
        seconds * 1000 +
        Math.round(stream.results[0].resultEndTime.nanos / 1000000);

      // Calculate the correct time based on the offset from audio sent twice
      // across a stream restart.
      const correctedTime =
        this.resultEndTime -
        this.bridgingOffset +
        this.streamingLimit * this.restartCounter;

      process.stdout.clearLine(0);
      process.stdout.cursorTo(0);
      let stdoutText = '';
      if (stream.results[0] && stream.results[0].alternatives[0]) {
        stdoutText =
          correctedTime + ': ' + stream.results[0].alternatives[0].transcript;
      }

      if (stream.results[0].isFinal) {
        process.stdout.write(chalk.green(`${stdoutText}\n`));
        this.socket.emit(
          'speechData',
          stream.results[0].alternatives[0].transcript
        );
        this.isFinalEndTime = this.resultEndTime;
        this.lastTranscriptWasFinal = true;
      } else {
        // Make sure the transcript does not exceed the console width.
        if (stdoutText.length > process.stdout.columns) {
          stdoutText =
            stdoutText.substring(0, process.stdout.columns - 4) + '...';
        }
        process.stdout.write(chalk.red(`${stdoutText}`));
        this.lastTranscriptWasFinal = false;
      }
    }
  }
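
  // NOTE: this gist calls stopRecognitionStream() and restartStream() but
  // never defines them. The two methods below are a reconstruction sketched
  // from Google's public "infinite streaming" sample and this class's own
  // fields; treat them as an assumption, not the author's exact code.
  stopRecognitionStream() {
    if (this.restartTimeout) {
      clearTimeout(this.restartTimeout); // cancel any pending scheduled restart
    }
    if (this.recognizeStream) {
      this.recognizeStream.end();
    }
    this.recognizeStream = null;
  }

  restartStream() {
    this.stopRecognitionStream();
    if (this.resultEndTime > 0) {
      this.finalRequestEndTime = this.isFinalEndTime;
    }
    this.resultEndTime = 0;
    // Keep the most recent audio so it can be "bridged" into the new stream,
    // avoiding words dropped at the restart boundary.
    this.lastAudioInput = this.audioInput;
    this.restartCounter++;
    if (!this.lastTranscriptWasFinal) {
      process.stdout.write('\n');
    }
    process.stdout.write(
      chalk.yellow(
        `${this.streamingLimit * this.restartCounter}: RESTARTING REQUEST\n`
      )
    );
    this.newStream = true;
    this.startRecognitionStream();
  }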
}
export default new SpeechToTextUtils();
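
// Hypothetical usage sketch (not part of the gist): one way to drive this
// singleton from a socket.io server. The inbound event names
// ('startGoogleCloudStream', 'binaryData', 'endGoogleCloudStream') are
// assumptions; only 'speechData' and 'googleCloudStreamError' appear above.
//
// import { Server } from 'socket.io';
// import speechToText from './SpeechToTextUtils';
//
// const io = new Server(8080);
// io.on('connection', (socket) => {
//   speechToText._socket = socket;
//   speechToText._request = {
//     config: {
//       encoding: 'LINEAR16',
//       sampleRateHertz: 16000,
//       languageCode: 'en-US',
//     },
//     interimResults: true, // needed for the red interim console output
//   };
//   socket.on('startGoogleCloudStream', () => speechToText.startRecognitionStream());
//   socket.on('binaryData', (chunk) => speechToText.recognizeStream?.write(chunk));
//   socket.on('endGoogleCloudStream', () => speechToText.stopRecognitionStream());
// });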