@Lachee · Last active May 9, 2023
Converts your speech into text, and then back into speech, in mostly real time using the Web Speech API. (CHROME ONLY)
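Two variants follow. The first sends each recognised phrase to an OpenTTS server (http://localhost:5500 by default) and plays the returned audio clip; the second needs no server and speaks through the browser's built-in speechSynthesis voices.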
<html>
<head>
<style>
* { font-family: Arial, Helvetica, sans-serif;}
div {
background: #00000008;
margin: 10px;
min-height: 1em;
padding: 10px;
border-radius: 5px;
}
</style>
</head>
<body>
OpenTTS URL
<input type="url" id="api" name="api" value="http://localhost:5500/">
Use Coqui (slow):
<input type="checkbox" id="coqui" checked />
<button id="play">Begin</button><br>
<hr>
Status: <div id="status"></div>
<hr>
Recognised: <div id="text"></div>
Queue: <ul id="queue"></ul>
Speech: <div id="speech"></div>
<audio id="audio" controls="" autoplay="" src="blob:http://localhost:5500/cd4f8b80-c654-4bfa-b233-7728b718cd8e"></audio>
</body>
<script>
// STT: https://www.google.com/intl/en/chrome/demos/speech.html
// TTS: https://github.com/mdn/dom-examples/tree/main/web-speech-api/speak-easy-synthesis
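// Flow: webkitSpeechRecognition emits final transcripts -> each transcript is queued,
// synthesised via the OpenTTS /api/tts endpoint, and the returned audio blobs play in order.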
const lang = 'en-AU';
const voice = 'larynx:southern_english_female-glow_tts'; // 'Google UK English Female';
const coquiVoice = 'coqui-tts:en_vctk';
const coquiSpeaker = 'p259';
const playButton = document.getElementById('play');
const resultBox = document.getElementById('text');
const recognitionStatusBox = document.getElementById('status');
const speechBox = document.getElementById('speech');
const queueList = document.getElementById('queue');
const coquiCheckbox = document.getElementById('coqui');
const audio = document.getElementById('audio');
const apiInput = document.getElementById('api');
let isSpeaking = false;
let isListening = false;
let recognitionResults = '';
let start_timestamp = 0; // set on recognition start; read by onerror below
let recognition = new webkitSpeechRecognition(); // Chrome-only prefixed API
let synthDownloadQueue = [];
recognition.lang = lang;
recognition.continuous = true;
recognition.interimResults = true;
recognition.maxAlternatives = 1;
recognition.onstart = function(event) {
recognitionStatusBox.innerText = 'Recognition started. Waiting...';
start_timestamp = event.timeStamp; // lets onerror tell an instant block from a user denial
};
recognition.onerror = function(event) {
if (event.error == 'no-speech') {
recognitionStatusBox.innerText = 'No speech available';
}
if (event.error == 'audio-capture') {
recognitionStatusBox.innerText = 'No microphone available';
}
if (event.error == 'not-allowed') {
if (event.timeStamp - start_timestamp < 100) {
recognitionStatusBox.innerText = 'Web Speech API Blocked';
} else {
recognitionStatusBox.innerText = 'Web Speech API Denied';
}
}
};
recognition.onend = function(e) {
recognitionStatusBox.innerText = 'Recognition ended.';
// Chrome stops continuous recognition after a while; restart if we should still be listening
if (isListening) {
console.warn('recognition has ended early', e);
recognition.start();
}
};
recognition.onresult = function(event) {
let interim = '';
for (var i = event.resultIndex; i < event.results.length; ++i) {
if (event.results[i].isFinal) {
recognitionResults = event.results[i][0].transcript;
console.log('speaking', event.results[i]);
if (event.results[i].length > 1) alert('DEBUG: ANOTHER RESULT');
speak(recognitionResults);
} else {
interim += event.results[i][0].transcript;
}
}
resultBox.innerHTML = linebreak(recognitionResults);
recognitionStatusBox.innerHTML = linebreak(interim);
}
function stopListening() {
console.log('stopped listening');
isListening = false;
recognition.stop();
playButton.innerText = 'Begin';
}
function startListen() {
if (isListening)
stopListening();
isListening = true;
recognition.start();
resultBox.innerHTML = '';
playButton.innerText = 'End';
}
function speak(words) {
if (words === "")
return false;
/** Downloads the synth for the words and returns a URL for the blob */
const download = async (words, attempts = 3) => {
try {
const url = new URL('/api/tts', apiInput.value);
if (coquiCheckbox.checked) {
url.searchParams.append('voice', coquiVoice);
url.searchParams.append('speakerId', coquiSpeaker);
} else {
url.searchParams.append('voice', voice);
}
url.searchParams.append('lang', 'en');
url.searchParams.append('text', words);
url.searchParams.append('vocoder', 'medium'); // quality
url.searchParams.append('ssml', false); // SSML support
console.log('requesting ', url.toString());
const response = await fetch(url);
if (!response.ok) {
// treat a bad status like a network failure so the catch below can retry
throw new Error(`synth request failed with status ${response.status}`);
}
const blob = await response.blob();
return URL.createObjectURL(blob);
} catch (e) {
if (attempts <= 0) {
console.error('DOWNLOAD ABORTED', e);
} else {
console.warn('failed to download clip, retrying shortly', e);
return new Promise((resolve) => {
setTimeout(() => download(words, attempts - 1).then(r => resolve(r)), 250);
});
}
}
}
synthDownloadQueue.push({ words, synth: download(words) });
updateDownloadQueue();
if (!isSpeaking) {
startSpeaking();
}
}
async function startSpeaking() {
const play = (src) => new Promise((resolve, reject) => {
audio.onended = resolve;
audio.onerror = reject;
audio.src = src;
audio.play();
});
isSpeaking = true;
while(synthDownloadQueue.length > 0) {
const download = synthDownloadQueue.shift();
updateDownloadQueue();
speechBox.innerText = download.words + "... (downloading)";
const url = await download.synth;
if (!url) {
// every retry failed; skip this clip rather than feeding the audio element a bad src
speechBox.innerText = '';
continue;
}
speechBox.innerText = download.words;
console.log('playing', url);
await play(url).catch((e) => console.error('playback failed', e));
speechBox.innerText = '';
}
isSpeaking = false;
}
function updateDownloadQueue() {
let html = '';
for (const download of synthDownloadQueue)
html = `<li><div>${download.words}</div></li>${html}`; // prepend: newest request renders at the top
queueList.innerHTML = html;
}
playButton.addEventListener('click', () => {
if (isListening) {
stopListening();
} else {
startListen();
}
});
function capitalize(s) {
const first_char = /\S/;
return s.replace(first_char, function(m) { return m.toUpperCase(); });
}
function linebreak(s) {
const two_line = /\n\n/g;
const one_line = /\n/g;
return s.replace(two_line, '<p></p>').replace(one_line, '<br>');
}
document.addEventListener('DOMContentLoaded', () => {
//startListen();
})
</script>
</html>
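The second variant below is the same recognition loop, but synthesis goes through the browser's own speechSynthesis API instead of an OpenTTS server, with sliders for pitch and rate.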
<html>
<head>
<style>
* { font-family: Arial, Helvetica, sans-serif;}
div {
background: #00000008;
margin: 10px;
min-height: 1em;
padding: 10px;
border-radius: 5px;
}
</style>
</head>
<body>
<div>
<input type="range" id="pitch" name="pitch" min="0" max="2" value="1" step="0.1">
<label for="volume">Pitch</label>
</div>
<div>
<input type="range" id="rate" name="rate" min="0" max="2" value="1" step="0.1">
<label for="cowbell">Rate</label>
</div>
<button id="play">Begin</button><br>
<hr>
Status: <div id="status"></div>
<hr>
Recognised: <div id="text"></div>
Queue: <ul id="queue"></ul>
Speech: <div id="speech"></div>
</body>
<script>
// STT: https://www.google.com/intl/en/chrome/demos/speech.html
// TTS: https://github.com/mdn/dom-examples/tree/main/web-speech-api/speak-easy-synthesis
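// Flow: webkitSpeechRecognition emits final transcripts -> each transcript is queued
// as a SpeechSynthesisUtterance and spoken in order by window.speechSynthesis.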
const lang = 'en-US';
const voice = 'Microsoft Catherine - English (Australia)'; // 'Google UK English Female';
const synth = window.speechSynthesis;
const playButton = document.getElementById('play');
const resultBox = document.getElementById('text');
const recognitionStatusBox = document.getElementById('status');
const speechBox = document.getElementById('speech');
const queueList = document.getElementById('queue');
const pitchRange = document.getElementById('pitch');
const rateRange = document.getElementById('rate');
let voices = [];
function populateVoices() {
voices = synth.getVoices().sort(function (a, b) {
const aname = a.name.toUpperCase();
const bname = b.name.toUpperCase();
if (aname < bname) {
return -1;
} else if (aname == bname) {
return 0;
} else {
return +1;
}
});
console.log('voices', voices);
}
synth.addEventListener('voiceschanged', () => populateVoices());
let isSynthing = false;
let isListening = false;
let recognitionResults = '';
let start_timestamp = 0; // set on recognition start; read by onerror below
let recognition = new webkitSpeechRecognition(); // Chrome-only prefixed API
let synthQueue = [];
recognition.lang = lang;
recognition.continuous = true;
recognition.interimResults = true;
recognition.maxAlternatives = 1;
recognition.onstart = function(event) {
recognitionStatusBox.innerText = 'Recognition started. Waiting...';
start_timestamp = event.timeStamp; // lets onerror tell an instant block from a user denial
};
recognition.onerror = function(event) {
if (event.error == 'no-speech') {
recognitionStatusBox.innerText = 'No speech available';
}
if (event.error == 'audio-capture') {
recognitionStatusBox.innerText = 'No microphone available';
}
if (event.error == 'not-allowed') {
if (event.timeStamp - start_timestamp < 100) {
recognitionStatusBox.innerText = 'Web Speech API Blocked';
} else {
recognitionStatusBox.innerText = 'Web Speech API Denied';
}
}
};
recognition.onend = function(e) {
recognitionStatusBox.innerText = 'Recognition ended.';
if (isListening) {
console.warn('recognition has ended early', e);
recognition.start();
}
};
recognition.onresult = function(event) {
let interim = '';
for (var i = event.resultIndex; i < event.results.length; ++i) {
if (event.results[i].isFinal) {
recognitionResults = event.results[i][0].transcript;
console.log('speaking', event.results[i]);
if (event.results[i].length > 1) alert('DEBUG: ANOTHER RESULT');
speak(recognitionResults);
} else {
interim += event.results[i][0].transcript;
}
}
resultBox.innerHTML = linebreak(recognitionResults);
recognitionStatusBox.innerHTML = linebreak(interim);
}
function stopListening() {
console.log('stopped listening');
isListening = false;
recognition.stop();
playButton.innerText = 'Begin';
}
function startListen() {
if (isListening)
stopListening();
isListening = true;
recognition.start();
resultBox.innerHTML = '';
playButton.innerText = 'End';
}
function speak(words) {
if (words === "")
return false;
synthQueue.push(words);
updateQueue();
if (!isSynthing) {
startSynthQueue();
}
}
async function startSynthQueue() {
const say = (words) => new Promise((resolve, reject) => {
const utterThis = new SpeechSynthesisUtterance(words);
utterThis.onend = function (event) {
console.log("SpeechSynthesisUtterance.onend");
recognitionStatusBox.innerText = "Synth Ended";
resolve(words);
};
utterThis.onerror = function (event) {
console.error("SpeechSynthesisUtterance.onerror", event);
recognitionStatusBox.innerText = "Synth Errored";
reject(event);
};
if (voices.length == 0)
populateVoices();
for (let i = 0; i < voices.length; i++) {
if (voices[i].name === voice) {
utterThis.voice = voices[i];
break;
}
}
utterThis.pitch = parseFloat(pitchRange.value);
utterThis.rate = parseFloat(rateRange.value);
synth.speak(utterThis);
speechBox.innerText = words;
});
isSynthing = true;
while(synthQueue.length > 0) {
const words = synthQueue.shift();
updateQueue();
await say(words).catch((e) => console.error('synthesis failed', e)); // keep the queue draining even if one utterance errors
}
isSynthing = false;
}
function updateQueue() {
let html = '';
for(const w of synthQueue)
html = `<li><div>${w}</div></li>${html}`
queueList.innerHTML = html;
}
playButton.addEventListener('click', () => {
if (isListening) {
stopListening();
} else {
startListen();
}
});
function capitalize(s) {
const first_char = /\S/;
return s.replace(first_char, function(m) { return m.toUpperCase(); });
}
function linebreak(s) {
const two_line = /\n\n/g;
const one_line = /\n/g;
return s.replace(two_line, '<p></p>').replace(one_line, '<br>');
}
document.addEventListener('DOMContentLoaded', () => {
//startListen();
})
</script>
</html>
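Both files rely on Chrome's prefixed webkitSpeechRecognition constructor, which is what makes the gist Chrome-only, and the microphone generally requires a secure context, so serving the page from localhost is the easiest way to test. A minimal support check (a sketch, not part of the original gist) could fall back to the unprefixed constructor where browsers provide one:
<script>
// Prefer the standard constructor, falling back to Chrome's prefixed one.
const Recognition = window.SpeechRecognition || window.webkitSpeechRecognition;
if (!Recognition) {
alert('Speech recognition is not supported in this browser; try Chrome.');
} else {
const recognition = new Recognition();
// ...configure lang, continuous and interimResults as in the files above...
}
</script>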