Converts your speech into text, and then back into speech in mostly real time using the Web Speech API. (Chrome only.)
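Two variants follow. The first sends each recognised phrase to a local OpenTTS server for synthesis and plays back the returned audio; it assumes a server at http://localhost:5500/ (OpenTTS can typically be started with `docker run -it -p 5500:5500 synesthesiam/opentts:en`, though check the OpenTTS docs for the current image). The second uses the browser's built-in speechSynthesis voices instead, so it needs no server.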
<html>
<head>
  <style>
    * { font-family: Arial, Helvetica, sans-serif; }
    div {
      background: #00000008;
      margin: 10px;
      min-height: 1em;
      padding: 10px;
      border-radius: 5px;
    }
  </style>
</head>
<body>
  OpenTTS URL
  <input type="url" id="api" name="api" value="http://localhost:5500/">
  Use Coqui (slow):
  <input type="checkbox" id="coqui" checked />
  <button id="play">Begin</button><br>
  <hr>
  Status: <div id="status"></div>
  <hr>
  Recognised: <div id="text"></div>
  Queue: <ul id="queue"></ul>
  Speech: <div id="speech"></div>
  <audio id="audio" controls autoplay></audio>
</body>
<script>
// STT: https://www.google.com/intl/en/chrome/demos/speech.html
// TTS: https://github.com/mdn/dom-examples/tree/main/web-speech-api/speak-easy-synthesis
const lang = 'en-AU';
const voice = 'larynx:southern_english_female-glow_tts'; // 'Google UK English Female';
const coquiVoice = 'coqui-tts:en_vctk';
const coquiSpeaker = 'p259';

const playButton = document.getElementById('play');
const resultBox = document.getElementById('text');
const recognitionStatusBox = document.getElementById('status');
const speechBox = document.getElementById('speech');
const queueList = document.getElementById('queue');
const coquiCheckbox = document.getElementById('coqui');
const audio = document.getElementById('audio');
const apiInput = document.getElementById('api');

let isSpeaking = false;
let isListening = false;
let startTimestamp = 0; // set when listening begins; used to classify 'not-allowed' errors
let recognitionResults = '';
let recognition = new webkitSpeechRecognition();
let synthDownloadQueue = [];

recognition.lang = lang;
recognition.continuous = true;
recognition.interimResults = true;
recognition.maxAlternatives = 1;
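// continuous + interimResults keep the session alive across pauses and
// surface partial transcripts while the user is still talking; the handlers
// below drive the status display and restart recognition if it stops early.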
recognition.onstart = function() {
  recognitionStatusBox.innerText = 'Recognition started. Waiting...';
};
recognition.onerror = function(event) {
  if (event.error == 'no-speech') {
    recognitionStatusBox.innerText = 'No speech available';
  }
  if (event.error == 'audio-capture') {
    recognitionStatusBox.innerText = 'No microphone available';
  }
  if (event.error == 'not-allowed') {
    // A failure within 100 ms of start means access was already blocked;
    // otherwise the user denied the permission prompt.
    if (event.timeStamp - startTimestamp < 100) {
      recognitionStatusBox.innerText = 'Web Speech API Blocked';
    } else {
      recognitionStatusBox.innerText = 'Web Speech API Denied';
    }
  }
};
recognition.onend = function(e) {
  recognitionStatusBox.innerText = 'Recognition ended.';
  if (isListening) {
    console.warn('recognition has ended early', e);
    recognition.start();
  }
};
recognition.onresult = function(event) {
  let interim = '';
  for (let i = event.resultIndex; i < event.results.length; ++i) {
    if (event.results[i].isFinal) {
      recognitionResults = event.results[i][0].transcript;
      console.log('speaking', event.results[i]);
      if (event.results[i].length > 1) console.warn('DEBUG: additional alternatives', event.results[i]);
      speak(recognitionResults);
    } else {
      interim += event.results[i][0].transcript;
    }
  }
  resultBox.innerHTML = linebreak(recognitionResults);
  recognitionStatusBox.innerHTML = linebreak(interim);
};
function stopListening() {
  console.log('stopped listening');
  isListening = false;
  recognition.stop();
  playButton.innerText = 'Begin';
}
function startListen() {
  if (isListening)
    stopListening();
  isListening = true;
  startTimestamp = performance.now(); // reference point for the 'not-allowed' check
  recognition.start();
  resultBox.innerHTML = '';
  playButton.innerText = 'End';
}
function speak(words) {
  if (words === "")
    return false;
  /** Downloads the synth for the words and returns a URL for the blob */
  const download = async (words, attempts = 3) => {
    try {
      const url = new URL('/api/tts', apiInput.value);
      if (coquiCheckbox.checked) {
        url.searchParams.append('voice', coquiVoice);
        url.searchParams.append('speakerId', coquiSpeaker);
      } else {
        url.searchParams.append('voice', voice);
      }
      url.searchParams.append('lang', 'en');
      url.searchParams.append('text', words);
      url.searchParams.append('vocoder', 'medium'); // quality
      url.searchParams.append('ssml', false);       // SSML support
      console.log('requesting ', url.toString());
      const response = await fetch(url);
      if (!response.ok) {
        console.error('failed to synth', response);
        return;
      }
      const blob = await response.blob();
      return URL.createObjectURL(blob);
    } catch (e) {
      if (attempts <= 0) {
        console.error('DOWNLOAD ABORTED', e);
        return; // caller must handle an undefined URL
      }
      console.warn('failed to download clip, trying again in some time', e);
      return new Promise((resolve) => {
        setTimeout(() => download(words, attempts - 1).then(r => resolve(r)), 250);
      });
    }
  };
  synthDownloadQueue.push({ words, synth: download(words) });
  updateDownloadQueue();
  if (!isSpeaking) {
    startSpeaking();
  }
}
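// Downloads are kicked off eagerly the moment a phrase is queued, so the
// next clip is usually ready by the time the current one finishes playing;
// this overlap is what keeps the loop "mostly real time". With the defaults
// above, a queued Coqui request would look roughly like (text value is an
// illustrative placeholder):
//   http://localhost:5500/api/tts?voice=coqui-tts%3Aen_vctk&speakerId=p259&lang=en&text=hello+world&vocoder=medium&ssml=false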
async function startSpeaking() {
  const play = (src) => new Promise((resolve) => {
    audio.src = src;
    audio.onended = resolve;
    audio.onerror = resolve; // don't stall the queue on a playback error
    audio.play();
  });
  isSpeaking = true;
  while (synthDownloadQueue.length > 0) {
    const download = synthDownloadQueue.shift();
    updateDownloadQueue();
    speechBox.innerText = download.words + "... (downloading)";
    const url = await download.synth;
    if (!url) { // download failed after all retries; skip this clip
      speechBox.innerText = '';
      continue;
    }
    speechBox.innerText = download.words;
    console.log('playing', url);
    await play(url);
    URL.revokeObjectURL(url); // release the blob once played
    speechBox.innerText = '';
  }
  isSpeaking = false;
}
function updateDownloadQueue() {
  let html = '';
  for (const download of synthDownloadQueue)
    html = `<li><div>${download.words}</div></li>${html}`; // newest entry first
  queueList.innerHTML = html;
}
playButton.addEventListener('click', () => {
  if (isListening) {
    stopListening();
  } else {
    startListen();
  }
});
function capitalize(s) {
  const first_char = /\S/;
  return s.replace(first_char, function(m) { return m.toUpperCase(); });
}
function linebreak(s) {
  const two_line = /\n\n/g;
  const one_line = /\n/g;
  return s.replace(two_line, '<p></p>').replace(one_line, '<br>');
}
document.addEventListener('DOMContentLoaded', () => {
  //startListen();
});
</script>
</html>
The second file is the speechSynthesis-only variant; no OpenTTS server is required.
<html>
<head>
  <style>
    * { font-family: Arial, Helvetica, sans-serif; }
    div {
      background: #00000008;
      margin: 10px;
      min-height: 1em;
      padding: 10px;
      border-radius: 5px;
    }
  </style>
</head>
<body>
  <div>
    <input type="range" id="pitch" name="pitch" min="0" max="2" value="1" step="0.1">
    <label for="pitch">Pitch</label>
  </div>
  <div>
    <input type="range" id="rate" name="rate" min="0" max="2" value="1" step="0.1">
    <label for="rate">Rate</label>
  </div>
  <button id="play">Begin</button><br>
  <hr>
  Status: <div id="status"></div>
  <hr>
  Recognised: <div id="text"></div>
  Queue: <ul id="queue"></ul>
  Speech: <div id="speech"></div>
</body>
<script>
// STT: https://www.google.com/intl/en/chrome/demos/speech.html
// TTS: https://github.com/mdn/dom-examples/tree/main/web-speech-api/speak-easy-synthesis
const lang = 'en-US';
const voice = 'Microsoft Catherine - English (Australia)'; // 'Google UK English Female';
const synth = window.speechSynthesis;

const playButton = document.getElementById('play');
const resultBox = document.getElementById('text');
const recognitionStatusBox = document.getElementById('status');
const speechBox = document.getElementById('speech');
const queueList = document.getElementById('queue');
const pitchRange = document.getElementById('pitch');
const rateRange = document.getElementById('rate');

let voices = [];
function populateVoices() {
  voices = synth.getVoices().sort(function (a, b) {
    const aname = a.name.toUpperCase();
    const bname = b.name.toUpperCase();
    if (aname < bname) {
      return -1;
    } else if (aname == bname) {
      return 0;
    } else {
      return +1;
    }
  });
  console.log('voices', voices);
}
synth.addEventListener('voiceschanged', () => populateVoices());
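// Chrome loads its voice list asynchronously, so getVoices() can return an
// empty array on first call; voiceschanged fires once the list is actually
// populated (say() below also retries populateVoices() as a fallback).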
let isSynthing = false;
let isListening = false;
let startTimestamp = 0; // set when listening begins; used to classify 'not-allowed' errors
let recognitionResults = '';
let recognition = new webkitSpeechRecognition();
let synthQueue = [];

recognition.lang = lang;
recognition.continuous = true;
recognition.interimResults = true;
recognition.maxAlternatives = 1;
recognition.onstart = function() {
  recognitionStatusBox.innerText = 'Recognition started. Waiting...';
};
recognition.onerror = function(event) {
  if (event.error == 'no-speech') {
    recognitionStatusBox.innerText = 'No speech available';
  }
  if (event.error == 'audio-capture') {
    recognitionStatusBox.innerText = 'No microphone available';
  }
  if (event.error == 'not-allowed') {
    // A failure within 100 ms of start means access was already blocked;
    // otherwise the user denied the permission prompt.
    if (event.timeStamp - startTimestamp < 100) {
      recognitionStatusBox.innerText = 'Web Speech API Blocked';
    } else {
      recognitionStatusBox.innerText = 'Web Speech API Denied';
    }
  }
};
recognition.onend = function(e) {
  recognitionStatusBox.innerText = 'Recognition ended.';
  if (isListening) {
    console.warn('recognition has ended early', e);
    recognition.start();
  }
};
recognition.onresult = function(event) {
  let interim = '';
  for (let i = event.resultIndex; i < event.results.length; ++i) {
    if (event.results[i].isFinal) {
      recognitionResults = event.results[i][0].transcript;
      console.log('speaking', event.results[i]);
      if (event.results[i].length > 1) console.warn('DEBUG: additional alternatives', event.results[i]);
      speak(recognitionResults);
    } else {
      interim += event.results[i][0].transcript;
    }
  }
  resultBox.innerHTML = linebreak(recognitionResults);
  recognitionStatusBox.innerHTML = linebreak(interim);
};
function stopListening() {
  console.log('stopped listening');
  isListening = false;
  recognition.stop();
  playButton.innerText = 'Begin';
}
function startListen() {
  if (isListening)
    stopListening();
  isListening = true;
  startTimestamp = performance.now(); // reference point for the 'not-allowed' check
  recognition.start();
  resultBox.innerHTML = '';
  playButton.innerText = 'End';
}
function speak(words) {
  if (words === "")
    return false;
  synthQueue.push(words);
  updateQueue();
  if (!isSynthing) {
    startSynthQueue();
  }
}
async function startSynthQueue() {
  const say = (words) => new Promise((resolve, reject) => {
    const utterThis = new SpeechSynthesisUtterance(words);
    utterThis.onend = function(event) {
      console.log("SpeechSynthesisUtterance.onend");
      recognitionStatusBox.innerText = "Synth Ended";
      resolve(words);
    };
    utterThis.onerror = function(event) {
      console.error("SpeechSynthesisUtterance.onerror", event);
      recognitionStatusBox.innerText = "Synth Errored";
      reject(event);
    };
    if (voices.length == 0)
      populateVoices();
    for (let i = 0; i < voices.length; i++) {
      if (voices[i].name === voice) {
        utterThis.voice = voices[i];
        break;
      }
    }
    utterThis.pitch = parseFloat(pitchRange.value);
    utterThis.rate = parseFloat(rateRange.value);
    synth.speak(utterThis);
    speechBox.innerText = words;
  });
  isSynthing = true;
  while (synthQueue.length > 0) {
    const words = synthQueue.shift();
    updateQueue();
    try {
      await say(words);
    } catch (e) {
      console.warn('skipping failed utterance', e); // keep draining the queue on error
    }
  }
  isSynthing = false;
  speechBox.innerText = '';
}
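// say() wraps the utterance's end/error events in a Promise so the while
// loop above plays each queued phrase strictly in order, one at a time.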
function updateQueue() {
  let html = '';
  for (const w of synthQueue)
    html = `<li><div>${w}</div></li>${html}`; // newest entry first
  queueList.innerHTML = html;
}
playButton.addEventListener('click', () => {
  if (isListening) {
    stopListening();
  } else {
    startListen();
  }
});
function capitalize(s) {
  const first_char = /\S/;
  return s.replace(first_char, function(m) { return m.toUpperCase(); });
}
function linebreak(s) {
  const two_line = /\n\n/g;
  const one_line = /\n/g;
  return s.replace(two_line, '<p></p>').replace(one_line, '<br>');
}
document.addEventListener('DOMContentLoaded', () => {
  //startListen();
});
</script>
</html>