Created
January 15, 2024 09:34
-
-
Save dedmen/575c0d597dc2c0c3b19ea9c6c64d1794 to your computer and use it in GitHub Desktop.
TextToSpeech.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Header file: TextToSpeech.h
#pragma once
#include <memory>
#include <string>
#include <string_view>
struct ISpVoice;
struct ISpStreamFormat;
namespace Speech
{
/**
\brief Thin wrapper around a SAPI voice that speaks XML/SSML text asynchronously.
Speech is started with SpeakXML/SpeakSSML; Tick has to be called regularly afterwards
so that queued SAPI events are forwarded to the On* handlers.
*/
class TextToSpeech
{
    friend class TextToSpeechClass;
    // Declared before m_Voice on purpose: members are destroyed in reverse order, so the
    // voice (which is handed this stream in SetAudioEnabled) is released before the stream.
    std::shared_ptr<ISpStreamFormat> m_NullStream;
    std::shared_ptr<ISpVoice> m_Voice;
    bool m_IsSpeaking = false;
    bool m_HasPendingTTSNotifications = false;
public:
    /**
    \brief Initializes COM and creates the SAPI voice. On failure the instance stays inert (all calls become no-ops).
    */
    TextToSpeech();
    /**
    \brief Speaks SAPI XML text asynchronously, replacing anything currently being spoken.
    The TextToSpeech instance needs to stay alive until speech is over.
    https://learn.microsoft.com/en-us/previous-versions/windows/desktop/ms717077(v=vs.85)
    \param stuff UTF-8 encoded SAPI XML
    */
    void SpeakXML(std::string_view stuff);
    /**
    \brief Speaks SSML text asynchronously, replacing anything currently being spoken.
    The TextToSpeech instance needs to stay alive until speech is over.
    https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup?tabs=csharp
    \param stuff UTF-8 encoded SSML
    */
    void SpeakSSML(std::string_view stuff);
    /**
    \brief Required to be called for events to fire
    */
    void Tick();
    /**
    \brief Switches output between the default audio device (true) and a stream that discards all audio (false)
    */
    void SetAudioEnabled(bool enabled);
    /**
    \brief Called during speech to update visual face/mouth state https://learn.microsoft.com/en-us/previous-versions/windows/desktop/ms717289(v=vs.85)
    \param viseme SAPI viseme ID
    */
    void OnMouthStateChanged(int viseme); //#TODO implement this as a event that's forwarded to script via extension callback
    /**
    \brief Called for every phoneme that is spoken
    \param durationMilliseconds duration of the phoneme
    \param phoneme https://learn.microsoft.com/en-us/previous-versions/windows/desktop/ee431828(v=vs.85) phoneme ID
    */
    void OnPhoneme(int durationMilliseconds, int phoneme); //#TODO implement this as a event that's forwarded to script via extension callback
    /**
    \brief Called when whole queued speech has completed
    */
    void OnSpeechDone(); //#TODO implement this as a event that's forwarded to script via extension callback
    /**
    \brief Called when a XML bookmark tag has been hit
    \param name UTF-8 encoded bookmark name
    */
    void OnBookmark(std::string_view name); //#TODO implement this as a event that's forwarded to script via extension callback
    /**
    \brief Invoked by the SAPI notification callback; records that events are waiting to be processed
    */
    void OnTTSNotification();
};
}
// Source file: TextToSpeech.cpp
#include "TextToSpeech.h" | |
#include "sdkddkver.h" | |
// Fix undefined defines errors https://developercommunity.visualstudio.com/t/several-warnings-in-windows-sdk-100177630-in-windo/435362 | |
#define _WIN32_WINNT_WIN10_TH2 NTDDI_WIN10_TH2 | |
#define _WIN32_WINNT_WIN10_RS1 NTDDI_WIN10_RS1 | |
#define _WIN32_WINNT_WIN10_RS2 NTDDI_WIN10_RS2 | |
#define _WIN32_WINNT_WIN10_RS3 NTDDI_WIN10_RS3 | |
#define _WIN32_WINNT_WIN10_RS4 NTDDI_WIN10_RS4 | |
#define _WIN32_WINNT_WIN10_RS5 NTDDI_WIN10_RS5 | |
// warning C4996: 'GetVersionExW': was declared deprecated | |
#pragma warning(disable: 4996) | |
#include "sphelper.h" | |
namespace Speech | |
{ | |
/**
\brief Notification trampoline registered with the voice.
\param wParam the TextToSpeech instance that registered the callback, passed through as an integer
\param lParam unused
*/
void __stdcall SPNOTIFYCALLBACK(WPARAM wParam, LPARAM lParam)
{
    auto* const speaker = reinterpret_cast<TextToSpeech*>(wParam);
    speaker->OnTTSNotification();
}
/**
\brief Initializes COM, creates the SAPI voice and subscribes to phoneme, viseme and bookmark events.
If any step fails m_Voice stays empty, which turns every other member function into a no-op.
*/
TextToSpeech::TextToSpeech()
{
    // NOTE(review): a successful CoInitialize is never balanced with CoUninitialize because the
    // class declares no destructor - confirm whether the owner of this thread takes care of that.
    if (FAILED(CoInitialize(nullptr)))
    {
        return;
    }

    //Initialize SAPI
    ISpVoice* rawVoice = nullptr;
    const HRESULT hr = CoCreateInstance(CLSID_SpVoice, nullptr, CLSCTX_ALL, IID_ISpVoice, reinterpret_cast<void**>(&rawVoice));
    if (FAILED(hr) || rawVoice == nullptr)
    {
        //#TODO log error
        return; // Without a voice there is nothing to configure, and dereferencing m_Voice below would crash
    }

    // COM objects must be released through Release(), never deleted, hence the custom deleter.
    m_Voice.reset(rawVoice, [](ISpVoice* voice) { voice->Release(); });

    // `this` travels through wParam and is cast back inside the callback.
    m_Voice->SetNotifyCallbackFunction(SPNOTIFYCALLBACK, reinterpret_cast<WPARAM>(this), 0);

    // The same mask is used for "events that notify" and "events that get queued".
    const ULONGLONG interestingEvents = SPFEI(SPEI_PHONEME) | SPFEI(SPEI_VISEME) | SPFEI(SPEI_TTS_BOOKMARK);
    const HRESULT interestResult = m_Voice->SetInterest(interestingEvents, interestingEvents);
    if (FAILED(interestResult))
    {
        //#TODO log error, speech still works but no phoneme/viseme/bookmark events will arrive
    }
}
void TextToSpeech::SpeakXML(std::string_view stuff) | |
{ | |
if (!m_Voice) | |
return; // #TODO error print | |
#TODO convert string from UTF-8 to UTF-16 | |
std::wstring speechStr; | |
// Utf8ToWideChar(speechStr, stuff.Data()); | |
m_Voice->Speak( speechStr.Data(), SPF_ASYNC | SPF_IS_XML | SPF_PURGEBEFORESPEAK, NULL); | |
m_IsSpeaking = true; | |
} | |
void TextToSpeech::SpeakSSML(std::string_view stuff) | |
{ | |
if (!m_Voice) | |
return; // #TODO error print | |
#TODO convert string from UTF-8 to UTF-16 | |
std::wstring speechStr; | |
//Utf8ToWideChar(speechStr, stuff.Data()); | |
m_Voice->Speak( speechStr.Data(), SPF_ASYNC | SPF_PARSE_SSML | SPF_PURGEBEFORESPEAK, NULL); | |
m_IsSpeaking = true; | |
} | |
void TextToSpeech::Tick() | |
{ | |
if (!m_Voice || !m_IsSpeaking) | |
return; // #TODO error print | |
bool signalled = WaitForSingleObjectEx(m_Voice->SpeakCompleteEvent(), 0, true) != WAIT_TIMEOUT; | |
if (signalled) | |
{ | |
m_IsSpeaking = false; | |
OnSpeechDone(); | |
} | |
if (true) // m_HasPendingTTSNotifications | |
{ | |
SPEVENT eventItem; | |
memset( &eventItem, 0,sizeof(SPEVENT)); | |
while (m_Voice->GetEvents(1, &eventItem , NULL) == S_OK) | |
{ | |
switch (eventItem.eEventId) | |
{ | |
case SPEI_VISEME: | |
{ | |
auto viseme = static_cast<SPVISEMES>(LOWORD(eventItem.lParam)); | |
OnMouthStateChanged(viseme); | |
break; | |
} | |
case SPEI_TTS_BOOKMARK: | |
{ | |
#TODO convert string from UTF-16 to UTF-8 | |
std::string bookmarkName; | |
// WideCharToUtf8(bookmarkName, (wchar_t*)eventItem.lParam); | |
OnBookmark(bookmarkName); | |
break; | |
} | |
case SPEI_PHONEME: | |
{ | |
OnPhoneme(HIWORD(eventItem.wParam), LOWORD(eventItem.lParam)); | |
} | |
default: | |
break; | |
} | |
SpClearEvent(&eventItem); | |
} | |
} | |
} | |
struct NullSPStream : ISpStreamFormat | |
{ | |
HRESULT QueryInterface(const IID& riid, void** ppvObject) override | |
{ | |
assert(ppvObject != nullptr); | |
HRESULT hr = S_OK; | |
if (riid == __uuidof(IUnknown) || riid == __uuidof(ISpStreamFormat) || riid == __uuidof(IStream)) | |
{ | |
*ppvObject = this; | |
AddRef(); | |
} | |
else | |
{ | |
*ppvObject = nullptr; | |
hr = E_NOINTERFACE; | |
} | |
return hr; | |
} | |
ULONG AddRef() override | |
{ | |
return S_OK; | |
} | |
ULONG Release() override | |
{ | |
return S_OK; | |
} | |
HRESULT Read(void* pv, ULONG cb, ULONG* pcbRead) override | |
{ | |
return E_NOTIMPL; | |
} | |
HRESULT Write(const void* pv, ULONG cb, ULONG* pcbWritten) override | |
{ | |
return S_OK; | |
} | |
HRESULT Seek(LARGE_INTEGER dlibMove, DWORD dwOrigin, ULARGE_INTEGER* plibNewPosition) override | |
{ | |
if (plibNewPosition) | |
plibNewPosition->QuadPart = dlibMove.QuadPart; | |
return S_OK; | |
} | |
HRESULT SetSize(ULARGE_INTEGER libNewSize) override | |
{ | |
return E_NOTIMPL; | |
} | |
HRESULT CopyTo(IStream* pstm, ULARGE_INTEGER cb, ULARGE_INTEGER* pcbRead, ULARGE_INTEGER* pcbWritten) override | |
{ | |
return E_NOTIMPL; | |
} | |
HRESULT Commit(DWORD grfCommitFlags) override | |
{ | |
return E_NOTIMPL; | |
} | |
HRESULT Revert() override | |
{ | |
return E_NOTIMPL; | |
} | |
HRESULT LockRegion(ULARGE_INTEGER libOffset, ULARGE_INTEGER cb, DWORD dwLockType) override | |
{ | |
return E_NOTIMPL; | |
} | |
HRESULT UnlockRegion(ULARGE_INTEGER libOffset, ULARGE_INTEGER cb, DWORD dwLockType) override | |
{ | |
return E_NOTIMPL; | |
} | |
HRESULT Stat(STATSTG* pstatstg, DWORD grfStatFlag) override | |
{ | |
return E_NOTIMPL; | |
} | |
HRESULT Clone(IStream** ppstm) override | |
{ | |
return E_NOTIMPL; | |
} | |
HRESULT GetFormat(GUID* pguidFormatId, WAVEFORMATEX** ppCoMemWaveFormatEx) override | |
{ | |
*pguidFormatId = SPDFID_WaveFormatEx; | |
*ppCoMemWaveFormatEx = (WAVEFORMATEX*)CoTaskMemAlloc(sizeof(WAVEFORMATEX)); | |
(*ppCoMemWaveFormatEx)->cbSize = 0; | |
(*ppCoMemWaveFormatEx)->nAvgBytesPerSec = 512; | |
(*ppCoMemWaveFormatEx)->nBlockAlign = 1; | |
(*ppCoMemWaveFormatEx)->nChannels = 1; | |
(*ppCoMemWaveFormatEx)->nSamplesPerSec = 44100; | |
(*ppCoMemWaveFormatEx)->wBitsPerSample = 8; | |
(*ppCoMemWaveFormatEx)->wFormatTag = WAVE_FORMAT_PCM; | |
return S_OK; | |
} | |
}; | |
void TextToSpeech::SetAudioEnabled(bool enabled) | |
{ | |
if (!m_Voice) | |
return; // #TODO error print | |
if (enabled) | |
m_Voice->SetOutput(nullptr, TRUE); // Default Audio device | |
else | |
{ | |
if (!m_NullStream) | |
m_NullStream = new NullSPStream(); | |
auto res = m_Voice->SetOutput(m_NullStream, TRUE); | |
__nop(); | |
} | |
} | |
void TextToSpeech::OnTTSNotification() | |
{ | |
m_HasPendingTTSNotifications = true; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment