Created
October 8, 2019 10:30
-
-
Save dspinellis/4d0a6a6e73d15a520b5c78d55414652e to your computer and use it in GitHub Desktop.
Command-line tool to convert speech in a WAV audio file into text using Windows SAPI
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Convert the specified speech WAV file into text output
 * on the program's standard output.
 *
 * Diomidis Spinellis, October 2019
 * Based on https://stackoverflow.com/a/40002268/20520
 */
#include <iostream>
#include <string>
#include <sapi.h>
#include <sphelper.h>
int main(int argc, char* argv[]) | |
{ | |
if (argc != 2) { | |
std::cerr << "Usage: " << argv[0] << " file.wav\n"; | |
return 1; | |
} | |
::CoInitialize(NULL); | |
HRESULT hr = S_OK; | |
CComPtr<ISpStream> cpInputStream; | |
CComPtr<ISpRecognizer> cpRecognizer; | |
CComPtr<ISpRecoContext> cpRecoContext; | |
CComPtr<ISpRecoGrammar> cpRecoGrammar; | |
hr = cpRecognizer.CoCreateInstance(CLSID_SpInprocRecognizer); | |
hr = cpInputStream.CoCreateInstance(CLSID_SpStream); | |
std::string sInputFileName(argv[1]); | |
std::wstring wInputFileName(sInputFileName.begin(), sInputFileName.end()); | |
hr = cpInputStream->BindToFile(wInputFileName.c_str(), SPFM_OPEN_READONLY, NULL, NULL, SPFEI_ALL_EVENTS); | |
if (FAILED(hr)) { | |
std::cerr << "Unable to open " << argv[1] << '\n'; | |
return 1; | |
} | |
hr = cpRecognizer->SetInput(cpInputStream, TRUE); | |
hr = cpRecognizer->CreateRecoContext(&cpRecoContext); | |
hr = cpRecoContext->CreateGrammar(NULL, &cpRecoGrammar); | |
hr = cpRecoGrammar->LoadDictation(NULL, SPLO_STATIC); | |
hr = cpRecoContext->SetNotifyWin32Event(); | |
hr = cpRecoContext->SetInterest(SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM), SPFEI(SPEI_RECOGNITION) | SPFEI(SPEI_END_SR_STREAM)); | |
hr = cpRecoGrammar->SetDictationState(SPRS_ACTIVE); | |
BOOL fEndStreamReached = FALSE; | |
while (!fEndStreamReached && cpRecoContext->WaitForNotifyEvent(INFINITE) == S_OK) { | |
CSpEvent spEvent; | |
ISpRecoResult *pPhrase; | |
SPPHRASE *phrase; | |
while (!fEndStreamReached && spEvent.GetFrom(cpRecoContext) == S_OK) { | |
switch (spEvent.eEventId) { | |
case SPEI_RECOGNITION: | |
pPhrase = spEvent.RecoResult(); | |
phrase = NULL; | |
pPhrase->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, NULL, NULL); | |
pPhrase->GetPhrase(&phrase); | |
if (phrase == NULL || phrase->pElements == NULL) | |
break; | |
for (int i = 0; i < phrase->Rule.ulCountOfElements; i++) | |
if (phrase->pElements[i].pszDisplayText != NULL) | |
std::wcout << phrase->pElements[i].pszDisplayText << ' '; | |
break; | |
case SPEI_END_SR_STREAM: | |
fEndStreamReached = TRUE; | |
break; | |
} | |
spEvent.Clear(); | |
} | |
} | |
hr = cpRecoGrammar->SetDictationState(SPRS_INACTIVE); | |
hr = cpRecoGrammar->UnloadDictation(); | |
hr = cpInputStream->Close(); | |
::CoUninitialize(); | |
std::wcout << '\n'; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment