Created
February 21, 2024 05:06
-
-
Save niuniulla/e876a0924971b47fa26d2589b23385a2 to your computer and use it in GitHub Desktop.
A minimal use case of SDL Audio and whisper.cpp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
A minimal use case of SDL Audio and whisper.cpp | |
*/ | |
/*
On launch, the program records up to one minute of audio from the
capture device and prints its transcription to the terminal.
For more info on whisper.cpp: https://github.com/ggerganov/whisper.cpp/blob/master/CMakeLists.txt
*/
#include <SDL2/SDL.h>
#include <stdio.h>
#include <stdlib.h>

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

#include "whisper.h"
// Default settings for the recording/transcription run.
// Nothing in this file parses a command line; main() default-constructs this
// struct and reads a subset of its fields (use_gpu, model, language,
// n_threads, n_processors, max_tokens, audio_ctx, speed_up, translate,
// no_fallback, print_special, no_context, no_timestamps, tinydiarize).
struct whisper_params {
    // Up to 4 worker threads. hardware_concurrency() is allowed to return 0
    // when the core count cannot be determined, so clamp to at least 1.
    int32_t n_threads    = std::max<int32_t>(
        1, std::min<int32_t>(4, static_cast<int32_t>(std::thread::hardware_concurrency())));
    int32_t n_processors = 1;
    int32_t step_ms      = 3000;
    int32_t length_ms    = 10000;
    int32_t keep_ms      = 200;
    int32_t capture_id   = -1;
    int32_t max_tokens   = 32;
    int32_t audio_ctx    = 0;

    float vad_thold  = 0.6f;
    float freq_thold = 100.0f;

    bool speed_up      = false;
    bool translate     = false;
    bool no_fallback   = false;
    bool print_special = false;
    bool no_context    = true;
    bool no_timestamps = false;
    bool tinydiarize   = false;
    bool save_audio    = false; // save audio to wav file
    bool use_gpu       = true;

    std::string language = "en";
    std::string model    = "models/ggml-base.en.bin";
    std::string fname_out;
};
static std::vector<float> pcmf32; | |
static SDL_AudioDeviceID input_dev; | |
static std::vector<float> floatBufffer; | |
static size_t in_floatPos = 0; | |
void callback(void *userdata, Uint8 *stream, int len) | |
{ | |
size_t floatLen = len / sizeof(float); | |
SDL_memcpy(&pcmf32[in_floatPos], stream, len); | |
in_floatPos += floatLen; | |
} | |
int main() { | |
// test for whisper | |
std::cout << "Starting test" << std::endl; | |
std::cout << "set params" << std::endl; | |
whisper_params params; | |
// init whisper | |
std::cout << "Init whisper" << std::endl; | |
struct whisper_context_params cparams = whisper_context_default_params(); | |
params.use_gpu = params.use_gpu; | |
std::cout << "create context" << std::endl; | |
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); | |
std::vector<whisper_token> prompt_tokens; | |
// init | |
int lenghtOfRecordingSecond = 60; // 5s | |
std::cout << "init SDL audio" << std::endl; | |
SDL_Init(SDL_INIT_AUDIO); | |
//Default recording spec | |
SDL_AudioSpec desiredRecordingSpec, receivedRecordingSpec; | |
SDL_zero(desiredRecordingSpec); | |
desiredRecordingSpec.freq = WHISPER_SAMPLE_RATE; | |
desiredRecordingSpec.format = AUDIO_F32; //float32 of little endian byte order | |
desiredRecordingSpec.channels = 1; | |
desiredRecordingSpec.samples = 1024; | |
desiredRecordingSpec.callback = callback; | |
// select audio device | |
SDL_AudioDeviceID recordingDeviceId = 0; | |
//Open recording device | |
recordingDeviceId = SDL_OpenAudioDevice( SDL_GetAudioDeviceName(0, SDL_TRUE), | |
SDL_TRUE, | |
&desiredRecordingSpec, | |
&receivedRecordingSpec, | |
SDL_AUDIO_ALLOW_FORMAT_CHANGE | |
); | |
if( recordingDeviceId == 0 ) | |
{ | |
std::cout << "ERR - Failed to open recording device :" << SDL_GetError() << std::endl; | |
return -1; | |
} | |
// compte buffer size | |
int bufferSize = receivedRecordingSpec.freq * lenghtOfRecordingSecond; | |
pcmf32.resize(bufferSize); | |
std::fill(pcmf32.begin(), pcmf32.end(), 0); | |
std::cout << "INFO - pcmf32 size :" << bufferSize << std::endl; | |
bool is_running = true; | |
bool bReadyForInference = false; | |
std::cout << "INFO - Start to record." << std::endl; | |
while (is_running) | |
{ | |
SDL_PauseAudioDevice(recordingDeviceId, SDL_FALSE);// unpause device | |
SDL_LockAudioDevice(recordingDeviceId); | |
// fill the buffer until full | |
if (in_floatPos > pcmf32.size()) | |
{ | |
std::cout << "INFO - recording done: " << std::endl; | |
SDL_PauseAudioDevice(recordingDeviceId, SDL_TRUE); // pause recording | |
bReadyForInference = true; | |
is_running = false; | |
} | |
SDL_UnlockAudioDevice(recordingDeviceId); | |
SDL_Delay(100); // pause a bit for loop | |
// do inference | |
if (bReadyForInference) | |
{ | |
std::cout << "INFO - Start to transcript." << std::endl; | |
// do inference | |
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); | |
wparams.print_progress = false; | |
wparams.print_special = params.print_special; | |
wparams.print_realtime = false; | |
wparams.print_timestamps = !params.no_timestamps; | |
wparams.translate = params.translate; | |
wparams.single_segment = false; | |
wparams.max_tokens = params.max_tokens; | |
wparams.language = params.language.c_str(); | |
wparams.n_threads = params.n_threads; | |
wparams.audio_ctx = params.audio_ctx; | |
wparams.speed_up = params.speed_up; | |
wparams.tdrz_enable = params.tinydiarize; // [TDRZ] | |
// disable temperature fallback | |
//wparams.temperature_inc = -1.0f; | |
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc; | |
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data(); | |
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size(); | |
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) | |
{ | |
std::cout << "failed to process audio." << std::endl; | |
return 1; | |
} | |
std::cout << "INFO - end of inference." << std::endl; | |
// output text | |
int n_segments = whisper_full_n_segments(ctx); | |
for (int i=0; i<n_segments; i++) | |
{ | |
const char *text = whisper_full_get_segment_text(ctx, i); | |
std::cout << "transcript: " << text; | |
} | |
SDL_Delay(5000); | |
std::cout << "INFO - finished. " << std::endl; | |
} | |
} | |
whisper_free(ctx); | |
SDL_CloseAudioDevice(recordingDeviceId); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment