Skip to content

Instantly share code, notes, and snippets.

@chimame
Created April 26, 2026 08:54
Show Gist options
  • Select an option

  • Save chimame/de7c0b6d7bcbc72af3e23652f9308ab2 to your computer and use it in GitHub Desktop.

Select an option

Save chimame/de7c0b6d7bcbc72af3e23652f9308ab2 to your computer and use it in GitHub Desktop.
Workers AI Whisper STT provider for @cloudflare/voice
import type { Transcriber, TranscriberSession, TranscriberSessionOptions } from "@cloudflare/voice";
/**
 * Options for {@link WorkersAIWhisperSTT}. Every field is optional; defaults
 * are applied in the constructor (see the `@default` tags below).
 */
export interface WorkersAIWhisperSTTOptions {
/** Language code. Use "ja" for Japanese. */
language?: string;
/** Sample rate in Hz. @default 16000 */
sampleRate?: number;
/** RMS threshold for local speech detection. @default 0.015 */
silenceThreshold?: number;
/** Silence duration that finalizes one utterance. @default 900 */
silenceDurationMs?: number;
/** Ignore very short detected sounds. @default 250 */
minSpeechMs?: number;
/** Force a transcription before the buffered utterance grows too large. @default 25000 */
maxSegmentMs?: number;
/** Preprocess audio with Workers AI VAD. @default true */
vadFilter?: boolean;
/** Optional prompt to help Whisper with domain-specific context. */
initialPrompt?: string;
}
/** Subset of the `@cf/openai/whisper-large-v3-turbo` response this module consumes. */
type WhisperLargeV3TurboResult = {
/** Transcribed text; may be absent or empty when nothing was recognized. */
text?: string;
};
/** Request payload passed to `ai.run("@cf/openai/whisper-large-v3-turbo", …)`. */
type WhisperInput = {
/** Base64-encoded WAV audio (see `encodePcm16Wav` + `toBase64`). */
audio: string;
task: "transcribe";
/** Language hint, e.g. "ja". */
language?: string;
/** Enable the model-side voice-activity-detection preprocessing pass. */
vad_filter?: boolean;
/** Domain-context prompt forwarded to Whisper. */
initial_prompt?: string;
};
/**
 * Fully-resolved session configuration: after constructor defaulting every
 * numeric/boolean tunable is required, while `language` and `initialPrompt`
 * remain optional pass-throughs.
 */
type WhisperConfig = Required<
Pick<
WorkersAIWhisperSTTOptions,
| "sampleRate"
| "silenceThreshold"
| "silenceDurationMs"
| "minSpeechMs"
| "maxSegmentMs"
| "vadFilter"
>
> &
Pick<WorkersAIWhisperSTTOptions, "language" | "initialPrompt">;
/**
 * Transcriber backed by the Workers AI `@cf/openai/whisper-large-v3-turbo`
 * model. Holds the resolved configuration and hands out per-call sessions.
 */
export class WorkersAIWhisperSTT implements Transcriber {
  readonly #ai: Ai;
  readonly #options: WhisperConfig;

  constructor(ai: Ai, options: WorkersAIWhisperSTTOptions = {}) {
    this.#ai = ai;
    // Resolve defaults once so sessions receive a fully-populated config.
    const {
      language = "ja",
      sampleRate = 16_000,
      silenceThreshold = 0.015,
      silenceDurationMs = 900,
      minSpeechMs = 250,
      maxSegmentMs = 25_000,
      vadFilter = true,
      initialPrompt,
    } = options;
    this.#options = {
      language,
      sampleRate,
      silenceThreshold,
      silenceDurationMs,
      minSpeechMs,
      maxSegmentMs,
      vadFilter,
      initialPrompt,
    };
  }

  /**
   * Creates a new transcription session. A per-session `language` overrides
   * the provider-level default; all other settings are inherited.
   */
  createSession(options?: TranscriberSessionOptions): TranscriberSession {
    const sessionConfig: WhisperConfig = {
      ...this.#options,
      language: options?.language ?? this.#options.language,
    };
    return new WhisperSession(this.#ai, sessionConfig, options);
  }
}
/**
 * One streaming transcription session.
 *
 * Incoming 16-bit little-endian mono PCM chunks are segmented locally with a
 * simple RMS-based voice-activity detector: audio is buffered while speech is
 * detected and, once enough trailing silence accumulates (or the segment hits
 * the hard size cap), the buffered utterance is sent to Workers AI Whisper.
 * Transcription calls are serialized through a promise queue so utterances
 * are delivered to the callback in feed order.
 */
class WhisperSession implements TranscriberSession {
  readonly #ai: Ai;
  readonly #config: WhisperConfig;
  readonly #onUtterance?: (transcript: string) => void;
  // Set by close(); feed() ignores audio afterwards.
  #closed = false;
  // Rolling buffer of up to ~300 ms of pre-speech audio, prepended to a new
  // segment so the first phonemes of an utterance are not clipped.
  #preSpeechChunks: ArrayBuffer[] = [];
  #preSpeechMs = 0;
  // Audio of the utterance currently being captured (speech + pauses).
  #segmentChunks: ArrayBuffer[] = [];
  #speechMs = 0;
  #silenceMs = 0;
  #segmentMs = 0;
  // Serializes Workers AI calls so transcripts arrive in order.
  #queue = Promise.resolve();

  constructor(ai: Ai, config: WhisperConfig, options?: TranscriberSessionOptions) {
    this.#ai = ai;
    this.#config = config;
    this.#onUtterance = options?.onUtterance;
  }

  /**
   * Feeds one chunk of 16-bit LE mono PCM audio. No-op after close().
   */
  feed(chunk: ArrayBuffer): void {
    if (this.#closed) return;
    // Chunk duration: 2 bytes per 16-bit sample.
    const chunkMs = (chunk.byteLength / 2 / this.#config.sampleRate) * 1000;
    const isSpeech = calculateRms(chunk) >= this.#config.silenceThreshold;
    if (isSpeech) {
      if (this.#segmentChunks.length === 0) {
        // Speech just started: seed the segment with the pre-speech buffer.
        this.#segmentChunks = this.#preSpeechChunks;
        this.#segmentMs = this.#preSpeechMs;
        this.#preSpeechChunks = [];
        this.#preSpeechMs = 0;
      }
      this.#segmentChunks.push(chunk);
      this.#speechMs += chunkMs;
      this.#segmentMs += chunkMs;
      this.#silenceMs = 0;
    } else if (this.#segmentChunks.length > 0) {
      // Silence inside an active segment: keep it (natural pauses belong to
      // the utterance) and count toward the end-of-utterance timeout.
      this.#segmentChunks.push(chunk);
      this.#silenceMs += chunkMs;
      this.#segmentMs += chunkMs;
    } else {
      // Idle: maintain the rolling pre-speech buffer, capped at ~300 ms.
      this.#preSpeechChunks.push(chunk);
      this.#preSpeechMs += chunkMs;
      while (this.#preSpeechMs > 300 && this.#preSpeechChunks.length > 1) {
        const removed = this.#preSpeechChunks.shift();
        if (removed) this.#preSpeechMs -= (removed.byteLength / 2 / this.#config.sampleRate) * 1000;
      }
    }
    // Finalize when a real utterance is followed by enough silence, or when
    // the segment reaches the hard size cap.
    if (
      this.#segmentChunks.length > 0 &&
      ((this.#speechMs >= this.#config.minSpeechMs &&
        this.#silenceMs >= this.#config.silenceDurationMs) ||
        this.#segmentMs >= this.#config.maxSegmentMs)
    ) {
      this.#flush();
    }
  }

  /**
   * Closes the session. Any buffered utterance is flushed and its transcript
   * is still delivered; only audio fed after close() is ignored.
   */
  close(): void {
    if (this.#closed) return;
    this.#closed = true;
    // BUGFIX: #transcribe previously checked #closed before invoking the
    // callback, which silently dropped the final utterance flushed here
    // (the queued transcription always runs after #closed was set). The
    // delivery gate now lives only in feed(); queued transcripts are
    // always emitted.
    this.#flush();
    this.#preSpeechChunks = [];
    this.#preSpeechMs = 0;
  }

  /** Moves the current segment onto the transcription queue and resets state. */
  #flush(): void {
    if (this.#segmentChunks.length === 0) return;
    const chunks = this.#segmentChunks;
    const speechMs = this.#speechMs;
    this.#segmentChunks = [];
    this.#speechMs = 0;
    this.#silenceMs = 0;
    this.#segmentMs = 0;
    // Discard segments that never contained enough speech (noise blips).
    if (speechMs < this.#config.minSpeechMs) return;
    this.#queue = this.#queue
      .then(() => this.#transcribe(chunks))
      .catch((error) => {
        // Log and swallow so one failed request doesn't kill the queue.
        console.error("[WhisperSTT] Transcription error:", error);
      });
  }

  /** Sends one utterance to Workers AI Whisper and emits the transcript. */
  async #transcribe(chunks: ArrayBuffer[]): Promise<void> {
    const wav = encodePcm16Wav(chunks, this.#config.sampleRate);
    const input: WhisperInput = {
      audio: toBase64(wav),
      task: "transcribe",
      language: this.#config.language,
      vad_filter: this.#config.vadFilter,
    };
    if (this.#config.initialPrompt) {
      input.initial_prompt = this.#config.initialPrompt;
    }
    const result = (await this.#ai.run(
      "@cf/openai/whisper-large-v3-turbo",
      input,
    )) as WhisperLargeV3TurboResult;
    const transcript = result.text?.trim();
    // Deliver even after close(): the final flush from close() must not be
    // lost. feed() already guards against new audio on a closed session.
    if (transcript) {
      this.#onUtterance?.(transcript);
    }
  }
}
/**
 * Root-mean-square amplitude of a buffer of little-endian 16-bit PCM
 * samples, normalized to [0, 1]. A buffer with no complete sample yields 0;
 * a trailing odd byte is ignored.
 */
function calculateRms(chunk: ArrayBuffer): number {
  const view = new DataView(chunk);
  const sampleCount = Math.floor(view.byteLength / 2);
  if (sampleCount === 0) return 0;
  let sumOfSquares = 0;
  for (let index = 0; index < sampleCount; index++) {
    const normalized = view.getInt16(index * 2, true) / 32768;
    sumOfSquares += normalized * normalized;
  }
  return Math.sqrt(sumOfSquares / sampleCount);
}
/**
 * Wraps raw little-endian 16-bit mono PCM chunks in a standard 44-byte
 * RIFF/WAVE header and returns the complete WAV file bytes.
 */
function encodePcm16Wav(chunks: ArrayBuffer[], sampleRate: number): Uint8Array {
  let dataLength = 0;
  for (const chunk of chunks) dataLength += chunk.byteLength;

  const wav = new Uint8Array(44 + dataLength);
  const header = new DataView(wav.buffer);
  const ascii = (offset: number, text: string): void => {
    for (let i = 0; i < text.length; i++) wav[offset + i] = text.charCodeAt(i);
  };

  ascii(0, "RIFF");
  header.setUint32(4, 36 + dataLength, true); // RIFF chunk size
  ascii(8, "WAVE");
  ascii(12, "fmt ");
  header.setUint32(16, 16, true); // fmt chunk size
  header.setUint16(20, 1, true); // audio format: PCM
  header.setUint16(22, 1, true); // channels: mono
  header.setUint32(24, sampleRate, true);
  header.setUint32(28, sampleRate * 2, true); // byte rate (mono, 16-bit)
  header.setUint16(32, 2, true); // block align
  header.setUint16(34, 16, true); // bits per sample
  ascii(36, "data");
  header.setUint32(40, dataLength, true);

  let cursor = 44;
  for (const chunk of chunks) {
    wav.set(new Uint8Array(chunk), cursor);
    cursor += chunk.byteLength;
  }
  return wav;
}
/**
 * Copies the UTF-16 code units of `value` (expected to be ASCII) into
 * `target`, one byte per character, starting at `offset`.
 */
function writeAscii(target: Uint8Array, offset: number, value: string): void {
  value.split("").forEach((char, index) => {
    target[offset + index] = char.charCodeAt(0);
  });
}
/**
 * Base64-encodes a byte array. `String.fromCharCode` is applied in 32 KiB
 * slices to stay under the engine's argument-count limit, and the slices are
 * joined once before encoding.
 */
function toBase64(bytes: Uint8Array): string {
  const sliceSize = 0x8000;
  const parts: string[] = [];
  for (let start = 0; start < bytes.length; start += sliceSize) {
    parts.push(String.fromCharCode(...bytes.subarray(start, start + sliceSize)));
  }
  return btoa(parts.join(""));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment