Created
April 26, 2026 08:54
-
-
Save chimame/de7c0b6d7bcbc72af3e23652f9308ab2 to your computer and use it in GitHub Desktop.
Workers AI Whisper STT provider for @cloudflare/voice
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import type { Transcriber, TranscriberSession, TranscriberSessionOptions } from "@cloudflare/voice"; | |
/**
 * Tuning options for {@link WorkersAIWhisperSTT}. All fields are optional;
 * the provider applies the documented defaults in its constructor.
 */
export interface WorkersAIWhisperSTTOptions {
  /** Language code passed to Whisper. Use "ja" for Japanese. @default "ja" */
  language?: string;
  /** Sample rate of the fed PCM audio, in Hz. @default 16000 */
  sampleRate?: number;
  /** RMS threshold (on samples normalized to [-1, 1]) for local speech detection. @default 0.015 */
  silenceThreshold?: number;
  /** Silence duration, in ms, that finalizes one utterance. @default 900 */
  silenceDurationMs?: number;
  /** Ignore very short detected sounds shorter than this, in ms. @default 250 */
  minSpeechMs?: number;
  /** Force a transcription before the buffered utterance grows beyond this, in ms. @default 25000 */
  maxSegmentMs?: number;
  /** Preprocess audio with Workers AI VAD (`vad_filter` model input). @default true */
  vadFilter?: boolean;
  /** Optional prompt to help Whisper with domain-specific context. */
  initialPrompt?: string;
}
/** Subset of the `@cf/openai/whisper-large-v3-turbo` response this provider reads. */
type WhisperLargeV3TurboResult = {
  // Full transcript text; treated as absent/empty-safe by the caller.
  text?: string;
};
/** Request payload sent to `@cf/openai/whisper-large-v3-turbo` via `ai.run`. */
type WhisperInput = {
  // Base64-encoded WAV bytes (see encodePcm16Wav / toBase64).
  audio: string;
  task: "transcribe";
  language?: string;
  // Model-side voice-activity-detection preprocessing.
  vad_filter?: boolean;
  // Domain-context prompt; only set when configured (see #transcribe).
  initial_prompt?: string;
};
| type WhisperConfig = Required< | |
| Pick< | |
| WorkersAIWhisperSTTOptions, | |
| | "sampleRate" | |
| | "silenceThreshold" | |
| | "silenceDurationMs" | |
| | "minSpeechMs" | |
| | "maxSegmentMs" | |
| | "vadFilter" | |
| > | |
| > & | |
| Pick<WorkersAIWhisperSTTOptions, "language" | "initialPrompt">; | |
| export class WorkersAIWhisperSTT implements Transcriber { | |
| readonly #ai: Ai; | |
| readonly #options: WhisperConfig; | |
| constructor(ai: Ai, options: WorkersAIWhisperSTTOptions = {}) { | |
| this.#ai = ai; | |
| this.#options = { | |
| language: options.language ?? "ja", | |
| sampleRate: options.sampleRate ?? 16_000, | |
| silenceThreshold: options.silenceThreshold ?? 0.015, | |
| silenceDurationMs: options.silenceDurationMs ?? 900, | |
| minSpeechMs: options.minSpeechMs ?? 250, | |
| maxSegmentMs: options.maxSegmentMs ?? 25_000, | |
| vadFilter: options.vadFilter ?? true, | |
| initialPrompt: options.initialPrompt, | |
| }; | |
| } | |
| createSession(options?: TranscriberSessionOptions): TranscriberSession { | |
| return new WhisperSession( | |
| this.#ai, | |
| { | |
| ...this.#options, | |
| language: options?.language ?? this.#options.language, | |
| }, | |
| options, | |
| ); | |
| } | |
| } | |
| class WhisperSession implements TranscriberSession { | |
| readonly #ai: Ai; | |
| readonly #config: WhisperConfig; | |
| readonly #onUtterance?: (transcript: string) => void; | |
| #closed = false; | |
| #preSpeechChunks: ArrayBuffer[] = []; | |
| #preSpeechMs = 0; | |
| #segmentChunks: ArrayBuffer[] = []; | |
| #speechMs = 0; | |
| #silenceMs = 0; | |
| #segmentMs = 0; | |
| #queue = Promise.resolve(); | |
| constructor(ai: Ai, config: WhisperConfig, options?: TranscriberSessionOptions) { | |
| this.#ai = ai; | |
| this.#config = config; | |
| this.#onUtterance = options?.onUtterance; | |
| } | |
| feed(chunk: ArrayBuffer): void { | |
| if (this.#closed) return; | |
| const chunkMs = (chunk.byteLength / 2 / this.#config.sampleRate) * 1000; | |
| const isSpeech = calculateRms(chunk) >= this.#config.silenceThreshold; | |
| if (isSpeech) { | |
| if (this.#segmentChunks.length === 0) { | |
| this.#segmentChunks = this.#preSpeechChunks; | |
| this.#segmentMs = this.#preSpeechMs; | |
| this.#preSpeechChunks = []; | |
| this.#preSpeechMs = 0; | |
| } | |
| this.#segmentChunks.push(chunk); | |
| this.#speechMs += chunkMs; | |
| this.#segmentMs += chunkMs; | |
| this.#silenceMs = 0; | |
| } else if (this.#segmentChunks.length > 0) { | |
| this.#segmentChunks.push(chunk); | |
| this.#silenceMs += chunkMs; | |
| this.#segmentMs += chunkMs; | |
| } else { | |
| this.#preSpeechChunks.push(chunk); | |
| this.#preSpeechMs += chunkMs; | |
| while (this.#preSpeechMs > 300 && this.#preSpeechChunks.length > 1) { | |
| const removed = this.#preSpeechChunks.shift(); | |
| if (removed) this.#preSpeechMs -= (removed.byteLength / 2 / this.#config.sampleRate) * 1000; | |
| } | |
| } | |
| if ( | |
| this.#segmentChunks.length > 0 && | |
| ((this.#speechMs >= this.#config.minSpeechMs && | |
| this.#silenceMs >= this.#config.silenceDurationMs) || | |
| this.#segmentMs >= this.#config.maxSegmentMs) | |
| ) { | |
| this.#flush(); | |
| } | |
| } | |
| close(): void { | |
| if (this.#closed) return; | |
| this.#flush(); | |
| this.#closed = true; | |
| this.#preSpeechChunks = []; | |
| } | |
| #flush(): void { | |
| if (this.#segmentChunks.length === 0) return; | |
| const chunks = this.#segmentChunks; | |
| const speechMs = this.#speechMs; | |
| this.#segmentChunks = []; | |
| this.#speechMs = 0; | |
| this.#silenceMs = 0; | |
| this.#segmentMs = 0; | |
| if (speechMs < this.#config.minSpeechMs) return; | |
| this.#queue = this.#queue | |
| .then(() => this.#transcribe(chunks)) | |
| .catch((error) => { | |
| console.error("[WhisperSTT] Transcription error:", error); | |
| }); | |
| } | |
| async #transcribe(chunks: ArrayBuffer[]): Promise<void> { | |
| const wav = encodePcm16Wav(chunks, this.#config.sampleRate); | |
| const input: WhisperInput = { | |
| audio: toBase64(wav), | |
| task: "transcribe", | |
| language: this.#config.language, | |
| vad_filter: this.#config.vadFilter, | |
| }; | |
| if (this.#config.initialPrompt) { | |
| input.initial_prompt = this.#config.initialPrompt; | |
| } | |
| const result = (await this.#ai.run( | |
| "@cf/openai/whisper-large-v3-turbo", | |
| input, | |
| )) as WhisperLargeV3TurboResult; | |
| const transcript = result.text?.trim(); | |
| if (transcript && !this.#closed) { | |
| this.#onUtterance?.(transcript); | |
| } | |
| } | |
| } | |
| function calculateRms(chunk: ArrayBuffer): number { | |
| const view = new DataView(chunk); | |
| let sum = 0; | |
| let samples = 0; | |
| for (let offset = 0; offset + 1 < view.byteLength; offset += 2) { | |
| const sample = view.getInt16(offset, true) / 32768; | |
| sum += sample * sample; | |
| samples++; | |
| } | |
| return samples === 0 ? 0 : Math.sqrt(sum / samples); | |
| } | |
| function encodePcm16Wav(chunks: ArrayBuffer[], sampleRate: number): Uint8Array { | |
| const pcmByteLength = chunks.reduce((total, chunk) => total + chunk.byteLength, 0); | |
| const wav = new Uint8Array(44 + pcmByteLength); | |
| const view = new DataView(wav.buffer); | |
| writeAscii(wav, 0, "RIFF"); | |
| view.setUint32(4, 36 + pcmByteLength, true); | |
| writeAscii(wav, 8, "WAVE"); | |
| writeAscii(wav, 12, "fmt "); | |
| view.setUint32(16, 16, true); | |
| view.setUint16(20, 1, true); | |
| view.setUint16(22, 1, true); | |
| view.setUint32(24, sampleRate, true); | |
| view.setUint32(28, sampleRate * 2, true); | |
| view.setUint16(32, 2, true); | |
| view.setUint16(34, 16, true); | |
| writeAscii(wav, 36, "data"); | |
| view.setUint32(40, pcmByteLength, true); | |
| let offset = 44; | |
| for (const chunk of chunks) { | |
| wav.set(new Uint8Array(chunk), offset); | |
| offset += chunk.byteLength; | |
| } | |
| return wav; | |
| } | |
| function writeAscii(target: Uint8Array, offset: number, value: string): void { | |
| for (let index = 0; index < value.length; index++) { | |
| target[offset + index] = value.charCodeAt(index); | |
| } | |
| } | |
| function toBase64(bytes: Uint8Array): string { | |
| let binary = ""; | |
| for (let offset = 0; offset < bytes.length; offset += 0x8000) { | |
| binary += String.fromCharCode(...bytes.subarray(offset, offset + 0x8000)); | |
| } | |
| return btoa(binary); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment