Created
April 26, 2026 08:54
-
-
Save chimame/de7c0b6d7bcbc72af3e23652f9308ab2 to your computer and use it in GitHub Desktop.
Workers AI Whisper STT provider for @cloudflare/voice
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import type { Transcriber, TranscriberSession, TranscriberSessionOptions } from "@cloudflare/voice"; | |
/**
 * Tuning options for {@link WorkersAIWhisperSTT}. All fields are optional;
 * the provider applies the documented defaults in its constructor.
 */
export interface WorkersAIWhisperSTTOptions {
  /** Language code passed to Whisper. Use "ja" for Japanese. @default "ja" */
  language?: string;
  /** Sample rate of the fed PCM audio, in Hz. @default 16000 */
  sampleRate?: number;
  /** RMS threshold (on samples normalized to [-1, 1]) for local speech detection. @default 0.015 */
  silenceThreshold?: number;
  /** Silence duration, in ms, that finalizes one utterance. @default 900 */
  silenceDurationMs?: number;
  /** Ignore very short detected sounds shorter than this, in ms. @default 250 */
  minSpeechMs?: number;
  /** Force a transcription before the buffered utterance grows beyond this, in ms. @default 25000 */
  maxSegmentMs?: number;
  /** Preprocess audio with Workers AI VAD (`vad_filter` model input). @default true */
  vadFilter?: boolean;
  /** Optional prompt to help Whisper with domain-specific context. */
  initialPrompt?: string;
}
/** Subset of the `@cf/openai/whisper-large-v3-turbo` response this provider reads. */
type WhisperLargeV3TurboResult = {
  // Full transcript text; treated as absent/empty-safe by the caller.
  text?: string;
};
/** Request payload sent to `@cf/openai/whisper-large-v3-turbo` via `ai.run`. */
type WhisperInput = {
  // Base64-encoded WAV bytes (see encodePcm16Wav / toBase64).
  audio: string;
  task: "transcribe";
  language?: string;
  // Model-side voice-activity-detection preprocessing.
  vad_filter?: boolean;
  // Domain-context prompt; only set when configured (see #transcribe).
  initial_prompt?: string;
};
| type WhisperConfig = Required< | |
| Pick< | |
| WorkersAIWhisperSTTOptions, | |
| | "sampleRate" | |
| | "silenceThreshold" | |
| | "silenceDurationMs" | |
| | "minSpeechMs" | |
| | "maxSegmentMs" | |
| | "vadFilter" | |
| > | |
| > & | |
| Pick<WorkersAIWhisperSTTOptions, "language" | "initialPrompt">; | |
| export class WorkersAIWhisperSTT implements Transcriber { | |
| readonly #ai: Ai; | |
| readonly #options: WhisperConfig; | |
| constructor(ai: Ai, options: WorkersAIWhisperSTTOptions = {}) { | |
| this.#ai = ai; | |
| this.#options = { | |
| language: options.language ?? "ja", | |
| sampleRate: options.sampleRate ?? 16_000, | |
| silenceThreshold: options.silenceThreshold ?? 0.015, | |
| silenceDurationMs: options.silenceDurationMs ?? 900, | |
| minSpeechMs: options.minSpeechMs ?? 250, | |
| maxSegmentMs: options.maxSegmentMs ?? 25_000, | |
| vadFilter: options.vadFilter ?? true, | |
| initialPrompt: options.initialPrompt, | |
| }; | |
| } | |
| createSession(options?: TranscriberSessionOptions): TranscriberSession { | |
| return new WhisperSession( | |
| this.#ai, | |
| { | |
| ...this.#options, | |
| language: options?.language ?? this.#options.language, | |
| }, | |
| options, | |
| ); | |
| } | |
| } | |
| class WhisperSession implements TranscriberSession { | |
| readonly #ai: Ai; | |
| readonly #config: WhisperConfig; | |
| readonly #onUtterance?: (transcript: string) => void; | |
| #closed = false; | |
| #preSpeechChunks: ArrayBuffer[] = []; | |
| #preSpeechMs = 0; | |
| #segmentChunks: ArrayBuffer[] = []; | |
| #speechMs = 0; | |
| #silenceMs = 0; | |
| #segmentMs = 0; | |
| #queue = Promise.resolve(); | |
| constructor(ai: Ai, config: WhisperConfig, options?: TranscriberSessionOptions) { | |
| this.#ai = ai; | |
| this.#config = config; | |
| this.#onUtterance = options?.onUtterance; | |
| } | |
| feed(chunk: ArrayBuffer): void { | |
| if (this.#closed) return; | |
| const chunkMs = (chunk.byteLength / 2 / this.#config.sampleRate) * 1000; | |
| const isSpeech = calculateRms(chunk) >= this.#config.silenceThreshold; | |
| if (isSpeech) { | |
| if (this.#segmentChunks.length === 0) { | |
| this.#segmentChunks = this.#preSpeechChunks; | |
| this.#segmentMs = this.#preSpeechMs; | |
| this.#preSpeechChunks = []; | |
| this.#preSpeechMs = 0; | |
| } | |
| this.#segmentChunks.push(chunk); | |
| this.#speechMs += chunkMs; | |
| this.#segmentMs += chunkMs; | |
| this.#silenceMs = 0; | |
| } else if (this.#segmentChunks.length > 0) { | |
| this.#segmentChunks.push(chunk); | |
| this.#silenceMs += chunkMs; | |
| this.#segmentMs += chunkMs; | |
| } else { | |
| this.#preSpeechChunks.push(chunk); | |
| this.#preSpeechMs += chunkMs; | |
| while (this.#preSpeechMs > 300 && this.#preSpeechChunks.length > 1) { | |
| const removed = this.#preSpeechChunks.shift(); | |
| if (removed) this.#preSpeechMs -= (removed.byteLength / 2 / this.#config.sampleRate) * 1000; | |
| } | |
| } | |
| if ( | |
| this.#segmentChunks.length > 0 && | |
| ((this.#speechMs >= this.#config.minSpeechMs && | |
| this.#silenceMs >= this.#config.silenceDurationMs) || | |
| this.#segmentMs >= this.#config.maxSegmentMs) | |
| ) { | |
| this.#flush(); | |
| } | |
| } | |
| close(): void { | |
| if (this.#closed) return; | |
| this.#flush(); | |
| this.#closed = true; | |
| this.#preSpeechChunks = []; | |
| } | |
| #flush(): void { | |
| if (this.#segmentChunks.length === 0) return; | |
| const chunks = this.#segmentChunks; | |
| const speechMs = this.#speechMs; | |
| this.#segmentChunks = []; | |
| this.#speechMs = 0; | |
| this.#silenceMs = 0; | |
| this.#segmentMs = 0; | |
| if (speechMs < this.#config.minSpeechMs) return; | |
| this.#queue = this.#queue | |
| .then(() => this.#transcribe(chunks)) | |
| .catch((error) => { | |
| console.error("[WhisperSTT] Transcription error:", error); | |
| }); | |
| } | |
| async #transcribe(chunks: ArrayBuffer[]): Promise<void> { | |
| const wav = encodePcm16Wav(chunks, this.#config.sampleRate); | |
| const input: WhisperInput = { | |
| audio: toBase64(wav), | |
| task: "transcribe", | |
| language: this.#config.language, | |
| vad_filter: this.#config.vadFilter, | |
| }; | |
| if (this.#config.initialPrompt) { | |
| input.initial_prompt = this.#config.initialPrompt; | |
| } | |
| const result = (await this.#ai.run( | |
| "@cf/openai/whisper-large-v3-turbo", | |
| input, | |
| )) as WhisperLargeV3TurboResult; | |
| const transcript = result.text?.trim(); | |
| if (transcript && !this.#closed) { | |
| this.#onUtterance?.(transcript); | |
| } | |
| } | |
| } | |
| function calculateRms(chunk: ArrayBuffer): number { | |
| const view = new DataView(chunk); | |
| let sum = 0; | |
| let samples = 0; | |
| for (let offset = 0; offset + 1 < view.byteLength; offset += 2) { | |
| const sample = view.getInt16(offset, true) / 32768; | |
| sum += sample * sample; | |
| samples++; | |
| } | |
| return samples === 0 ? 0 : Math.sqrt(sum / samples); | |
| } | |
| function encodePcm16Wav(chunks: ArrayBuffer[], sampleRate: number): Uint8Array { | |
| const pcmByteLength = chunks.reduce((total, chunk) => total + chunk.byteLength, 0); | |
| const wav = new Uint8Array(44 + pcmByteLength); | |
| const view = new DataView(wav.buffer); | |
| writeAscii(wav, 0, "RIFF"); | |
| view.setUint32(4, 36 + pcmByteLength, true); | |
| writeAscii(wav, 8, "WAVE"); | |
| writeAscii(wav, 12, "fmt "); | |
| view.setUint32(16, 16, true); | |
| view.setUint16(20, 1, true); | |
| view.setUint16(22, 1, true); | |
| view.setUint32(24, sampleRate, true); | |
| view.setUint32(28, sampleRate * 2, true); | |
| view.setUint16(32, 2, true); | |
| view.setUint16(34, 16, true); | |
| writeAscii(wav, 36, "data"); | |
| view.setUint32(40, pcmByteLength, true); | |
| let offset = 44; | |
| for (const chunk of chunks) { | |
| wav.set(new Uint8Array(chunk), offset); | |
| offset += chunk.byteLength; | |
| } | |
| return wav; | |
| } | |
| function writeAscii(target: Uint8Array, offset: number, value: string): void { | |
| for (let index = 0; index < value.length; index++) { | |
| target[offset + index] = value.charCodeAt(index); | |
| } | |
| } | |
| function toBase64(bytes: Uint8Array): string { | |
| let binary = ""; | |
| for (let offset = 0; offset < bytes.length; offset += 0x8000) { | |
| binary += String.fromCharCode(...bytes.subarray(offset, offset + 0x8000)); | |
| } | |
| return btoa(binary); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment