@benkant
Created April 16, 2025 16:22
AudioWorklet processor: capture microphone audio in the browser and stream it to OpenAI's realtime API as base64-encoded 16-bit PCM.
// audio-processor.js
// This AudioWorkletProcessor receives audio input and sends the raw Float32 data to the main thread.
class AudioProcessor extends AudioWorkletProcessor {
  process(inputs, outputs, parameters) {
    // 'inputs' is an array of arrays; assume the first input and first channel (mono input).
    if (inputs.length > 0 && inputs[0].length > 0) {
      const channelData = inputs[0][0]; // This is a Float32Array of audio samples.
      // Copy the data: the engine may reuse this buffer between render quanta.
      const audioChunk = new Float32Array(channelData);
      // Post the audio chunk to the main thread.
      this.port.postMessage(audioChunk);
    }
    // Returning true keeps the processor alive.
    return true;
  }
}
registerProcessor('audio-processor', AudioProcessor);
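
The processor above posts a message for every 128-sample render quantum (about 5 ms at 24 kHz), which can flood the message port. A common refinement is to accumulate samples inside the processor and post larger chunks. A minimal sketch follows; the 2048-sample buffer size and the 'audio-processor-buffered' name are assumptions, not part of the original gist.

// audio-processor-buffered.js (hypothetical variant, not part of the original gist)
class BufferedAudioProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    this.bufferSize = 2048; // assumed chunk size; tune for latency vs. message rate
    this.buffer = new Float32Array(this.bufferSize);
    this.offset = 0;
  }
  process(inputs) {
    const input = inputs.length > 0 && inputs[0].length > 0 ? inputs[0][0] : null;
    if (input) {
      for (let i = 0; i < input.length; i++) {
        this.buffer[this.offset++] = input[i];
        if (this.offset === this.bufferSize) {
          // Post a copy so the processor can keep reusing its own buffer.
          this.port.postMessage(this.buffer.slice());
          this.offset = 0;
        }
      }
    }
    // Returning true keeps the processor alive.
    return true;
  }
}
registerProcessor('audio-processor-buffered', BufferedAudioProcessor);
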
// main.js
// Open a WebSocket connection to OpenAI's realtime endpoint.
const OPENAI_API_KEY = 'sk-XXXXXXXXXXXXXXXXXXXXXXXX'; // Replace with your key.
const MODEL_ID = 'gpt-4o-realtime-preview-2024-10-01';
// Browsers do not allow custom headers on WebSocket connections, so a
// Node-style `{ headers: { Authorization: ... } }` option has no effect here.
// For production, proxy the connection through your own server so the key
// never reaches the client; for local testing, OpenAI's beta docs describe
// passing the key via WebSocket subprotocols as below.
const ws = new WebSocket(`wss://api.openai.com/v1/realtime?model=${MODEL_ID}`, [
  'realtime',
  `openai-insecure-api-key.${OPENAI_API_KEY}`,
  'openai-beta.realtime-v1'
]);
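
Before streaming, the session usually needs to know what audio format to expect. A minimal sketch of configuring it once the socket opens, assuming the session.update event shape from the realtime API beta (field names may differ between API versions):

ws.onopen = () => {
  // Tell the session to expect 16-bit PCM input; 'pcm16' follows the
  // realtime API beta docs and may change in later versions.
  ws.send(JSON.stringify({
    type: 'session.update',
    session: { input_audio_format: 'pcm16' }
  }));
};
ws.onmessage = (event) => {
  // Log server events while developing; real code would dispatch on msg.type.
  const msg = JSON.parse(event.data);
  console.log('Server event:', msg.type);
};
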
// This function sets up microphone capture, connects it to an AudioWorklet, and sends audio data.
async function initAudioWorkletAndMic() {
  try {
    // Request access to the microphone.
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    // Create an AudioContext; sampleRate may be adjusted per API requirements.
    const audioContext = new AudioContext({ sampleRate: 24000 });
    // Load the AudioWorklet module.
    await audioContext.audioWorklet.addModule('audio-processor.js');
    // Create a MediaStreamSource node from the microphone stream.
    const sourceNode = audioContext.createMediaStreamSource(stream);
    // Create an instance of the AudioWorkletNode using our "audio-processor".
    const audioWorkletNode = new AudioWorkletNode(audioContext, 'audio-processor');
    // Optionally, connect to the destination if you want to hear your own audio:
    // audioWorkletNode.connect(audioContext.destination);
    // Listen for messages from the AudioWorklet (each message is a Float32Array chunk).
    audioWorkletNode.port.onmessage = (event) => {
      const float32Array = event.data; // Audio chunk as Float32Array.
      // Convert from 32-bit float (range -1 to 1) to 16-bit PCM.
      const int16Array = new Int16Array(float32Array.length);
      for (let i = 0; i < float32Array.length; i++) {
        // Clamp the value just in case and convert to 16-bit.
        const sample = Math.max(-1, Math.min(1, float32Array[i]));
        int16Array[i] = sample < 0 ? sample * 32768 : sample * 32767;
      }
      // btoa() only accepts characters in the 0-255 range, so view the
      // Int16Array's underlying buffer as individual bytes before encoding.
      const bytes = new Uint8Array(int16Array.buffer);
      let binaryString = "";
      for (let i = 0; i < bytes.length; i++) {
        binaryString += String.fromCharCode(bytes[i]);
      }
      // Base64 encode the binary string (little-endian 16-bit PCM).
      const base64Audio = btoa(binaryString);
      // Package the audio data per OpenAI's expected event structure.
      const audioMessage = {
        type: 'input_audio_buffer.append',
        audio: base64Audio
      };
      // Send the audio message over the WebSocket if it is open.
      if (ws.readyState === WebSocket.OPEN) {
        ws.send(JSON.stringify(audioMessage));
      }
    };
    // Connect the audio source to the worklet node.
    sourceNode.connect(audioWorkletNode);
  } catch (err) {
    console.error('Error initializing audio capture and worklet:', err);
  }
}
// Initialize microphone capture and AudioWorklet.
initAudioWorkletAndMic();
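
The code above appends audio indefinitely. When server-side voice activity detection is not in use, the client must mark the end of an utterance itself. A minimal sketch, assuming the input_audio_buffer.commit and response.create event types from the realtime API beta; wiring endUtterance to a push-to-talk release is left to the application.

// Hypothetical helper: call when the user stops speaking.
function endUtterance() {
  if (ws.readyState !== WebSocket.OPEN) return;
  // Finalize the buffered audio into a user message...
  ws.send(JSON.stringify({ type: 'input_audio_buffer.commit' }));
  // ...then ask the model to generate a response.
  ws.send(JSON.stringify({ type: 'response.create' }));
}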