@laiso
Created October 5, 2024 13:44
iOS Voice Interaction PoC using OpenAI Realtime API


Key Features

  1. Real-time voice recording (using AVAudioEngine)
  2. WebSocket connection to the OpenAI Realtime API (the event types used are summarized in the sketch after this list)
  3. Real-time transmission of voice data (Base64 encoded)
  4. Processing text/voice responses from the API
  5. Audio playback (recorded voice and API response voice)
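
As a quick reference for the WebSocket traffic, these are the Realtime API event types the code below sends and handles (a minimal sketch; the enum is illustrative and not part of the PoC, but the string values are taken directly from the implementation):

enum RealtimeEventType: String {
    case appendAudio = "input_audio_buffer.append"   // client → server: Base64 audio chunk
    case commitAudio = "input_audio_buffer.commit"   // client → server: finalize the buffered input
    case createResponse = "response.create"          // client → server: request a model response
    case audioDelta = "response.audio.delta"         // server → client: Base64 chunk of the reply audio
    case audioDone = "response.audio.done"           // server → client: reply audio is complete
}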

Main Implementations

  • Audio session configuration
  • WebSocket connection management
  • Voice data conversion (PCM to Base64)
  • Response processing (JSON parsing, voice decoding)
  • Recording control
  • Audio file operations

Important Notes

  • API keys must be managed securely (never hardcode them into the app bundle)
  • Voice quality must be balanced against latency
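
One prerequisite the notes above omit: recording requires an NSMicrophoneUsageDescription entry in Info.plist, and it is good practice to request microphone permission before starting the audio engine. A minimal sketch (this helper is illustrative and not part of the PoC code below):

import AVFoundation

func requestMicrophonePermission(_ completion: @escaping (Bool) -> Void) {
    // Prompts the user on first call; later calls return the stored decision.
    AVAudioSession.sharedInstance().requestRecordPermission { granted in
        DispatchQueue.main.async { completion(granted) }
    }
}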

ContentView.swift

import SwiftUI
import AVFoundation

struct ContentView: View {
    @State private var isRecording = false
    @State private var transcription = ""
    @State private var recordedAudioURL: URL?
    @State private var responseAudioURL: URL?
    @State private var isConnected = false
    @State private var accumulatedAudioData = Data()
    @State private var webSocketTask: URLSessionWebSocketTask?
    private let audioEngine = AVAudioEngine()
    var body: some View {
        VStack {
            Image(systemName: isRecording ? "mic.fill" : "mic")
                .imageScale(.large)
                .foregroundStyle(.tint)
            Text(isRecording ? "Recording..." : "Start Recording")
            Button(action: toggleRecording) {
                Text(isRecording ? "Stop" : "Start")
                    .padding()
                    .background(isRecording ? Color.red : Color.blue)
                    .foregroundColor(.white)
                    .cornerRadius(10)
            }
            Button(action: commitAndCreateResponse) {
                Text("Send")
                    .padding()
                    .background(Color.green)
                    .foregroundColor(.white)
                    .cornerRadius(10)
            }
            Text(transcription)
                .padding()
            Button(action: playRecordedAudio) {
                Text("Play")
                    .padding()
                    .background(Color.orange)
                    .foregroundColor(.white)
                    .cornerRadius(10)
            }
            .disabled(recordedAudioURL == nil)
            Button(action: playResponseAudio) {
                Text("Play Response")
                    .padding()
                    .background(Color.purple)
                    .foregroundColor(.white)
                    .cornerRadius(10)
            }
            .disabled(responseAudioURL == nil)
        }
        .padding()
        .onAppear {
            setupAudioSession()
            setupWebSocket()
        }
    }
    private func setupAudioSession() {
        do {
            let audioSession = AVAudioSession.sharedInstance()
            try audioSession.setCategory(.playAndRecord, mode: .default, options: [.defaultToSpeaker, .mixWithOthers])
            try audioSession.setActive(true)
        } catch {
            print("Audio session error: \(error)")
        }
    }
    private func setupWebSocket() {
        guard let url = URL(string: "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01") else { return }
        var request = URLRequest(url: url)
        // Look up the API key in Info.plist first, then fall back to the process environment.
        if let apiKey = Bundle.main.object(forInfoDictionaryKey: "OPENAI_API_KEY") as? String
            ?? ProcessInfo.processInfo.environment["OPENAI_API_KEY"] {
            request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization")
        } else {
            print("Error: OPENAI_API_KEY not found in Info.plist or environment variables")
            return
        }
        request.setValue("realtime=v1", forHTTPHeaderField: "OpenAI-Beta")
        webSocketTask = URLSession.shared.webSocketTask(with: request)
        receiveMessage()
        webSocketTask?.resume()
        isConnected = true // Optimistic: resume() does not guarantee the handshake succeeded.
    }
    private func receiveMessage() {
        webSocketTask?.receive { result in
            switch result {
            case .success(let message):
                switch message {
                case .string(let text):
                    self.handleReceivedText(text)
                case .data:
                    print("Received data message")
                @unknown default:
                    break
                }
                // Re-arm the handler: receive(completionHandler:) delivers only one message.
                self.receiveMessage()
            case .failure(let error):
                print("WebSocket error: \(error)")
            }
        }
    }
    private func handleReceivedText(_ text: String) {
        if let data = text.data(using: .utf8),
           let json = try? JSONSerialization.jsonObject(with: data, options: []) as? [String: Any] {
            if let item = json["item"] as? [String: Any],
               let content = item["content"] as? [[String: Any]],
               let firstContent = content.first,
               let transcript = firstContent["transcript"] as? String {
                DispatchQueue.main.async {
                    self.transcription = transcript
                }
            } else if let type = json["type"] as? String {
                switch type {
                case "response.audio.delta":
                    if let delta = json["delta"] as? String {
                        handleAudioDelta(delta)
                    }
                case "response.audio.done":
                    convertAccumulatedAudioToFile()
                default:
                    print("handleReceivedText:other_types:\(type)")
                }
            }
        }
    }
    private func handleAudioDelta(_ base64Audio: String) {
        guard let audioData = Data(base64Encoded: base64Audio) else {
            print("Failed to decode Base64 audio data")
            return
        }
        accumulatedAudioData.append(audioData)
    }
    private func convertAccumulatedAudioToFile() {
        // The Realtime API streams response audio as 16-bit PCM at 24 kHz, mono;
        // write it out as a Float32 WAV file for playback.
        let sampleRate = 24000.0
        let format = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: sampleRate, channels: 1, interleaved: false)!
        let documentsPath = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
        let outputURL = documentsPath.appendingPathComponent("responseAudio.wav")
        do {
            let audioFile = try AVAudioFile(forWriting: outputURL, settings: format.settings)
            // Each PCM16 sample is 2 bytes, so the frame count is the byte count / 2.
            let frameCount = UInt32(accumulatedAudioData.count) / UInt32(MemoryLayout<Int16>.size)
            let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount)!
            buffer.frameLength = frameCount
            accumulatedAudioData.withUnsafeBytes { rawBufferPointer in
                let int16BufferPointer = rawBufferPointer.bindMemory(to: Int16.self)
                for i in 0..<Int(frameCount) {
                    // Normalize Int16 samples into the Float32 range [-1.0, 1.0].
                    let sample = Float(int16BufferPointer[i]) / Float(Int16.max)
                    buffer.floatChannelData?[0][i] = sample
                }
            }
            try audioFile.write(from: buffer)
            DispatchQueue.main.async {
                self.responseAudioURL = outputURL
            }
        } catch {
            print("Failed to save response audio: \(error)")
        }
        accumulatedAudioData = Data()
    }
    private func toggleRecording() {
        if isRecording {
            stopRecording()
        } else {
            setupWebSocket()
            startRecording()
        }
    }
    private func startRecording() {
        let inputNode = audioEngine.inputNode
        let recordingFormat = inputNode.outputFormat(forBus: 0)
        let documentsPath = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
        recordedAudioURL = documentsPath.appendingPathComponent("recordedAudio.wav")
        let file = try! AVAudioFile(forWriting: recordedAudioURL!, settings: recordingFormat.settings)
        // Tap the microphone: persist each buffer locally and stream it to the API.
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { buffer, _ in
            try? file.write(from: buffer)
            self.sendAudioData(buffer)
        }
        audioEngine.prepare()
        do {
            try audioEngine.start()
            isRecording = true
        } catch {
            print("Failed to start audio engine: \(error)")
        }
    }
    private func stopRecording() {
        audioEngine.stop()
        audioEngine.inputNode.removeTap(onBus: 0)
        isRecording = false
        webSocketTask?.cancel(with: .goingAway, reason: nil)
        isConnected = false
    }
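    // NOTE: By default the Realtime API expects input audio as 16-bit PCM at 24 kHz,
    // mono, while pcmBufferToBase64(pcmBuffer:) below serializes the engine's native
    // Float32 buffers as-is. A production implementation would convert and resample
    // the buffers first (for example with AVAudioConverter) before Base64-encoding.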
    private func pcmBufferToBase64(pcmBuffer: AVAudioPCMBuffer) -> String? {
        guard let channelData = pcmBuffer.floatChannelData else {
            return nil
        }
        let frameLength = Int(pcmBuffer.frameLength)
        // Mono capture is assumed: serialize the first channel's Float32 samples.
        let dataSize = frameLength * MemoryLayout<Float>.size
        // Create a Data object from the PCM sample data
        let data = Data(bytes: channelData[0], count: dataSize)
        // Convert the Data to a Base64 string
        return data.base64EncodedString()
    }
    private func sendAudioData(_ buffer: AVAudioPCMBuffer) {
        guard isConnected else {
            print("WebSocket is not connected. Cannot send audio data.")
            return
        }
        guard let base64Audio = self.pcmBufferToBase64(pcmBuffer: buffer) else {
            print("Failed to convert PCM buffer to Base64 string.")
            return
        }
        let event: [String: Any] = [
            "type": "input_audio_buffer.append",
            "audio": base64Audio
        ]
        if let jsonData = try? JSONSerialization.data(withJSONObject: event),
           let jsonString = String(data: jsonData, encoding: .utf8) {
            webSocketTask?.send(.string(jsonString)) { error in
                if let error = error {
                    print("Error sending message: \(error)")
                }
            }
        }
    }
    private func playRecordedAudio() {
        if isRecording {
            stopRecording()
        }
        guard let audioURL = recordedAudioURL else {
            print("No recorded audio URL found.")
            return
        }
        do {
            let audioFile = try AVAudioFile(forReading: audioURL)
            let playerNode = AVAudioPlayerNode()
            audioEngine.attach(playerNode)
            let mainMixerNode = audioEngine.mainMixerNode
            audioEngine.connect(playerNode, to: mainMixerNode, format: audioFile.processingFormat)
            playerNode.scheduleFile(audioFile, at: nil)
            if !audioEngine.isRunning {
                try audioEngine.start()
            }
            playerNode.play()
        } catch {
            print("Failed to play recorded audio: \(error)")
        }
    }
    private func playResponseAudio() {
        if isRecording {
            stopRecording()
        }
        guard let audioURL = responseAudioURL else {
            print("No response audio URL found.")
            return
        }
        do {
            let audioFile = try AVAudioFile(forReading: audioURL)
            let mainMixerNode = audioEngine.mainMixerNode
            let outputFormat = mainMixerNode.outputFormat(forBus: 0)
            let playerNode = AVAudioPlayerNode()
            audioEngine.attach(playerNode)
            audioEngine.connect(playerNode, to: mainMixerNode, format: outputFormat)
            playerNode.scheduleFile(audioFile, at: nil)
            if !audioEngine.isRunning {
                try audioEngine.start()
            }
            playerNode.play()
        } catch {
            print("Failed to play response audio: \(error)")
        }
    }
    private func commitAndCreateResponse() {
        guard isConnected else {
            print("WebSocket is not connected. Cannot commit and create response.")
            return
        }
        let instructions = "Please assist the user."
        let commitEvent: [String: Any] = [
            "type": "input_audio_buffer.commit"
        ]
        let responseEvent: [String: Any] = [
            "type": "response.create",
            "response": [
                "modalities": ["audio", "text"],
                "instructions": instructions
            ]
        ]
        [commitEvent, responseEvent].forEach { event in
            if let jsonData = try? JSONSerialization.data(withJSONObject: event),
               let jsonString = String(data: jsonData, encoding: .utf8) {
                webSocketTask?.send(.string(jsonString)) { error in
                    if let error = error {
                        print("Error sending message: \(error)")
                    }
                }
            }
        }
    }
}
#Preview {
    ContentView()
}
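
To run the PoC, provide the key that setupWebSocket() looks for: either an OPENAI_API_KEY entry in the app's Info.plist or, when launching from Xcode, an OPENAI_API_KEY environment variable set in the scheme's run arguments.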