@laiso
Created October 5, 2024 13:44
iOS Voice Interaction PoC using OpenAI Realtime API


Key Features

  1. Real-time voice recording (using AVAudioEngine)
  2. WebSocket connection to the OpenAI Realtime API (the event types used are summarized in the sketch after this list)
  3. Real-time transmission of voice data (Base64 encoded)
  4. Processing text/voice responses from the API
  5. Audio playback (recorded voice and API response voice)
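
As a quick reference for the WebSocket traffic, these are the Realtime API event types the code below sends and handles (a minimal sketch; the enum is illustrative and not part of the PoC, but the string values are taken directly from the implementation):

enum RealtimeEventType: String {
    case appendAudio = "input_audio_buffer.append"   // client → server: Base64 audio chunk
    case commitAudio = "input_audio_buffer.commit"   // client → server: finalize the buffered input
    case createResponse = "response.create"          // client → server: request a model response
    case audioDelta = "response.audio.delta"         // server → client: Base64 chunk of the reply audio
    case audioDone = "response.audio.done"           // server → client: reply audio is complete
}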

Main Implementations

  • Audio session configuration
  • WebSocket connection management
  • Voice data conversion (PCM to Base64)
  • Response processing (JSON parsing, voice decoding)
  • Recording control
  • Audio file operations

Important Notes

  • API keys must be managed securely (never hardcode them into the app bundle)
  • Voice quality must be balanced against latency
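
One prerequisite the notes above omit: recording requires an NSMicrophoneUsageDescription entry in Info.plist, and it is good practice to request microphone permission before starting the audio engine. A minimal sketch (this helper is illustrative and not part of the PoC code below):

import AVFoundation

func requestMicrophonePermission(_ completion: @escaping (Bool) -> Void) {
    // Prompts the user on first call; later calls return the stored decision.
    AVAudioSession.sharedInstance().requestRecordPermission { granted in
        DispatchQueue.main.async { completion(granted) }
    }
}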

ContentView.swift

import SwiftUI
import AVFoundation

struct ContentView: View {
    @State private var isRecording = false
    @State private var transcription = ""
    @State private var recordedAudioURL: URL?
    @State private var responseAudioURL: URL?
    @State private var isConnected = false
    @State private var accumulatedAudioData = Data()
    @State private var webSocketTask: URLSessionWebSocketTask?
    private let audioEngine = AVAudioEngine()
    var body: some View {
        VStack {
            Image(systemName: isRecording ? "mic.fill" : "mic")
                .imageScale(.large)
                .foregroundStyle(.tint)
            Text(isRecording ? "Recording..." : "Start Recording")
            Button(action: toggleRecording) {
                Text(isRecording ? "Stop" : "Start")
                    .padding()
                    .background(isRecording ? Color.red : Color.blue)
                    .foregroundColor(.white)
                    .cornerRadius(10)
            }
            Button(action: commitAndCreateResponse) {
                Text("Send")
                    .padding()
                    .background(Color.green)
                    .foregroundColor(.white)
                    .cornerRadius(10)
            }
            Text(transcription)
                .padding()
            Button(action: playRecordedAudio) {
                Text("Play")
                    .padding()
                    .background(Color.orange)
                    .foregroundColor(.white)
                    .cornerRadius(10)
            }
            .disabled(recordedAudioURL == nil)
            Button(action: playResponseAudio) {
                Text("Play Response")
                    .padding()
                    .background(Color.purple)
                    .foregroundColor(.white)
                    .cornerRadius(10)
            }
            .disabled(responseAudioURL == nil)
        }
        .padding()
        .onAppear {
            setupAudioSession()
            setupWebSocket()
        }
    }
    private func setupAudioSession() {
        do {
            let audioSession = AVAudioSession.sharedInstance()
            try audioSession.setCategory(.playAndRecord, mode: .default, options: [.defaultToSpeaker, .mixWithOthers])
            try audioSession.setActive(true)
        } catch {
            print("Audio session error: \(error)")
        }
    }
    private func setupWebSocket() {
        guard let url = URL(string: "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01") else { return }
        var request = URLRequest(url: url)
        // Look up the API key in Info.plist first, then fall back to the process environment.
        if let apiKey = Bundle.main.object(forInfoDictionaryKey: "OPENAI_API_KEY") as? String
            ?? ProcessInfo.processInfo.environment["OPENAI_API_KEY"] {
            request.setValue("Bearer \(apiKey)", forHTTPHeaderField: "Authorization")
        } else {
            print("Error: OPENAI_API_KEY not found in Info.plist or environment variables")
            return
        }
        request.setValue("realtime=v1", forHTTPHeaderField: "OpenAI-Beta")
        webSocketTask = URLSession.shared.webSocketTask(with: request)
        receiveMessage()
        webSocketTask?.resume()
        isConnected = true // Optimistic: resume() does not guarantee the handshake succeeded.
    }
    private func receiveMessage() {
        webSocketTask?.receive { result in
            switch result {
            case .success(let message):
                switch message {
                case .string(let text):
                    self.handleReceivedText(text)
                case .data:
                    print("Received data message")
                @unknown default:
                    break
                }
                // Re-arm the handler: receive(completionHandler:) delivers only one message.
                self.receiveMessage()
            case .failure(let error):
                print("WebSocket error: \(error)")
            }
        }
    }
    private func handleReceivedText(_ text: String) {
        if let data = text.data(using: .utf8),
           let json = try? JSONSerialization.jsonObject(with: data, options: []) as? [String: Any] {
            if let item = json["item"] as? [String: Any],
               let content = item["content"] as? [[String: Any]],
               let firstContent = content.first,
               let transcript = firstContent["transcript"] as? String {
                DispatchQueue.main.async {
                    self.transcription = transcript
                }
            } else if let type = json["type"] as? String {
                switch type {
                case "response.audio.delta":
                    if let delta = json["delta"] as? String {
                        handleAudioDelta(delta)
                    }
                case "response.audio.done":
                    convertAccumulatedAudioToFile()
                default:
                    print("handleReceivedText:other_types:\(type)")
                }
            }
        }
    }
    private func handleAudioDelta(_ base64Audio: String) {
        guard let audioData = Data(base64Encoded: base64Audio) else {
            print("Failed to decode Base64 audio data")
            return
        }
        accumulatedAudioData.append(audioData)
    }
    private func convertAccumulatedAudioToFile() {
        // The Realtime API streams response audio as 16-bit PCM at 24 kHz, mono;
        // write it out as a Float32 WAV file for playback.
        let sampleRate = 24000.0
        let format = AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: sampleRate, channels: 1, interleaved: false)!
        let documentsPath = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
        let outputURL = documentsPath.appendingPathComponent("responseAudio.wav")
        do {
            let audioFile = try AVAudioFile(forWriting: outputURL, settings: format.settings)
            // Each PCM16 sample is 2 bytes, so the frame count is the byte count / 2.
            let frameCount = UInt32(accumulatedAudioData.count) / UInt32(MemoryLayout<Int16>.size)
            let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount)!
            buffer.frameLength = frameCount
            accumulatedAudioData.withUnsafeBytes { rawBufferPointer in
                let int16BufferPointer = rawBufferPointer.bindMemory(to: Int16.self)
                for i in 0..<Int(frameCount) {
                    // Normalize Int16 samples into the Float32 range [-1.0, 1.0].
                    let sample = Float(int16BufferPointer[i]) / Float(Int16.max)
                    buffer.floatChannelData?[0][i] = sample
                }
            }
            try audioFile.write(from: buffer)
            DispatchQueue.main.async {
                self.responseAudioURL = outputURL
            }
        } catch {
            print("Failed to save response audio: \(error)")
        }
        accumulatedAudioData = Data()
    }
    private func toggleRecording() {
        if isRecording {
            stopRecording()
        } else {
            setupWebSocket()
            startRecording()
        }
    }
    private func startRecording() {
        let inputNode = audioEngine.inputNode
        let recordingFormat = inputNode.outputFormat(forBus: 0)
        let documentsPath = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0]
        recordedAudioURL = documentsPath.appendingPathComponent("recordedAudio.wav")
        let file = try! AVAudioFile(forWriting: recordedAudioURL!, settings: recordingFormat.settings)
        // Tap the microphone: persist each buffer locally and stream it to the API.
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { buffer, _ in
            try? file.write(from: buffer)
            self.sendAudioData(buffer)
        }
        audioEngine.prepare()
        do {
            try audioEngine.start()
            isRecording = true
        } catch {
            print("Failed to start audio engine: \(error)")
        }
    }
    private func stopRecording() {
        audioEngine.stop()
        audioEngine.inputNode.removeTap(onBus: 0)
        isRecording = false
        webSocketTask?.cancel(with: .goingAway, reason: nil)
        isConnected = false
    }
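    // NOTE: By default the Realtime API expects input audio as 16-bit PCM at 24 kHz,
    // mono, while pcmBufferToBase64(pcmBuffer:) below serializes the engine's native
    // Float32 buffers as-is. A production implementation would convert and resample
    // the buffers first (for example with AVAudioConverter) before Base64-encoding.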
    private func pcmBufferToBase64(pcmBuffer: AVAudioPCMBuffer) -> String? {
        guard let channelData = pcmBuffer.floatChannelData else {
            return nil
        }
        let frameLength = Int(pcmBuffer.frameLength)
        // Mono capture is assumed: serialize the first channel's Float32 samples.
        let dataSize = frameLength * MemoryLayout<Float>.size
        // Create a Data object from the PCM sample data
        let data = Data(bytes: channelData[0], count: dataSize)
        // Convert the Data to a Base64 string
        return data.base64EncodedString()
    }
    private func sendAudioData(_ buffer: AVAudioPCMBuffer) {
        guard isConnected else {
            print("WebSocket is not connected. Cannot send audio data.")
            return
        }
        guard let base64Audio = self.pcmBufferToBase64(pcmBuffer: buffer) else {
            print("Failed to convert PCM buffer to Base64 string.")
            return
        }
        let event: [String: Any] = [
            "type": "input_audio_buffer.append",
            "audio": base64Audio
        ]
        if let jsonData = try? JSONSerialization.data(withJSONObject: event),
           let jsonString = String(data: jsonData, encoding: .utf8) {
            webSocketTask?.send(.string(jsonString)) { error in
                if let error = error {
                    print("Error sending message: \(error)")
                }
            }
        }
    }
    private func playRecordedAudio() {
        if isRecording {
            stopRecording()
        }
        guard let audioURL = recordedAudioURL else {
            print("No recorded audio URL found.")
            return
        }
        do {
            let audioFile = try AVAudioFile(forReading: audioURL)
            let playerNode = AVAudioPlayerNode()
            audioEngine.attach(playerNode)
            let mainMixerNode = audioEngine.mainMixerNode
            audioEngine.connect(playerNode, to: mainMixerNode, format: audioFile.processingFormat)
            playerNode.scheduleFile(audioFile, at: nil)
            if !audioEngine.isRunning {
                try audioEngine.start()
            }
            playerNode.play()
        } catch {
            print("Failed to play recorded audio: \(error)")
        }
    }
    private func playResponseAudio() {
        if isRecording {
            stopRecording()
        }
        guard let audioURL = responseAudioURL else {
            print("No response audio URL found.")
            return
        }
        do {
            let audioFile = try AVAudioFile(forReading: audioURL)
            let mainMixerNode = audioEngine.mainMixerNode
            let outputFormat = mainMixerNode.outputFormat(forBus: 0)
            let playerNode = AVAudioPlayerNode()
            audioEngine.attach(playerNode)
            audioEngine.connect(playerNode, to: mainMixerNode, format: outputFormat)
            playerNode.scheduleFile(audioFile, at: nil)
            if !audioEngine.isRunning {
                try audioEngine.start()
            }
            playerNode.play()
        } catch {
            print("Failed to play response audio: \(error)")
        }
    }
    private func commitAndCreateResponse() {
        guard isConnected else {
            print("WebSocket is not connected. Cannot commit and create response.")
            return
        }
        let instructions = "Please assist the user."
        let commitEvent: [String: Any] = [
            "type": "input_audio_buffer.commit"
        ]
        let responseEvent: [String: Any] = [
            "type": "response.create",
            "response": [
                "modalities": ["audio", "text"],
                "instructions": instructions
            ]
        ]
        [commitEvent, responseEvent].forEach { event in
            if let jsonData = try? JSONSerialization.data(withJSONObject: event),
               let jsonString = String(data: jsonData, encoding: .utf8) {
                webSocketTask?.send(.string(jsonString)) { error in
                    if let error = error {
                        print("Error sending message: \(error)")
                    }
                }
            }
        }
    }
}
#Preview {
    ContentView()
}
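
To run the PoC, provide the key that setupWebSocket() looks for: either an OPENAI_API_KEY entry in the app's Info.plist or, when launching from Xcode, an OPENAI_API_KEY environment variable set in the scheme's run arguments.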