Skip to content

Instantly share code, notes, and snippets.

@satish860
Created March 9, 2025 08:48
Show Gist options
  • Save satish860/77c9d646858ae385f4895ad3ecb7feab to your computer and use it in GitHub Desktop.
Save satish860/77c9d646858ae385f4895ad3ecb7feab to your computer and use it in GitHub Desktop.
Gemini Voice Assistance
# -*- coding: utf-8 -*-
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
## Setup
To install the dependencies for this script, run:
```
uv pip install -r requirements.txt
```
Before running this script, ensure the `GEMINI_API_KEY` environment
variable is set in your .env file.
Important: **Use headphones**. This script uses the system default audio
input and output, which often won't include echo cancellation. So to prevent
the model from interrupting itself it is important that you use headphones.
## Run
To run the script:
```
python main.py
```
"""
import asyncio
import os
import sys
import traceback
from dotenv import load_dotenv
# Import required libraries for audio
import pyaudio # For audio input/output
import argparse # For command line arguments
from google import genai # Google Generative AI client
from google.genai import types # Import types for configuration
# For Python versions below 3.11, import backported TaskGroup and ExceptionGroup
if sys.version_info < (3, 11, 0):
import taskgroup, exceptiongroup
asyncio.TaskGroup = taskgroup.TaskGroup
asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup
# Read environment variables from the .env file
load_dotenv()

# --- Audio settings ---
FORMAT = pyaudio.paInt16  # 16-bit PCM samples
CHANNELS = 1  # Mono
SEND_SAMPLE_RATE = 16000  # Rate of the audio we send (16kHz)
RECEIVE_SAMPLE_RATE = 24000  # Rate of the audio we receive (24kHz)
CHUNK_SIZE = 1024  # Frames per buffer

# --- Gemini settings ---
MODEL = "models/gemini-2.0-flash-exp"  # Model identifier

# Gemini client, keyed from the GEMINI_API_KEY environment variable.
# NOTE: when the variable is unset, the literal placeholder string
# "GEMINI_API_KEY" is passed as the key.
client = genai.Client(
    api_key=os.environ.get("GEMINI_API_KEY", "GEMINI_API_KEY"),
    http_options=dict(api_version="v1alpha"),
)

# Live-session configuration: audio-only responses using the "Kore" voice
config = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name="Kore"),
        ),
    ),
)

# Single PyAudio instance shared by all audio input/output
pya = pyaudio.PyAudio()
class AudioLoop:
    """Main class that handles the audio interaction with Gemini.

    An instance owns the live session, the two queues that connect the
    background tasks, and the microphone input stream. All of them start
    out as ``None`` and are populated while :meth:`run` is executing.
    """

    def __init__(self) -> None:
        """Initialize the AudioLoop with every runtime resource unset."""
        # Queues for audio data (created in run())
        self.audio_in_queue = None  # For incoming audio from Gemini
        self.out_queue = None  # For outgoing data to Gemini
        # Session object (set in run() after connecting)
        self.session = None
        # Microphone input stream (opened in listen_audio())
        self.audio_stream = None

    def _close_audio_stream(self) -> None:
        """Close the microphone stream if one was opened; otherwise do nothing."""
        stream = self.audio_stream
        if stream is None:
            # listen_audio() never got as far as opening the stream.
            return
        self.audio_stream = None
        stream.close()

    async def send_text(self) -> None:
        """Background task to read text input from user and send to Gemini.

        Returns when the user types ``q`` (case-insensitive). An empty line
        is sent as ``"."``.
        """
        while True:
            # Get text input from user (blocking operation, so use to_thread)
            text = await asyncio.to_thread(
                input,
                "message > ",
            )
            # Exit if user types 'q'
            if text.lower() == "q":
                break
            # Send the text to Gemini and mark it as end of turn
            await self.session.send(input=text or ".", end_of_turn=True)

    async def send_realtime(self) -> None:
        """Background task to send data from the queue to Gemini."""
        while True:
            # Get the next item from the queue
            msg = await self.out_queue.get()
            # Send it to Gemini
            await self.session.send(input=msg)

    async def listen_audio(self) -> None:
        """Background task to capture audio from the microphone and put it in the queue."""
        # Get the default microphone
        mic_info = pya.get_default_input_device_info()
        # Initialize the audio stream (blocking operation, so use to_thread)
        self.audio_stream = await asyncio.to_thread(
            pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            input_device_index=mic_info["index"],
            frames_per_buffer=CHUNK_SIZE,
        )
        # In debug mode (Python run without -O), ignore overflow errors
        kwargs = {"exception_on_overflow": False} if __debug__ else {}
        while True:
            # Read audio data from the microphone (blocking operation, so use to_thread)
            data = await asyncio.to_thread(self.audio_stream.read, CHUNK_SIZE, **kwargs)
            # Put the audio data in the queue for sending to Gemini
            await self.out_queue.put({"data": data, "mime_type": "audio/pcm"})

    async def receive_audio(self) -> None:
        """Background task to read from Gemini and write audio chunks to the output queue."""
        while True:
            # Get the next turn from Gemini
            turn = self.session.receive()
            async for response in turn:
                # If we got audio data, put it in the queue for playback
                if data := response.data:
                    self.audio_in_queue.put_nowait(data)
                    continue
                # If we got text, print it
                if text := response.text:
                    print(text, end="")
            # If you interrupt the model, it sends a turn_complete.
            # For interruptions to work, we need to stop playback.
            # So empty out the audio queue because it may have loaded
            # much more audio than has played yet.
            while not self.audio_in_queue.empty():
                self.audio_in_queue.get_nowait()

    async def play_audio(self) -> None:
        """Background task to play audio data from the queue."""
        # Initialize the audio output stream (blocking operation, so use to_thread)
        stream = await asyncio.to_thread(
            pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=RECEIVE_SAMPLE_RATE,
            output=True,
        )
        while True:
            # Get the next audio chunk from the queue
            bytestream = await self.audio_in_queue.get()
            # Play it (blocking operation, so use to_thread)
            await asyncio.to_thread(stream.write, bytestream)

    async def run(self) -> None:
        """Main method to run the audio loop.

        Connects to Gemini, starts all background tasks, and waits until the
        text-input task finishes (the user typed ``q``). A failure in any
        background task surfaces as an ``ExceptionGroup``; it is printed and
        the microphone stream, if one was opened, is closed.
        """
        try:
            # Connect to Gemini and create a task group for all background tasks
            async with (
                client.aio.live.connect(model=MODEL, config=config) as session,
                asyncio.TaskGroup() as tg,
            ):
                self.session = session
                # Initialize queues
                self.audio_in_queue = asyncio.Queue()  # For incoming audio from Gemini
                self.out_queue = asyncio.Queue(maxsize=5)  # For outgoing data to Gemini
                # Create background tasks
                send_text_task = tg.create_task(self.send_text())  # For text input
                tg.create_task(self.send_realtime())  # For sending data from the queue
                tg.create_task(self.listen_audio())  # For capturing audio
                # Create tasks for receiving and playing audio
                tg.create_task(self.receive_audio())  # For receiving audio from Gemini
                tg.create_task(self.play_audio())  # For playing audio
                # Wait for the text input task to complete (when user types 'q')
                await send_text_task
                # Raising inside the task group cancels all other tasks
                raise asyncio.CancelledError("User requested exit")
        except asyncio.CancelledError:
            # Normal exit
            pass
        except ExceptionGroup as EG:
            # Report the failure first so it is never lost, then release the
            # microphone. The stream may not exist if a task failed before
            # listen_audio() opened it, so the close is None-safe.
            traceback.print_exception(EG)
            self._close_audio_stream()
if __name__ == "__main__":
    # Create and run the audio loop
    main = AudioLoop()
    print("Starting Gemini Live audio chat with Kore voice (type 'q' to quit)...")
    print("Important: Use headphones to prevent echo!")
    try:
        asyncio.run(main.run())
    finally:
        # Release the PortAudio session (and any streams still open on it)
        # once the event loop has finished, whether it exited normally or not.
        pya.terminate()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment