Created
March 9, 2025 08:48
-
-
Save satish860/77c9d646858ae385f4895ad3ecb7feab to your computer and use it in GitHub Desktop.
Gemini Voice Assistance
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Copyright 2025 Google LLC | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
""" | |
## Setup | |
To install the dependencies for this script, run: | |
``` | |
uv pip install -r requirements.txt | |
``` | |
Before running this script, ensure the `GEMINI_API_KEY` environment | |
variable is set in your .env file. | |
Important: **Use headphones**. This script uses the system default audio | |
input and output, which often won't include echo cancellation. So to prevent | |
the model from interrupting itself it is important that you use headphones. | |
## Run | |
To run the script: | |
``` | |
python main.py | |
``` | |
""" | |
import asyncio | |
import os | |
import sys | |
import traceback | |
from dotenv import load_dotenv | |
# Import required libraries for audio | |
import pyaudio # For audio input/output | |
import argparse # For command line arguments | |
from google import genai # Google Generative AI client | |
from google.genai import types # Import types for configuration | |
# For Python versions below 3.11, import backported TaskGroup and ExceptionGroup | |
if sys.version_info < (3, 11, 0): | |
import taskgroup, exceptiongroup | |
asyncio.TaskGroup = taskgroup.TaskGroup | |
asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup | |
# Read the .env file so its entries (e.g. GEMINI_API_KEY) land in os.environ
load_dotenv()

# --- Audio settings ----------------------------------------------------------
FORMAT = pyaudio.paInt16  # 16-bit PCM samples
CHANNELS = 1  # mono
SEND_SAMPLE_RATE = 16000  # rate used when opening the microphone stream (Hz)
RECEIVE_SAMPLE_RATE = 24000  # rate used when opening the playback stream (Hz)
CHUNK_SIZE = 1024  # frames per buffer

# --- Gemini settings ---------------------------------------------------------
MODEL = "models/gemini-2.0-flash-exp"

# Client for the v1alpha API surface.
# NOTE(review): when GEMINI_API_KEY is unset, the literal placeholder string
# "GEMINI_API_KEY" is passed as the key; presumably the service then rejects
# the connection rather than failing here -- confirm.
client = genai.Client(
    api_key=os.environ.get("GEMINI_API_KEY", "GEMINI_API_KEY"),
    http_options=dict(api_version="v1alpha"),
)

# Ask for audio-only responses, spoken with the prebuilt "Kore" voice
config = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    speech_config=types.SpeechConfig(
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                voice_name="Kore",
            ),
        ),
    ),
)

# One shared PyAudio instance; both the input and output streams come from it
pya = pyaudio.PyAudio()
class AudioLoop:
    """Drive a live voice conversation with Gemini.

    ``run`` opens the session and starts five background tasks that cooperate
    through two queues:

    * ``listen_audio`` reads microphone chunks and puts them on ``out_queue``.
    * ``send_realtime`` forwards every ``out_queue`` item to the session.
    * ``receive_audio`` reads session responses, queues audio on
      ``audio_in_queue`` and prints text.
    * ``play_audio`` plays the chunks taken from ``audio_in_queue``.
    * ``send_text`` forwards typed messages and ends the run when the user
      types ``q``.
    """

    def __init__(self):
        """Create an idle loop; queues, session and stream are set up later."""
        # Queues are created inside run(), once the session is open
        self.audio_in_queue = None  # audio received from Gemini, awaiting playback
        self.out_queue = None  # data waiting to be sent to Gemini
        # Live session object, assigned by run()
        self.session = None
        # Microphone input stream, assigned by listen_audio()
        self.audio_stream = None

    def _close_audio_stream(self):
        """Close the microphone stream if one was opened; otherwise do nothing."""
        stream = self.audio_stream
        if stream is None:
            # listen_audio() never got as far as opening the device
            return
        # Drop the reference first so a second call cannot close it twice
        self.audio_stream = None
        stream.close()

    async def send_text(self):
        """Read lines typed by the user and send each one as a complete turn.

        Returns when the user types ``q`` (any letter case) or when standard
        input reaches end-of-file.
        """
        while True:
            try:
                # input() blocks, so it runs in a worker thread
                text = await asyncio.to_thread(input, "message > ")
            except EOFError:
                # stdin is closed (Ctrl-D, exhausted pipe): nothing more can be
                # read, so treat it like an explicit quit instead of crashing
                break
            if text.lower() == "q":
                break
            # An empty line is sent as "." so the turn still carries content
            await self.session.send(input=text or ".", end_of_turn=True)

    async def send_realtime(self):
        """Forward every item placed on ``out_queue`` to the session, forever."""
        while True:
            msg = await self.out_queue.get()
            await self.session.send(input=msg)

    async def listen_audio(self):
        """Capture microphone audio and put each chunk on ``out_queue``, forever.

        The opened input stream is stored on ``self.audio_stream``.
        """
        mic_info = pya.get_default_input_device_info()
        # Opening the device blocks, so it runs in a worker thread
        self.audio_stream = await asyncio.to_thread(
            pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            input_device_index=mic_info["index"],
            frames_per_buffer=CHUNK_SIZE,
        )
        # __debug__ is true unless Python runs with -O; in that case input
        # overflows are ignored instead of raising
        if __debug__:
            kwargs = {"exception_on_overflow": False}
        else:
            kwargs = {}
        while True:
            # Reading blocks until CHUNK_SIZE frames are available
            data = await asyncio.to_thread(
                self.audio_stream.read, CHUNK_SIZE, **kwargs
            )
            await self.out_queue.put({"data": data, "mime_type": "audio/pcm"})

    async def receive_audio(self):
        """Read responses from the session, queueing audio and printing text.

        After each turn finishes, any audio still waiting on
        ``audio_in_queue`` is discarded.
        """
        while True:
            turn = self.session.receive()
            async for response in turn:
                # Audio payloads go to the playback queue
                if data := response.data:
                    self.audio_in_queue.put_nowait(data)
                    continue
                # Text is echoed to stdout without a trailing newline
                if text := response.text:
                    print(text, end="")
            # If you interrupt the model, it sends a turn_complete.
            # For interruptions to work, playback has to stop, so empty the
            # audio queue: it may hold much more audio than has played yet.
            while not self.audio_in_queue.empty():
                self.audio_in_queue.get_nowait()

    async def play_audio(self):
        """Play every chunk placed on ``audio_in_queue``, forever."""
        # Opening the output device blocks, so it runs in a worker thread
        stream = await asyncio.to_thread(
            pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=RECEIVE_SAMPLE_RATE,
            output=True,
        )
        while True:
            bytestream = await self.audio_in_queue.get()
            # Writing blocks while the audio plays
            await asyncio.to_thread(stream.write, bytestream)

    async def run(self):
        """Open the session, start all background tasks and wait for the user to quit.

        Returns normally once ``send_text`` finishes. If a background task
        fails, the microphone stream (when one was opened) is closed and the
        exception group is printed with its traceback instead of being raised.
        """
        # The exception-group type is a builtin from Python 3.11 on; on older
        # interpreters the compatibility shim at the top of the file publishes
        # the backport as asyncio.ExceptionGroup. Resolve whichever applies, so
        # handling a failure cannot itself raise NameError.
        group_error = getattr(asyncio, "ExceptionGroup", None)
        if group_error is None:
            group_error = ExceptionGroup
        try:
            # Connect to Gemini and create a task group for all background tasks
            async with (
                client.aio.live.connect(model=MODEL, config=config) as session,
                asyncio.TaskGroup() as tg,
            ):
                self.session = session
                self.audio_in_queue = asyncio.Queue()
                # Bounded, so microphone capture waits when sending falls behind
                self.out_queue = asyncio.Queue(maxsize=5)
                send_text_task = tg.create_task(self.send_text())
                tg.create_task(self.send_realtime())
                tg.create_task(self.listen_audio())
                tg.create_task(self.receive_audio())
                tg.create_task(self.play_audio())
                # Completes when the user types 'q' (or stdin is closed)
                await send_text_task
                # Leaving the task group with CancelledError cancels the
                # remaining, never-ending tasks
                raise asyncio.CancelledError("User requested exit")
        except asyncio.CancelledError:
            # Normal exit
            pass
        except group_error as eg:
            # The stream may not exist if the failure came before the
            # microphone was opened; closing must not hide the real error
            self._close_audio_stream()
            traceback.print_exception(eg)
if __name__ == "__main__":
    # Build the loop, show the usage hints, then block until the user quits
    audio_loop = AudioLoop()
    for banner in (
        "Starting Gemini Live audio chat with Kore voice (type 'q' to quit)...",
        "Important: Use headphones to prevent echo!",
    ):
        print(banner)
    asyncio.run(audio_loop.run())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment