satish860 · March 9, 2025 08:48
diff --git a/gemini.py b/gemini.py
 # -*- coding: utf-8 -*-
 # Copyright 2025 Google LLC
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.

 """
 ## Setup

 To install the dependencies for this script, run:

 ``` 
 uv pip install -r requirements.txt
 ```

 Before running this script, ensure the `GEMINI_API_KEY` environment
 variable is set in your .env file.

 Important: **Use headphones**. This script uses the system default audio
 input and output, which often won't include echo cancellation. So to prevent
 the model from interrupting itself it is important that you use headphones. 

 ## Run

 To run the script:

 ```
 python gemini.py
 ```
 """

 import asyncio
 import os
 import sys
 import traceback
 from dotenv import load_dotenv

 # Import required libraries for audio
 import pyaudio  # For audio input/output

 import argparse  # For command line arguments

 from google import genai  # Google Generative AI client
 from google.genai import types  # Import types for configuration

 # For Python versions below 3.11, import backported TaskGroup and ExceptionGroup
 if sys.version_info < (3, 11, 0):
    import taskgroup, exceptiongroup
    asyncio.TaskGroup = taskgroup.TaskGroup
    asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup

 # Read settings from the .env file into the process environment
 load_dotenv()

 # --- Audio settings ---
 # Samples are 16-bit PCM on a single (mono) channel
 FORMAT = pyaudio.paInt16
 CHANNELS = 1
 # Microphone audio is sent at 16 kHz; audio received from Gemini is 24 kHz
 SEND_SAMPLE_RATE = 16_000
 RECEIVE_SAMPLE_RATE = 24_000
 # Number of frames per buffer
 CHUNK_SIZE = 1024

 # --- Gemini settings ---
 # Identifier of the model to connect to
 MODEL = "models/gemini-2.0-flash-exp"

 # Initialize the Gemini client with the API key from the environment.
 # A missing key used to be replaced silently by the literal string
 # "GEMINI_API_KEY"; the placeholder is kept so the module still loads, but the
 # user is now told that it is not a real key.
 _api_key = os.getenv("GEMINI_API_KEY")
 if _api_key is None:
    print(
        "Warning: GEMINI_API_KEY is not set; "
        "set it in your environment or .env file.",
        file=sys.stderr,
    )
    _api_key = "GEMINI_API_KEY"
 client = genai.Client(
    api_key=_api_key,
    http_options={"api_version": "v1alpha"},
 )

 # Request audio responses spoken with the prebuilt "Kore" voice; the nested
 # configuration is assembled from the innermost object outwards
 _kore_voice = types.PrebuiltVoiceConfig(voice_name="Kore")
 _speech_config = types.SpeechConfig(
    voice_config=types.VoiceConfig(prebuilt_voice_config=_kore_voice)
 )
 config = types.LiveConnectConfig(
    response_modalities=["AUDIO"],
    speech_config=_speech_config,
 )

 # PyAudio instance used to open the input and output audio streams
 pya = pyaudio.PyAudio()


 class AudioLoop:
    """Drive a live voice and text conversation with Gemini.

    ``run()`` starts five background tasks that cooperate through two queues:
    microphone chunks flow through ``out_queue`` to the session, and audio
    received from the session flows through ``audio_in_queue`` to playback.
    """

    def __init__(self):
        """Create an idle loop; ``run()`` sets up the queues and the session."""
        # Audio received from Gemini that is waiting to be played
        self.audio_in_queue = None
        # Data waiting to be sent to Gemini
        self.out_queue = None

        # Live session, assigned by run() once connected
        self.session = None

        # Microphone input stream, opened by listen_audio()
        self.audio_stream = None

    def _close_audio_stream(self):
        """Close the microphone stream if one was opened; safe to call repeatedly."""
        stream, self.audio_stream = self.audio_stream, None
        if stream is not None:
            stream.close()

    async def send_text(self):
        """Send typed messages to Gemini until the user enters 'q'.

        ``run()`` awaits this coroutine and shuts everything down when it returns.
        """
        while True:
            # input() blocks, so it runs in a worker thread
            text = await asyncio.to_thread(
                input,
                "message > ",
            )
            # 'q' (any case) ends the conversation
            if text.lower() == "q":
                break
            # An empty line is sent as "."; every message ends the user's turn
            await self.session.send(input=text or ".", end_of_turn=True)

    async def send_realtime(self):
        """Forward every item placed on ``out_queue`` to the Gemini session."""
        while True:
            msg = await self.out_queue.get()
            await self.session.send(input=msg)

    async def listen_audio(self):
        """Capture microphone audio and queue each chunk for sending to Gemini."""
        # Use the system default input device
        mic_info = pya.get_default_input_device_info()
        # Opening the device blocks, so it runs in a worker thread
        self.audio_stream = await asyncio.to_thread(
            pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=SEND_SAMPLE_RATE,
            input=True,
            input_device_index=mic_info["index"],
            frames_per_buffer=CHUNK_SIZE,
        )

        # Unless Python runs with -O, ask read() not to raise on input overflow
        kwargs = {"exception_on_overflow": False} if __debug__ else {}

        while True:
            # read() blocks, so it runs in a worker thread
            data = await asyncio.to_thread(self.audio_stream.read, CHUNK_SIZE, **kwargs)
            await self.out_queue.put({"data": data, "mime_type": "audio/pcm"})

    async def receive_audio(self):
        """Read responses from Gemini: queue audio for playback and print text."""
        while True:
            # Each receive() call yields the responses of one turn
            turn = self.session.receive()
            async for response in turn:
                # Audio goes to the playback queue
                if data := response.data:
                    self.audio_in_queue.put_nowait(data)
                    continue
                # Text is echoed to the terminal
                if text := response.text:
                    print(text, end="")

            # If you interrupt the model, it sends a turn_complete.
            # For interruptions to work, playback has to stop, so drop the
            # queued audio: much more may be buffered than has played yet.
            while not self.audio_in_queue.empty():
                self.audio_in_queue.get_nowait()

    async def play_audio(self):
        """Play the audio chunks taken from ``audio_in_queue``."""
        # Opening the output device blocks, so it runs in a worker thread
        stream = await asyncio.to_thread(
            pya.open,
            format=FORMAT,
            channels=CHANNELS,
            rate=RECEIVE_SAMPLE_RATE,
            output=True,
        )
        while True:
            bytestream = await self.audio_in_queue.get()
            # write() blocks while the chunk plays
            await asyncio.to_thread(stream.write, bytestream)

    async def run(self):
        """Connect to Gemini and run the background tasks until the user quits.

        Task failures collected by the task group are printed, not re-raised.
        """
        # The module-level shim stores the ExceptionGroup backport on asyncio for
        # Python < 3.11, where no builtin of that name exists; prefer it when set.
        exception_group = getattr(asyncio, "ExceptionGroup", None) or ExceptionGroup
        try:
            async with (
                client.aio.live.connect(model=MODEL, config=config) as session,
                asyncio.TaskGroup() as tg,
            ):
                self.session = session

                self.audio_in_queue = asyncio.Queue()
                # Small bound so captured audio cannot pile up unsent
                self.out_queue = asyncio.Queue(maxsize=5)

                send_text_task = tg.create_task(self.send_text())
                tg.create_task(self.send_realtime())
                tg.create_task(self.listen_audio())
                tg.create_task(self.receive_audio())
                tg.create_task(self.play_audio())

                # send_text() returns when the user types 'q'
                await send_text_task
                # Leaving the task group with CancelledError cancels the other tasks
                raise asyncio.CancelledError("User requested exit")

        except asyncio.CancelledError:
            # Normal exit
            pass
        except exception_group as EG:
            # Report first so the diagnostics survive a failing cleanup; the
            # microphone stream may never have been opened, hence the guarded close.
            traceback.print_exception(EG)
            self._close_audio_stream()


 if __name__ == "__main__":
    # Build the loop, greet the user, then run until they quit
    main = AudioLoop()
    for banner_line in (
        "Starting Gemini Live audio chat with Kore voice (type 'q' to quit)...",
        "Important: Use headphones to prevent echo!",
    ):
        print(banner_line)
    asyncio.run(main.run())
	# -*- coding: utf-8 -*-
	# Copyright 2025 Google LLC
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""
	## Setup

	To install the dependencies for this script, run:

	```
	uv pip install -r requirements.txt
	```

	Before running this script, ensure the `GEMINI_API_KEY` environment
	variable is set in your .env file.

	Important: Use headphones. This script uses the system default audio
	input and output, which often won't include echo cancellation. So to prevent
	the model from interrupting itself it is important that you use headphones.

	## Run

	To run the script:

	```
	python gemini.py
	```
	"""

	import asyncio
	import os
	import sys
	import traceback
	from dotenv import load_dotenv

	# Import required libraries for audio
	import pyaudio # For audio input/output

	import argparse # For command line arguments

	from google import genai # Google Generative AI client
	from google.genai import types # Import types for configuration

	# For Python versions below 3.11, import backported TaskGroup and ExceptionGroup
	if sys.version_info < (3, 11, 0):
	import taskgroup, exceptiongroup
	asyncio.TaskGroup = taskgroup.TaskGroup
	asyncio.ExceptionGroup = exceptiongroup.ExceptionGroup

	# Read settings from the .env file into the process environment
	load_dotenv()

	# --- Audio settings ---
	# Samples are 16-bit PCM on a single (mono) channel
	FORMAT = pyaudio.paInt16
	CHANNELS = 1
	# Microphone audio is sent at 16 kHz; audio received from Gemini is 24 kHz
	SEND_SAMPLE_RATE = 16_000
	RECEIVE_SAMPLE_RATE = 24_000
	# Number of frames per buffer
	CHUNK_SIZE = 1024

	# --- Gemini settings ---
	# Identifier of the model to connect to
	MODEL = "models/gemini-2.0-flash-exp"

	# Initialize the Gemini client with the API key from the environment.
	# A missing key used to be replaced silently by the literal string
	# "GEMINI_API_KEY"; the placeholder is kept so the module still loads, but the
	# user is now told that it is not a real key.
	_api_key = os.getenv("GEMINI_API_KEY")
	if _api_key is None:
	    print(
	        "Warning: GEMINI_API_KEY is not set; "
	        "set it in your environment or .env file.",
	        file=sys.stderr,
	    )
	    _api_key = "GEMINI_API_KEY"
	client = genai.Client(
	    api_key=_api_key,
	    http_options={"api_version": "v1alpha"},
	)

	# Request audio responses spoken with the prebuilt "Kore" voice; the nested
	# configuration is assembled from the innermost object outwards
	_kore_voice = types.PrebuiltVoiceConfig(voice_name="Kore")
	_speech_config = types.SpeechConfig(
	    voice_config=types.VoiceConfig(prebuilt_voice_config=_kore_voice)
	)
	config = types.LiveConnectConfig(
	    response_modalities=["AUDIO"],
	    speech_config=_speech_config,
	)

	# PyAudio instance used to open the input and output audio streams
	pya = pyaudio.PyAudio()


	class AudioLoop:
	    """Drive a live voice and text conversation with Gemini.

	    ``run()`` starts five background tasks that cooperate through two queues:
	    microphone chunks flow through ``out_queue`` to the session, and audio
	    received from the session flows through ``audio_in_queue`` to playback.
	    """

	    def __init__(self):
	        """Create an idle loop; ``run()`` sets up the queues and the session."""
	        # Audio received from Gemini that is waiting to be played
	        self.audio_in_queue = None
	        # Data waiting to be sent to Gemini
	        self.out_queue = None

	        # Live session, assigned by run() once connected
	        self.session = None

	        # Microphone input stream, opened by listen_audio()
	        self.audio_stream = None

	    def _close_audio_stream(self):
	        """Close the microphone stream if one was opened; safe to call repeatedly."""
	        stream, self.audio_stream = self.audio_stream, None
	        if stream is not None:
	            stream.close()

	    async def send_text(self):
	        """Send typed messages to Gemini until the user enters 'q'.

	        ``run()`` awaits this coroutine and shuts everything down when it returns.
	        """
	        while True:
	            # input() blocks, so it runs in a worker thread
	            text = await asyncio.to_thread(
	                input,
	                "message > ",
	            )
	            # 'q' (any case) ends the conversation
	            if text.lower() == "q":
	                break
	            # An empty line is sent as "."; every message ends the user's turn
	            await self.session.send(input=text or ".", end_of_turn=True)

	    async def send_realtime(self):
	        """Forward every item placed on ``out_queue`` to the Gemini session."""
	        while True:
	            msg = await self.out_queue.get()
	            await self.session.send(input=msg)

	    async def listen_audio(self):
	        """Capture microphone audio and queue each chunk for sending to Gemini."""
	        # Use the system default input device
	        mic_info = pya.get_default_input_device_info()
	        # Opening the device blocks, so it runs in a worker thread
	        self.audio_stream = await asyncio.to_thread(
	            pya.open,
	            format=FORMAT,
	            channels=CHANNELS,
	            rate=SEND_SAMPLE_RATE,
	            input=True,
	            input_device_index=mic_info["index"],
	            frames_per_buffer=CHUNK_SIZE,
	        )

	        # Unless Python runs with -O, ask read() not to raise on input overflow
	        kwargs = {"exception_on_overflow": False} if __debug__ else {}

	        while True:
	            # read() blocks, so it runs in a worker thread
	            data = await asyncio.to_thread(self.audio_stream.read, CHUNK_SIZE, **kwargs)
	            await self.out_queue.put({"data": data, "mime_type": "audio/pcm"})

	    async def receive_audio(self):
	        """Read responses from Gemini: queue audio for playback and print text."""
	        while True:
	            # Each receive() call yields the responses of one turn
	            turn = self.session.receive()
	            async for response in turn:
	                # Audio goes to the playback queue
	                if data := response.data:
	                    self.audio_in_queue.put_nowait(data)
	                    continue
	                # Text is echoed to the terminal
	                if text := response.text:
	                    print(text, end="")

	            # If you interrupt the model, it sends a turn_complete.
	            # For interruptions to work, playback has to stop, so drop the
	            # queued audio: much more may be buffered than has played yet.
	            while not self.audio_in_queue.empty():
	                self.audio_in_queue.get_nowait()

	    async def play_audio(self):
	        """Play the audio chunks taken from ``audio_in_queue``."""
	        # Opening the output device blocks, so it runs in a worker thread
	        stream = await asyncio.to_thread(
	            pya.open,
	            format=FORMAT,
	            channels=CHANNELS,
	            rate=RECEIVE_SAMPLE_RATE,
	            output=True,
	        )
	        while True:
	            bytestream = await self.audio_in_queue.get()
	            # write() blocks while the chunk plays
	            await asyncio.to_thread(stream.write, bytestream)

	    async def run(self):
	        """Connect to Gemini and run the background tasks until the user quits.

	        Task failures collected by the task group are printed, not re-raised.
	        """
	        # The module-level shim stores the ExceptionGroup backport on asyncio for
	        # Python < 3.11, where no builtin of that name exists; prefer it when set.
	        exception_group = getattr(asyncio, "ExceptionGroup", None) or ExceptionGroup
	        try:
	            async with (
	                client.aio.live.connect(model=MODEL, config=config) as session,
	                asyncio.TaskGroup() as tg,
	            ):
	                self.session = session

	                self.audio_in_queue = asyncio.Queue()
	                # Small bound so captured audio cannot pile up unsent
	                self.out_queue = asyncio.Queue(maxsize=5)

	                send_text_task = tg.create_task(self.send_text())
	                tg.create_task(self.send_realtime())
	                tg.create_task(self.listen_audio())
	                tg.create_task(self.receive_audio())
	                tg.create_task(self.play_audio())

	                # send_text() returns when the user types 'q'
	                await send_text_task
	                # Leaving the task group with CancelledError cancels the other tasks
	                raise asyncio.CancelledError("User requested exit")

	        except asyncio.CancelledError:
	            # Normal exit
	            pass
	        except exception_group as EG:
	            # Report first so the diagnostics survive a failing cleanup; the
	            # microphone stream may never have been opened, hence the guarded close.
	            traceback.print_exception(EG)
	            self._close_audio_stream()


	if __name__ == "__main__":
	    # Create the audio loop, greet the user, then run until they quit
	    main = AudioLoop()
	    print("Starting Gemini Live audio chat with Kore voice (type 'q' to quit)...")
	    print("Important: Use headphones to prevent echo!")
	    asyncio.run(main.run())