Skip to content

Instantly share code, notes, and snippets.

@r0yfire
Last active March 4, 2024 19:15
Show Gist options
  • Save r0yfire/24a768af1d7a48c7cb70ae45fecd1cc7 to your computer and use it in GitHub Desktop.
Save r0yfire/24a768af1d7a48c7cb70ae45fecd1cc7 to your computer and use it in GitHub Desktop.
Voice Chat with LLMs
"""
Originally posted on:
https://royfirestein.com/blog/real-time-voice-chat-with-ai
"""
import os
import wave
from pydub import AudioSegment
from groq import Groq
from whispercpp import Whisper
from elevenlabs import generate, stream
import pyaudio
# Load the Whisper speech-to-text model once, at import time.
whisper = Whisper('tiny')

# API keys: keep any value already exported in the environment and only fall
# back to the placeholder when the variable is unset. A plain assignment here
# would overwrite a real key with the placeholder.
# NOTE(review): ELEVEN_API_KEY is presumably read from the environment by the
# elevenlabs package -- confirm against the installed version.
os.environ.setdefault("ELEVEN_API_KEY", "YOUR API KEY")
os.environ.setdefault("GROQ_API_KEY", "YOUR API KEY")

# Groq client used for chat completions.
groq_client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)
# System prompt: three instruction lines joined with newlines. It is sent as
# the "system" message of each chat completion request.
SYSTEM_PROMPT = "\n".join([
    "You are a friendly hotel frontdesk agent. You are here to help guests with their problems.",
    "Your responses must be very short. All of your responses must be conversational as if speaking to someone.",
    "Check-in is available after 3 PM, and check out is at 11 the next day.",
])
# Name of the directory for generated output. It is created at import time if
# it does not exist yet; an already-existing directory is left alone.
output_dir = 'output'
os.makedirs(name=output_dir, exist_ok=True)
def play_speech(prompt, voice="Rachel", model="eleven_multilingual_v2"):
    """Speak *prompt* aloud through ElevenLabs text-to-speech.

    Args:
        prompt: Text to synthesize. Empty, whitespace-only or ``None`` input
            is skipped so that no request is issued for nothing to say.
        voice: Voice name forwarded to ``generate``.
        model: Model identifier forwarded to ``generate``.

    Returns:
        None.
    """
    # Guard clause: nothing to speak, so do not call the TTS service at all.
    if not prompt or not prompt.strip():
        return None

    # stream=True asks generate() for a streaming result, which is then handed
    # to stream() for playback.
    audio_stream = generate(
        text=prompt,
        model=model,
        voice=voice,
        stream=True,
    )
    stream(audio_stream)
    return None
def llm_chat(user_input, chat_history, bot_name, model="mixtral-8x7b-32768"):
    """Send one user turn to the LLM and return its reply.

    The request consists of the system prompt, the prior conversation and the
    new user message. ``chat_history`` itself is never mutated.

    Args:
        user_input: Text of the current user turn.
        chat_history: Sequence of ``{"role": ..., "content": ...}`` dicts for
            the earlier turns.
        bot_name: Label printed in front of the reply.
        model: Model identifier passed to the chat completion call.

    Returns:
        The content of the first completion choice.
    """
    prior_turns = list(chat_history)
    current_turn = {"role": "user", "content": user_input}

    # A caller may already have recorded the current turn in the history
    # before calling; drop that trailing copy so the model does not receive
    # the same user message twice in a row.
    if prior_turns and prior_turns[-1] == current_turn:
        prior_turns.pop()

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        *prior_turns,
        current_turn,
    ]

    chat_completion = groq_client.chat.completions.create(
        messages=messages,
        model=model,
    )

    # Only the first choice is used.
    response = chat_completion.choices[0].message.content
    print(f"{bot_name}: {response}")
    return response
def transcribe_audio(audio_file):
    """Transcribe *audio_file* with Whisper and return lower-cased text.

    Segments that are empty or whitespace-only are dropped; the remaining
    segments are lower-cased and joined with single spaces.
    """
    raw_result = whisper.transcribe(audio_file)
    segments = whisper.extract_text(raw_result)
    spoken = (segment.lower() for segment in segments if segment.strip())
    return " ".join(spoken)
def record_audio(file_path, record_seconds=5):
    """Record mono 16-bit audio from a PyAudio input stream into a WAV file.

    Recording stops after ``record_seconds`` seconds, or earlier on Ctrl-C;
    whatever was captured up to that point is still written.

    Args:
        file_path: Destination path of the WAV file (overwritten if present).
        record_seconds: Recording duration in seconds.

    Raises:
        Exception: Any error raised while opening or reading the stream is
            re-raised after the audio resources have been released.
    """
    sample_format = pyaudio.paInt16
    channels = 1
    rate = 44100
    chunk = 512

    audio = pyaudio.PyAudio()
    frames = []
    try:
        # Query the sample width while the PyAudio instance is still alive,
        # rather than after terminate().
        sample_width = audio.get_sample_size(sample_format)

        # Named `mic` so it does not shadow the module-level `stream` import.
        mic = audio.open(
            format=sample_format,
            channels=channels,
            rate=rate,
            input=True,
            frames_per_buffer=chunk,
        )
        try:
            print("Recording...")
            for _ in range(int(rate / chunk * record_seconds)):
                frames.append(mic.read(chunk))
        except KeyboardInterrupt:
            # Deliberate: Ctrl-C just ends the recording early.
            pass
        except Exception as e:
            print(f"Error while recording: {e}")
            raise
        finally:
            # Always release the stream, including on the error path.
            mic.stop_stream()
            mic.close()
    finally:
        audio.terminate()

    print("Recording complete.")

    # Write the captured frames; the context manager closes the file even if
    # a write fails.
    with wave.open(file_path, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(b''.join(frames))
def converse():
    """Run the voice chat loop until the user says "exit".

    Each round records audio, transcribes it, sends the text to the LLM and
    speaks the reply. The history is capped at the 20 most recent messages.
    """
    audio_file = "recording.wav"
    chat_history = []

    play_speech("Hello, welcome to SkyLounge Hotel. How can I help you today?")

    while True:
        # Record the user's audio
        record_audio(audio_file)

        # Transcribe it, and delete the temp recording even when the
        # transcription raises.
        try:
            user_speech = transcribe_audio(audio_file)
        finally:
            if os.path.exists(audio_file):
                os.remove(audio_file)

        # Transcripts may carry surrounding whitespace or punctuation
        # (e.g. " exit."), so normalise before comparing.
        command = user_speech.strip(" \t\r\n.,!?")
        if command.lower() == "exit":
            break

        # Nothing intelligible was captured: listen again instead of sending
        # an empty message to the LLM.
        if not command:
            continue

        print(f"You: {user_speech}")

        # Ask the LLM first. llm_chat() appends the current user turn to the
        # request itself, so it must not be in the history yet or it would be
        # sent twice.
        bot_response = llm_chat(user_speech, chat_history, "Bot")

        # Record both sides of this round.
        chat_history.append({"role": "user", "content": user_speech})
        chat_history.append({"role": "assistant", "content": bot_response})

        # Play the LLM response using text-to-speech
        play_speech(bot_response)

        # Keep only the 20 most recent messages.
        if len(chat_history) > 20:
            chat_history = chat_history[-20:]
# Script entry point: start the voice chat only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    converse()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment