Last active
March 4, 2024 19:15
-
-
Save r0yfire/24a768af1d7a48c7cb70ae45fecd1cc7 to your computer and use it in GitHub Desktop.
Voice Chat with LLMs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Originally posted on: | |
https://royfirestein.com/blog/real-time-voice-chat-with-ai | |
""" | |
import os | |
import wave | |
from pydub import AudioSegment | |
from groq import Groq | |
from whispercpp import Whisper | |
from elevenlabs import generate, stream | |
import pyaudio | |
# Initialize the Whisper client | |
whisper = Whisper('tiny') | |
# Set the API keys | |
os.environ["ELEVEN_API_KEY"] = "YOUR API KEY" | |
os.environ["GROQ_API_KEY"] = "YOUR API KEY" | |
# Create API clients | |
groq_client = Groq( | |
api_key=os.environ.get("GROQ_API_KEY"), | |
) | |
# Set the system prompt | |
SYSTEM_PROMPT = "\n".join([ | |
"You are a friendly hotel frontdesk agent. You are here to help guests with their problems.", | |
"Your responses must be very short. All of your responses must be coversational as if speaking to someone.", | |
"Check-in is available after 3 PM, and check out is at 11 the next day." | |
]) | |
# Output directory | |
output_dir = 'output' | |
os.makedirs(output_dir, exist_ok=True) | |
def play_speech(prompt):
    """Synthesize *prompt* with ElevenLabs TTS and play it as it streams."""
    stream(
        generate(
            text=prompt,
            model="eleven_multilingual_v2",
            voice="Rachel",
            stream=True,
        )
    )
def llm_chat(user_input, chat_history, bot_name):
    """Send the conversation plus *user_input* to the LLM and return its reply.

    The request is built as: system prompt, prior turns from *chat_history*,
    then *user_input* as the newest user turn. The reply is printed with
    *bot_name* as the speaker label.
    """
    conversation = [{"role": "system", "content": SYSTEM_PROMPT}]
    conversation.extend(chat_history)
    conversation.append({"role": "user", "content": user_input})
    # Request a completion from Groq's hosted Mixtral model.
    completion = groq_client.chat.completions.create(
        messages=conversation,
        model="mixtral-8x7b-32768"
    )
    reply = completion.choices[0].message.content
    print(f"{bot_name}: {reply}")
    return reply
def transcribe_audio(audio_file):
    """Run Whisper on *audio_file* and return the lowercased transcript.

    Empty/whitespace-only segments are dropped; the rest are joined with
    single spaces.
    """
    segments = whisper.extract_text(whisper.transcribe(audio_file))
    cleaned = [segment.lower() for segment in segments if segment.strip()]
    return " ".join(cleaned)
def record_audio(file_path):
    """Record ~5 seconds of mono 16-bit 44.1 kHz microphone audio to a WAV file.

    Parameters
    ----------
    file_path : str
        Destination path for the recorded WAV file.

    Ctrl-C stops the recording early and keeps the frames captured so far;
    any other error is re-raised AFTER the audio device has been released
    (the original leaked the stream/PortAudio handle on error).
    """
    FORMAT = pyaudio.paInt16   # 16-bit signed samples
    CHANNELS = 1               # mono
    RATE = 44100               # samples per second
    CHUNK = 512                # frames per buffer read
    RECORD_SECONDS = 5

    p = pyaudio.PyAudio()
    sample_width = p.get_sample_size(FORMAT)
    frames = []
    try:
        mic = p.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=RATE,
            input=True,
            frames_per_buffer=CHUNK
        )
        try:
            print("Recording...")
            for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
                frames.append(mic.read(CHUNK))
        except KeyboardInterrupt:
            # User-requested early stop: keep whatever was captured.
            pass
        except Exception as e:
            print(f"Error while recording: {e}")
            raise  # bare raise preserves the original traceback
        finally:
            # Always release the input stream, even on error.
            mic.stop_stream()
            mic.close()
        print("Recording complete.")
    finally:
        # Always release PortAudio, even on error.
        p.terminate()

    # Write the captured frames out as a standard WAV file.
    with wave.open(file_path, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
def converse():
    """Run the voice-chat loop: record, transcribe, query the LLM, speak.

    Loops until the user says "exit". Keeps a rolling window of the last 20
    turns so the prompt sent to the LLM stays bounded.
    """
    audio_file = "recording.wav"
    chat_history = []
    play_speech("Hello, welcome to SkyLounge Hotel. How can I help you today?")
    while True:
        # Record the user's audio to a temp file, transcribe it, delete it.
        record_audio(audio_file)
        user_speech = transcribe_audio(audio_file)
        os.remove(audio_file)
        # transcribe_audio already lowercases; .lower() kept for robustness.
        if user_speech.lower() == "exit":
            break
        print(f"You: {user_speech}")
        # Send the user's speech to the LLM. NOTE: llm_chat appends user_input
        # to the messages itself, so the user turn must be added to
        # chat_history only AFTER this call — the original appended it before,
        # making every request contain the latest user message twice.
        bot_response = llm_chat(user_speech, chat_history, "Bot")
        chat_history.append({"role": "user", "content": user_speech})
        chat_history.append({"role": "assistant", "content": bot_response})
        # Play the LLM response using text-to-speech.
        play_speech(bot_response)
        # Drop old turns to keep the prompt size bounded.
        if len(chat_history) > 20:
            chat_history = chat_history[-20:]


if __name__ == "__main__":
    converse()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment