@grahama1970
Last active May 8, 2024 14:38
A chatbot that generates a response based on user emotion
import io
import logging
import os

import librosa
import numpy as np
import openai
import requests
import sounddevice as sd
import speech_recognition as sr
from dotenv import load_dotenv
from transformers import pipeline, AutoModelForAudioClassification, AutoFeatureExtractor

load_dotenv()
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
class EmotionRecognizer:
"""
A class for recognizing emotions in audio data, generating emotionally emphasized text output, and streaming audio responses.
This class combines speech emotion recognition, text sentiment analysis, and language model interaction to produce an emotionally
emphasized text output based on the input audio data. It can process both audio files and audio streams, including real-time audio
input from the user's microphone. The class also supports streaming audio responses using the OpenAI Text-to-Speech API.
Dependencies:
- speech_recognition
- librosa
- transformers
- openai
- sounddevice
Parameters:
None
Usage:
1. Instantiate the EmotionRecognizer class:
recognizer = EmotionRecognizer()
2. Process an audio file and generate emotionally emphasized text output:
output_text = recognizer.multimodal_emotion_merge(audio_file_path)
3. Process an audio stream and generate emotionally emphasized text output:
output_text = recognizer.process_audio_stream(audio_stream)
4. Start real-time processing from the user's microphone:
recognizer.start_real_time_processing()
5. Interact with a language model and stream an audio response:
- The user speaks into the microphone, and the audio is captured.
- The captured audio is processed to recognize emotions and generate emotionally emphasized text.
- The emotionally emphasized text is sent to a language model (e.g., Llama70B) for generating a response.
- The generated response is converted to speech using the OpenAI Text-to-Speech API.
- The audio response is streamed back to the user in a specified voice.
Examples:
# Processing an audio file
recognizer = EmotionRecognizer()
output_text = recognizer.multimodal_emotion_merge("path/to/audio/file.wav")
print(output_text)
# Expected output: I am VERY MAD?! 😠
# Processing an audio stream
recognizer = EmotionRecognizer()
with sr.Microphone() as source:
audio_stream = recognizer.recognizer.listen(source)
output_text = recognizer.process_audio_stream(audio_stream)
print(output_text)
# Expected output: I loovve you! 😊
# Real-time processing from the user's microphone
recognizer = EmotionRecognizer()
recognizer.start_real_time_processing()
# Speak into the microphone, and the emotionally emphasized text will be printed in real-time
# Interacting with a language model and streaming an audio response
recognizer = EmotionRecognizer()
with sr.Microphone() as source:
print("Say something...")
audio_stream = recognizer.recognizer.listen(source)
emotionally_emphasized_text = recognizer.process_audio_stream(audio_stream)
response_text = send_to_language_model(emotionally_emphasized_text) # Function to interact with Llama70B
recognizer.stream_audio_response(response_text, voice="fable")
# The user's speech will be processed, sent to Llama70B for a response, and the response will be streamed back as audio
"""
    def __init__(self):
        self.emotion_model, self.feature_extractor = self.load_speech_emotion_model()
        self.sentiment_model = pipeline("sentiment-analysis")
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()
        openai.api_key = os.environ.get("OPENAI_API_KEY")
        self.llama_api_url = "https://your-runpod-endpoint.com/v1/generate"
        self.llama_api_headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('RUNPOD_API_KEY')}"
        }
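
    # The constructor reads credentials from the environment (loaded above via python-dotenv).
    # A minimal .env for this class might look like the following (values are placeholders, and the
    # RunPod endpoint URL assigned in __init__ must be replaced with your own deployment):
    #   OPENAI_API_KEY=sk-...
    #   RUNPOD_API_KEY=...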
    def load_speech_emotion_model(self):
        """
        Load the pre-trained speech emotion recognition model and feature extractor.

        Returns:
            model (AutoModelForAudioClassification): The loaded speech emotion recognition model.
            feature_extractor (AutoFeatureExtractor): The loaded feature extractor for the model.
        """
        model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
        try:
            model = AutoModelForAudioClassification.from_pretrained(model_name)
            feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
            return model, feature_extractor
        except Exception as e:
            logging.error(f"Failed to load speech emotion model: {e}")
            raise
    def analyze_audio_emotion(self, audio_file, chunk_size=3):
        """
        Analyze the emotion in short chunks of the given audio file.

        Args:
            audio_file (str): Path to the audio file.
            chunk_size (int, optional): Size of the audio chunks in seconds. Default is 3 seconds.

        Returns:
            chunk_emotions (list): A list of dictionaries containing the start and end times of each chunk,
                and the predicted emotion for that chunk.
        """
        # Load the audio file, resampling to the 16 kHz rate expected by the wav2vec2 feature extractor
        try:
            audio, sample_rate = librosa.load(audio_file, sr=16000)
        except Exception as e:
            logging.error(f"Failed to load audio file: {e}")
            return []
        chunk_emotions = []
        for i in range(0, len(audio), int(sample_rate * chunk_size)):
            # Extract the current audio chunk
            chunk = audio[i:i + int(sample_rate * chunk_size)]
            # Extract features from the audio chunk using the loaded feature extractor
            inputs = self.feature_extractor(chunk, sampling_rate=sample_rate, return_tensors="pt")
            # Predict the emotion for the audio chunk using the loaded model
            outputs = self.emotion_model(**inputs)
            predicted_id = outputs.logits.argmax(dim=-1).item()
            emotion = self.emotion_model.config.id2label[predicted_id]
            # Append the chunk details (start, end, emotion) to the list
            chunk_emotions.append({
                'start': i / sample_rate,
                'end': (i + int(sample_rate * chunk_size)) / sample_rate,
                'emotion': emotion
            })
        return chunk_emotions
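
    # Illustrative (hypothetical) return value from analyze_audio_emotion for a ~9-second clip with the
    # default 3-second chunks; the actual labels come from the loaded model's id2label mapping:
    #   [{'start': 0.0, 'end': 3.0, 'emotion': 'neutral'},
    #    {'start': 3.0, 'end': 6.0, 'emotion': 'angry'},
    #    {'start': 6.0, 'end': 9.0, 'emotion': 'angry'}]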
    def speech_to_text(self, audio_data: sr.AudioData) -> str:
        """
        Transcribe audio data to text using Google Speech Recognition.

        Args:
            audio_data (sr.AudioData): The audio data to be transcribed. This is typically obtained by recording audio
                from a microphone or loading an audio file.

        Returns:
            str: The transcribed text. If the audio data cannot be transcribed, an empty string is returned.

        Raises:
            sr.UnknownValueError: If the speech recognition service cannot understand the audio data. This can happen
                due to various reasons, such as background noise, unclear speech, or unsupported languages.
            sr.RequestError: If there is an error with the speech recognition service request, such as network issues
                or service unavailability.
        """
        if not audio_data:
            logging.warning("No audio data provided for transcription.")
            return ""
        try:
            text = self.recognizer.recognize_google(audio_data)
            return text
        except sr.UnknownValueError:
            logging.error("Could not understand audio. Please check for background noise, clear speech, and supported languages.")
        except sr.RequestError as e:
            logging.error(f"Error with speech recognition service request: {e}")
        return ""
    def map_sentiment(self, sentiment: str) -> str:
        """
        Map the sentiment label to an emoji representation.

        Args:
            sentiment (str): The sentiment label ('POSITIVE', 'NEGATIVE', or 'NEUTRAL').

        Returns:
            str: The corresponding emoji for the given sentiment label.

        Raises:
            ValueError: If the sentiment label is not one of 'POSITIVE', 'NEGATIVE', or 'NEUTRAL'.
        """
        if sentiment == 'POSITIVE':
            return "😊"
        elif sentiment == 'NEGATIVE':
            return "😠"
        elif sentiment == 'NEUTRAL':
            return ""
        else:
            raise ValueError(f"Invalid sentiment label: {sentiment}")
    def multimodal_emotion_merge(self, audio_file: str) -> str:
        """
        Merge the audio emotion analysis and text sentiment analysis to generate an emotionally emphasized text output.

        Args:
            audio_file (str): Path to the audio file.

        Returns:
            str: The emotionally emphasized text output.

        Raises:
            FileNotFoundError: If the provided audio file path does not exist.
            sr.UnknownValueError: If the speech recognition service cannot understand the audio data.
            sr.RequestError: If there is an error with the speech recognition service request.
        """
        try:
            with sr.AudioFile(audio_file) as source:
                audio_data = self.recognizer.record(source)
        except FileNotFoundError:
            logging.error(f"Audio file not found: {audio_file}")
            return ""
        # Transcribe the audio file to text
        text = self.speech_to_text(audio_data)
        if not text:
            return ""
        # Analyze the emotion in audio chunks
        audio_emotions = self.analyze_audio_emotion(audio_file)
        # Get the overall sentiment of the transcribed text
        text_sentiment = self.sentiment_model(text)[0]['label']
        final_text = ""
        total_duration = audio_emotions[-1]['end'] if audio_emotions else 0
        for word in text.split():
            # Estimate the word's time position by scaling its character offset in the
            # transcript to the total audio duration, so it can be matched against chunk times.
            word_time = (text.find(word) / len(text)) * total_duration if total_duration else 0
            emphasized = word + " "
            for chunk in audio_emotions:
                # Check if the current word falls within the current audio chunk
                if chunk['start'] <= word_time <= chunk['end']:
                    # Format the word based on the predicted emotion for the chunk
                    if chunk['emotion'] in ['fearful', 'angry', 'disgust']:
                        emphasized = word.upper() + " "
                    elif chunk['emotion'] in ['happy', 'surprised']:
                        emphasized = word + "! "
                    elif chunk['emotion'] == 'sad':
                        emphasized = word.lower() + ".. "
                    break
            final_text += emphasized
        # Append the sentiment emoji to the final text
        sentiment_emoji = self.map_sentiment(text_sentiment)
        final_text += sentiment_emoji
        return final_text.strip()
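
    # NOTE: process_audio_stream is called by start_real_time_processing and by the __main__ examples,
    # but it was not defined in the original gist. The method below is a minimal sketch, assuming it is
    # acceptable to write the captured audio to a temporary WAV file and reuse multimodal_emotion_merge.
    def process_audio_stream(self, audio_stream: sr.AudioData) -> str:
        """
        Process captured audio data and generate emotionally emphasized text output.

        Args:
            audio_stream (sr.AudioData): Audio captured from a microphone or other stream.

        Returns:
            str: The emotionally emphasized text output, or an empty string on failure.
        """
        import tempfile
        tmp_path = None
        try:
            # Persist the captured audio so the file-based pipeline (librosa + sr.AudioFile) can read it
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                tmp.write(audio_stream.get_wav_data())
                tmp_path = tmp.name
            return self.multimodal_emotion_merge(tmp_path)
        finally:
            if tmp_path and os.path.exists(tmp_path):
                os.remove(tmp_path)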
    def send_to_llama_model(self, text: str) -> str:
        """
        Send the emotionally emphasized text to the Llama 3 70B model through a TGI server running on RunPod.

        Args:
            text (str): The emotionally emphasized text to be sent to the Llama model.

        Returns:
            str: The generated response from the Llama model.

        Raises:
            requests.exceptions.RequestException: If there is an error with the request to the RunPod API.
        """
        payload = {
            "prompt": text,
            "max_tokens": 100,
            "temperature": 0.7,
            "top_p": 0.9,
            "stop": ["\n"]
        }
        try:
            response = requests.post(self.llama_api_url, headers=self.llama_api_headers, json=payload)
            response.raise_for_status()
            response_data = response.json()
            generated_text = response_data["choices"][0]["text"].strip()
            return generated_text
        except requests.exceptions.RequestException as e:
            logging.error(f"Error with RunPod API request: {e}")
            return ""
    def start_real_time_processing(self):
        """
        Start real-time processing of audio input from the user's microphone.

        This method continuously listens to the microphone input, generates emotionally emphasized text output,
        sends the text to the Llama model, and streams the audio response back to the user. It loops until the
        process is interrupted.
        """
        with self.microphone as source:
            print("Say something...")
            while True:
                audio_stream = self.recognizer.listen(source)
                try:
                    emotionally_emphasized_text = self.process_audio_stream(audio_stream)
                    print("Emotionally Emphasized Text:", emotionally_emphasized_text)
                    llama_response = self.send_to_llama_model(emotionally_emphasized_text)
                    print("Llama Model Response:", llama_response)
                    self.stream_audio_response(llama_response)
                except sr.UnknownValueError:
                    print("Could not understand audio")
                except sr.RequestError as e:
                    print(f"Error: {e}")
                except requests.exceptions.RequestException as e:
                    print(f"Error with RunPod API request: {e}")
    def stream_audio_response(self, text: str, voice: str = "fable", model: str = "tts-1-hd", response_format: str = "pcm") -> None:
        """
        Stream an audio response for the given text using the OpenAI Text-to-Speech API.

        Args:
            text (str): The text to be converted to speech.
            voice (str, optional): The voice to use for the audio response. Defaults to "fable".
            model (str, optional): The Text-to-Speech model to use. Defaults to "tts-1-hd".
            response_format (str, optional): The audio format for the response. Defaults to "pcm"
                (raw 24 kHz, 16-bit signed little-endian samples), which sounddevice can play back
                directly without an extra decoding step.

        Raises:
            openai.OpenAIError: If there is an error with the OpenAI Text-to-Speech API request.
        """
        try:
            spoken_response = openai.audio.speech.create(
                model=model,
                voice=voice,
                response_format=response_format,
                input=text,
            )
            # Collect the raw PCM bytes and play them through the default output device
            audio_bytes = b"".join(spoken_response.iter_bytes())
            audio_array = np.frombuffer(audio_bytes, dtype=np.int16)
            sd.play(audio_array, samplerate=24000)
            sd.wait()
        except openai.OpenAIError as e:
            logging.error(f"Error with OpenAI Text-to-Speech API request: {e}")
if __name__ == "__main__":
recognizer = EmotionRecognizer()
# Example 1: Processing an audio file
audio_file_path = "path/to/audio/file.wav"
try:
output_text = recognizer.multimodal_emotion_merge(audio_file_path)
print(f"Emotionally emphasized text from audio file: {output_text}")
except (FileNotFoundError, sr.UnknownValueError, sr.RequestError) as e:
print(f"Error processing audio file: {e}")
# Example 2: Processing an audio stream
print("Say something to process an audio stream...")
with sr.Microphone() as source:
try:
audio_stream = recognizer.recognizer.listen(source)
output_text = recognizer.process_audio_stream(audio_stream)
print(f"Emotionally emphasized text from audio stream: {output_text}")
except (sr.UnknownValueError, sr.RequestError) as e:
print(f"Error processing audio stream: {e}")
# Example 3: Real-time processing from the user's microphone
print("Starting real-time processing from the microphone...")
print("Speak into the microphone, and the emotionally emphasized text will be printed in real-time.")
recognizer.start_real_time_processing()
# Example 4: Interacting with a language model and streaming an audio response
print("Say something to interact with the Llama model and stream an audio response...")
with sr.Microphone() as source:
try:
audio_stream = recognizer.recognizer.listen(source)
emotionally_emphasized_text = recognizer.process_audio_stream(audio_stream)
print(f"Emotionally emphasized text: {emotionally_emphasized_text}")
llama_response = recognizer.send_to_llama_model(emotionally_emphasized_text)
print(f"Llama model response: {llama_response}")
print("Streaming audio response...")
recognizer.stream_audio_response(llama_response, voice="fable")
except (sr.UnknownValueError, sr.RequestError, requests.exceptions.RequestException) as e:
print(f"Error processing audio or interacting with Llama model: {e}")