A chatbot that generates a response based on user emotion
import speech_recognition as sr
import librosa
import numpy as np
from transformers import pipeline, AutoModelForAudioClassification, AutoFeatureExtractor
import openai
import sounddevice as sd
import io
import os
import requests
from dotenv import load_dotenv

load_dotenv()

import logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
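
# NOTE (assumption): the class below reads its credentials from environment variables that a
# local .env file (loaded above via python-dotenv) is expected to provide; for example:
#
#   OPENAI_API_KEY=<your OpenAI API key>
#   RUNPOD_API_KEY=<your RunPod API key>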

class EmotionRecognizer:
    """
    A class for recognizing emotions in audio data, generating emotionally emphasized text output, and streaming audio responses.

    This class combines speech emotion recognition, text sentiment analysis, and language model interaction to produce
    emotionally emphasized text output based on the input audio data. It can process both audio files and audio streams,
    including real-time audio input from the user's microphone. The class also supports streaming audio responses using
    the OpenAI Text-to-Speech API.

    Dependencies:
        - speech_recognition
        - librosa
        - transformers
        - openai
        - sounddevice
        - requests
        - python-dotenv

    Parameters:
        None

    Usage:
        1. Instantiate the EmotionRecognizer class:
            recognizer = EmotionRecognizer()
        2. Process an audio file and generate emotionally emphasized text output:
            output_text = recognizer.multimodal_emotion_merge(audio_file_path)
        3. Process an audio stream and generate emotionally emphasized text output:
            output_text = recognizer.process_audio_stream(audio_stream)
        4. Start real-time processing from the user's microphone:
            recognizer.start_real_time_processing()
        5. Interact with a language model and stream an audio response:
            - The user speaks into the microphone, and the audio is captured.
            - The captured audio is processed to recognize emotions and generate emotionally emphasized text.
            - The emotionally emphasized text is sent to a language model (e.g., Llama 3 70B) to generate a response.
            - The generated response is converted to speech using the OpenAI Text-to-Speech API.
            - The audio response is streamed back to the user in a specified voice.

    Examples:
        # Processing an audio file
        recognizer = EmotionRecognizer()
        output_text = recognizer.multimodal_emotion_merge("path/to/audio/file.wav")
        print(output_text)
        # Expected output: I am VERY MAD?! 😠

        # Processing an audio stream
        recognizer = EmotionRecognizer()
        with sr.Microphone() as source:
            audio_stream = recognizer.recognizer.listen(source)
        output_text = recognizer.process_audio_stream(audio_stream)
        print(output_text)
        # Expected output: I loovve you! 😊

        # Real-time processing from the user's microphone
        recognizer = EmotionRecognizer()
        recognizer.start_real_time_processing()
        # Speak into the microphone, and the emotionally emphasized text will be printed in real time

        # Interacting with a language model and streaming an audio response
        recognizer = EmotionRecognizer()
        with sr.Microphone() as source:
            print("Say something...")
            audio_stream = recognizer.recognizer.listen(source)
        emotionally_emphasized_text = recognizer.process_audio_stream(audio_stream)
        response_text = recognizer.send_to_llama_model(emotionally_emphasized_text)
        recognizer.stream_audio_response(response_text, voice="fable")
        # The user's speech is processed, sent to the Llama model for a response, and the response is streamed back as audio
    """
    def __init__(self):
        self.emotion_model, self.feature_extractor = self.load_speech_emotion_model()
        self.sentiment_model = pipeline("sentiment-analysis")
        self.recognizer = sr.Recognizer()
        self.microphone = sr.Microphone()
        openai.api_key = os.environ.get("OPENAI_API_KEY")
        self.llama_api_url = "https://your-runpod-endpoint.com/v1/generate"
        self.llama_api_headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('RUNPOD_API_KEY')}"
        }
    def load_speech_emotion_model(self):
        """
        Load the pre-trained speech emotion recognition model and feature extractor.
        Returns:
            model (AutoModelForAudioClassification): The loaded speech emotion recognition model.
            feature_extractor (AutoFeatureExtractor): The loaded feature extractor for the model.
        """
        model_name = "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
        try:
            model = AutoModelForAudioClassification.from_pretrained(model_name)
            feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
        except Exception as e:
            logging.error(f"Failed to load speech emotion model: {e}")
            raise
        # The original version never returned the loaded objects, although __init__ expects both.
        return model, feature_extractor
    def analyze_audio_emotion(self, audio_file, chunk_size=3):
        """
        Analyze the emotion in short chunks of the given audio file.
        Args:
            audio_file (str): Path to the audio file.
            chunk_size (int, optional): Size of the audio chunks in seconds. Default is 3 seconds.
        Returns:
            chunk_emotions (list): A list of dictionaries containing the start and end times of each chunk,
                                   and the predicted emotion for that chunk.
        """
        # Load the audio file, resampling to the rate the feature extractor expects
        # (16 kHz for wav2vec2-based models) rather than librosa's 22.05 kHz default.
        try:
            target_rate = self.feature_extractor.sampling_rate
            audio, sample_rate = librosa.load(audio_file, sr=target_rate)
        except Exception as e:
            logging.error(f"Failed to load audio file: {e}")
            return []
        chunk_emotions = []
        for i in range(0, len(audio), int(sample_rate * chunk_size)):
            # Extract the current audio chunk
            chunk = audio[i:i + int(sample_rate * chunk_size)]
            # Extract features from the audio chunk using the loaded feature extractor
            inputs = self.feature_extractor(chunk, sampling_rate=sample_rate, return_tensors="pt")
            # Predict the emotion for the audio chunk using the loaded model
            outputs = self.emotion_model(**inputs)
            predicted_id = outputs.logits.argmax(dim=-1).item()
            emotion = self.emotion_model.config.id2label[predicted_id]
            # Append the chunk details (start, end, emotion) to the list
            chunk_emotions.append({
                'start': i / sample_rate,
                'end': (i + int(sample_rate * chunk_size)) / sample_rate,
                'emotion': emotion
            })
        return chunk_emotions
    def speech_to_text(self, audio_data: sr.AudioData) -> str:
        """
        Transcribe audio data to text using Google Speech Recognition.
        Args:
            audio_data (sr.AudioData): The audio data to be transcribed. This is typically obtained by recording
                                       audio from a microphone or loading an audio file.
        Returns:
            str: The transcribed text. If the audio data cannot be transcribed, an empty string is returned.
        Note:
            sr.UnknownValueError (unintelligible audio, e.g. background noise, unclear speech, or an unsupported
            language) and sr.RequestError (network issues or service unavailability) are caught and logged;
            an empty string is returned in both cases.
        """
        if not audio_data:
            logging.warning("No audio data provided for transcription.")
            return ""
        try:
            text = self.recognizer.recognize_google(audio_data)
            return text
        except sr.UnknownValueError:
            logging.error("Could not understand audio. Please check for background noise, clear speech, and supported languages.")
        except sr.RequestError as e:
            logging.error(f"Error with speech recognition service request: {e}")
        return ""
    def map_sentiment(self, sentiment: str) -> str:
        """
        Map the sentiment label to an emoji representation.
        Args:
            sentiment (str): The sentiment label ('POSITIVE', 'NEGATIVE', or 'NEUTRAL').
        Returns:
            str: The corresponding emoji for the given sentiment label.
        Raises:
            ValueError: If the sentiment label is not one of 'POSITIVE', 'NEGATIVE', or 'NEUTRAL'.
        """
        if sentiment == 'POSITIVE':
            return "😊"
        elif sentiment == 'NEGATIVE':
            return "😠"
        elif sentiment == 'NEUTRAL':
            return ""
        else:
            raise ValueError(f"Invalid sentiment label: {sentiment}")
    def multimodal_emotion_merge(self, audio_file: str) -> str:
        """
        Merge the audio emotion analysis and text sentiment analysis to generate an emotionally emphasized text output.
        Args:
            audio_file (str): Path to the audio file.
        Returns:
            str: The emotionally emphasized text output. An empty string is returned if the file is missing
                 or the audio cannot be transcribed.
        """
        try:
            with sr.AudioFile(audio_file) as source:
                audio_data = self.recognizer.record(source)
        except FileNotFoundError:
            logging.error(f"Audio file not found: {audio_file}")
            return ""
        # Transcribe the audio file to text
        text = self.speech_to_text(audio_data)
        if not text:
            return ""
        # Analyze the emotion in audio chunks
        audio_emotions = self.analyze_audio_emotion(audio_file)
        # Get the overall sentiment of the transcribed text
        text_sentiment = self.sentiment_model(text)[0]['label']
        final_text = ""
        # Approximate the total duration from the last analyzed chunk so that each word's relative
        # position in the transcript can be mapped to a point in time (the original compared a 0-1
        # character fraction directly against chunk times in seconds, so words rarely matched a chunk).
        total_duration = audio_emotions[-1]['end'] if audio_emotions else 0
        for word in text.split():
            word_time = (text.find(word) / len(text)) * total_duration if total_duration else 0
            emotion = None
            for chunk in audio_emotions:
                if chunk['start'] <= word_time <= chunk['end']:
                    emotion = chunk['emotion']
                    break
            # Format the word based on the predicted emotion for its chunk
            if emotion in ['fearful', 'angry', 'disgust']:
                final_text += word.upper() + " "
            elif emotion in ['happy', 'surprised']:
                final_text += word + "! "
            elif emotion == 'sad':
                final_text += word.lower() + ".. "
            else:
                final_text += word + " "
        # Append the sentiment emoji to the final text
        sentiment_emoji = self.map_sentiment(text_sentiment)
        final_text += sentiment_emoji
        return final_text.strip()
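    def process_audio_stream(self, audio_stream: sr.AudioData) -> str:
        """
        Generate emotionally emphasized text from captured microphone audio.

        NOTE: This method is referenced throughout the class (docstring examples,
        start_real_time_processing, and the __main__ examples) but was missing from the
        original file. The implementation below is a minimal sketch of the presumed intent:
        it writes the captured audio to a temporary WAV file and reuses the file-based pipeline.
        """
        import tempfile
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            # sr.AudioData.get_wav_data() returns the captured audio as WAV-encoded bytes.
            tmp.write(audio_stream.get_wav_data())
            tmp_path = tmp.name
        try:
            return self.multimodal_emotion_merge(tmp_path)
        finally:
            os.remove(tmp_path)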
    def send_to_llama_model(self, text: str) -> str:
        """
        Send the emotionally emphasized text to the Llama 3 70B model through a TGI server running on RunPod.
        Args:
            text (str): The emotionally emphasized text to be sent to the Llama model.
        Returns:
            str: The generated response from the Llama model.
        Note:
            requests.exceptions.RequestException is caught and logged; an empty string is returned on failure.
        """
        payload = {
            "prompt": text,
            "max_tokens": 100,
            "temperature": 0.7,
            "top_p": 0.9,
            "stop": ["\n"]
        }
        try:
            response = requests.post(self.llama_api_url, headers=self.llama_api_headers, json=payload)
            response.raise_for_status()
            response_data = response.json()
            generated_text = response_data["choices"][0]["text"].strip()
            return generated_text
        except requests.exceptions.RequestException as e:
            logging.error(f"Error with RunPod API request: {e}")
            return ""
    def start_real_time_processing(self):
        """
        Start real-time processing of audio input from the user's microphone.
        This method continuously listens to the microphone input, generates emotionally emphasized text output,
        sends the text to the Llama model, and streams the audio response back to the user.
        """
        with self.microphone as source:
            print("Say something...")
            while True:
                audio_stream = self.recognizer.listen(source)
                try:
                    emotionally_emphasized_text = self.process_audio_stream(audio_stream)
                    print("Emotionally Emphasized Text:", emotionally_emphasized_text)
                    llama_response = self.send_to_llama_model(emotionally_emphasized_text)
                    print("Llama Model Response:", llama_response)
                    self.stream_audio_response(llama_response)
                except sr.UnknownValueError:
                    print("Could not understand audio")
                except sr.RequestError as e:
                    print(f"Error: {e}")
                except requests.exceptions.RequestException as e:
                    print(f"Error with RunPod API request: {e}")
    def stream_audio_response(self, text: str, voice: str = "fable", model: str = "tts-1-hd", response_format: str = "pcm") -> None:
        """
        Stream an audio response for the given text using the OpenAI Text-to-Speech API.
        Args:
            text (str): The text to be converted to speech.
            voice (str, optional): The voice to use for the audio response. Defaults to "fable".
            model (str, optional): The Text-to-Speech model to use. Defaults to "tts-1-hd".
            response_format (str, optional): The audio format for the response. Defaults to "pcm"
                                             (raw 24 kHz, 16-bit, mono), which can be played back directly.
        Raises:
            openai.OpenAIError: Caught and logged if the OpenAI Text-to-Speech API request fails.
        """
        try:
            spoken_response = openai.audio.speech.create(
                model=model,
                voice=voice,
                response_format=response_format,
                input=text,
            )
            # The original playback used sd.InputStream (which records rather than plays) and a
            # non-existent duration attribute. Assuming raw PCM output, the response bytes can be
            # interpreted as 16-bit mono samples at 24 kHz and played back; other formats would
            # need decoding first.
            pcm_bytes = b"".join(spoken_response.iter_bytes())
            audio_array = np.frombuffer(pcm_bytes, dtype=np.int16)
            sd.play(audio_array, samplerate=24000)
            sd.wait()
        except openai.OpenAIError as e:
            logging.error(f"Error with OpenAI Text-to-Speech API request: {e}")

if __name__ == "__main__":
    recognizer = EmotionRecognizer()

    # Example 1: Processing an audio file
    audio_file_path = "path/to/audio/file.wav"
    try:
        output_text = recognizer.multimodal_emotion_merge(audio_file_path)
        print(f"Emotionally emphasized text from audio file: {output_text}")
    except (FileNotFoundError, sr.UnknownValueError, sr.RequestError) as e:
        print(f"Error processing audio file: {e}")

    # Example 2: Processing an audio stream
    print("Say something to process an audio stream...")
    with sr.Microphone() as source:
        try:
            audio_stream = recognizer.recognizer.listen(source)
            output_text = recognizer.process_audio_stream(audio_stream)
            print(f"Emotionally emphasized text from audio stream: {output_text}")
        except (sr.UnknownValueError, sr.RequestError) as e:
            print(f"Error processing audio stream: {e}")

    # Example 3: Real-time processing from the user's microphone
    # Note: this call loops until the process is interrupted, so Example 4 below will only
    # run if this example is skipped or stopped.
    print("Starting real-time processing from the microphone...")
    print("Speak into the microphone, and the emotionally emphasized text will be printed in real time.")
    recognizer.start_real_time_processing()

    # Example 4: Interacting with a language model and streaming an audio response
    print("Say something to interact with the Llama model and stream an audio response...")
    with sr.Microphone() as source:
        try:
            audio_stream = recognizer.recognizer.listen(source)
            emotionally_emphasized_text = recognizer.process_audio_stream(audio_stream)
            print(f"Emotionally emphasized text: {emotionally_emphasized_text}")
            llama_response = recognizer.send_to_llama_model(emotionally_emphasized_text)
            print(f"Llama model response: {llama_response}")
            print("Streaming audio response...")
            recognizer.stream_audio_response(llama_response, voice="fable")
        except (sr.UnknownValueError, sr.RequestError, requests.exceptions.RequestException) as e:
            print(f"Error processing audio or interacting with Llama model: {e}")