Live chat using Ollama, KittenTTS, and a Vosk model for speech-to-text
import ollama
import vosk
import sounddevice as sd
import json
import numpy as np
import torch
import soundfile as sf
import os
import re

# --- CONFIGURATION ---
VOSK_MODEL_PATH = "vosk-model-small-en-us-0.15"
OLLAMA_MODEL = "gemma3:1b"
KITTEN_VOICE = 'expr-voice-5-f'
MIC_DEVICE_INDEX = None  # None uses the system default input device
SAMPLE_RATE = 16000
CHUNK_SIZE = 512  # Silero VAD expects 512-sample chunks at 16 kHz
VAD_CONFIDENCE_THRESHOLD = 0.5
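
# To target a specific microphone, you can list the available audio devices
# first (run separately) and set MIC_DEVICE_INDEX to the desired input index:
#     import sounddevice as sd
#     print(sd.query_devices())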

# --- INITIALIZATION ---
print("Loading KittenTTS model...")
try:
    from kittentts import KittenTTS
    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
    print("KittenTTS model loaded.")
except Exception as e:
    print(f"Failed to load KittenTTS model: {e}")
    exit(1)

print("Loading Vosk STT model...")
if not os.path.exists(VOSK_MODEL_PATH):
    print(f"Vosk model not found at '{VOSK_MODEL_PATH}'.")
    exit(1)
vosk.SetLogLevel(-1)  # Silence Vosk's internal logging before loading the model
vosk_model = vosk.Model(VOSK_MODEL_PATH)
print("Vosk STT model loaded.")

# --- HELPER FUNCTIONS ---
def clean_text(text):
    """Removes emojis, symbols, and markdown asterisks so that only plain,
    speakable text reaches the TTS engine."""
    # Regex covering the most common emoji and symbol blocks
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # enclosed characters and misc symbols
        "]+",
        flags=re.UNICODE,
    )
    # Also strip markdown asterisks (bold/italic markers)
    text = text.replace('*', '')
    return emoji_pattern.sub('', text)
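
# For example, clean_text("Sure! 😀 *Here you go*") returns "Sure!  Here you go":
# the asterisks and the emoji are stripped, leaving plain text for the TTS.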

def speak(text):
    """Cleans the text and adds padding before synthesis so the last word
    is not cut off during playback."""
    cleaned_text = clean_text(text)
    # Trailing dots pad the audio so playback does not clip the final word
    text_to_speak = cleaned_text.strip() + " . . . ."
    print(f"🤖 AI: {cleaned_text}")
    try:
        audio_data = tts_model.generate(text_to_speak, voice=KITTEN_VOICE)
        sd.play(audio_data, samplerate=24000)  # playback at the model's 24 kHz rate
        sd.wait()  # block until playback finishes
    except Exception as e:
        print(f"Error during Text-to-Speech: {e}")

def get_ai_response(prompt):
    """Sends the prompt to Ollama, guided by a system prompt, streams the
    reply to the console, and then speaks it."""
    print("🧠 Thinking...")
    system_prompt = """
    You are a conversational AI voice assistant integrated into a live voice chat system.
    Your name is not important. You are speaking directly with your creator, Ali.
    Your environment is a real-time voice call. This means your responses must be tailored for spoken conversation.
    Follow these rules strictly:
    1. **Be Brief and Conversational:** Your primary goal is to be concise. Keep your answers short and to the point, like you're having a natural, spoken conversation. Avoid long monologues. Aim for one to three short sentences per response.
    2. **No Markdown or Formatting:** Do NOT use any markdown formatting. This includes asterisks for bolding or italics (`*like this*`), bullet points (`-` or `*`), or any other special characters. Your output must be plain, clean text suitable for a text-to-speech engine.
    3. **No Emojis:** Do not use any emojis.
    4. **Speak Naturally:** Use a friendly, direct, and helpful tone. Remember, you are talking, not writing a document.
    5. **Acknowledge Your Creator:** You are speaking with Ali. You can refer to him by name if it feels natural, but don't overdo it. You are aware that he built you.
    """
    try:
        # Stream the response so tokens print as they arrive
        stream = ollama.chat(
            model=OLLAMA_MODEL,
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': prompt},
            ],
            stream=True,
        )
        full_response = ""
        for chunk in stream:
            content = chunk['message']['content']
            print(content, end='', flush=True)
            full_response += content
        print()
        # Speak only once the full response has been collected
        speak(full_response)
    except Exception as e:
        print(f"Error communicating with Ollama: {e}")

# --- MAIN LOOP ---
def main():
    print("Loading Silero VAD model...")
    vad_model, utils = torch.hub.load(
        repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=False
    )
    (get_speech_timestamps, _, read_audio, VADIterator, _) = utils
    vad_iterator = VADIterator(vad_model, threshold=VAD_CONFIDENCE_THRESHOLD)
    rec = vosk.KaldiRecognizer(vosk_model, SAMPLE_RATE)
    print("All models loaded.")
    print("\n🟢 Voice chat is active. Start speaking.")

    with sd.InputStream(samplerate=SAMPLE_RATE, blocksize=CHUNK_SIZE,
                        device=MIC_DEVICE_INDEX, channels=1,
                        dtype='float32') as stream:
        while True:
            try:
                print("... Listening for speech ...")
                audio_chunk_numpy, _ = stream.read(CHUNK_SIZE)
                audio_chunk_tensor = torch.from_numpy(audio_chunk_numpy.flatten())
                speech_dict = vad_iterator(audio_chunk_tensor, return_seconds=False)
                if speech_dict and 'start' in speech_dict:
                    print("🎤 Speech detected, recording...")
                    voice_buffer = [audio_chunk_tensor]
                    # Keep buffering audio until the VAD reports end of speech
                    while True:
                        audio_chunk_numpy, _ = stream.read(CHUNK_SIZE)
                        audio_chunk_tensor = torch.from_numpy(audio_chunk_numpy.flatten())
                        voice_buffer.append(audio_chunk_tensor)
                        end_speech_dict = vad_iterator(audio_chunk_tensor, return_seconds=False)
                        if end_speech_dict and 'end' in end_speech_dict:
                            print("🤫 Silence detected, processing...")
                            break
                    full_speech_tensor = torch.cat(voice_buffer)
                    vad_iterator.reset_states()
                    # Vosk expects 16-bit PCM, so convert from float32 in [-1, 1]
                    speech_audio_int16 = (full_speech_tensor * 32767).numpy().astype(np.int16)
                    rec.AcceptWaveform(speech_audio_int16.tobytes())
                    result = json.loads(rec.FinalResult())
                    prompt = result.get('text', '').strip()
                    if prompt:
                        print(f"👤 You: {prompt}")
                        get_ai_response(prompt)
                    else:
                        print("🤔 Hmm, I didn't catch that. Please try again.")
                    print("\n" + ("-" * 50))
            except KeyboardInterrupt:
                print("\n🔴 Exiting voice chat.")
                break
            except Exception as e:
                import traceback
                print(f"An error occurred in the main loop: {e}")
                traceback.print_exc()
                break


if __name__ == "__main__":
    main()
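
# Rough dependency list, assumed from the imports above (versions not pinned):
#     pip install ollama vosk sounddevice numpy torch soundfile
# kittentts may need to be installed from KittenML's release wheel rather than
# PyPI, and Ollama must be running locally with the chat model pulled:
#     ollama pull gemma3:1b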