Live chat using Ollama, KittenTTS, and a Vosk model for speech-to-text
import ollama
import vosk
import sounddevice as sd
import json
import numpy as np
import torch
import soundfile as sf
import os
import re

# --- CONFIGURATION ---
VOSK_MODEL_PATH = "vosk-model-small-en-us-0.15"
OLLAMA_MODEL = "gemma3:1b"
KITTEN_VOICE = 'expr-voice-5-f'
MIC_DEVICE_INDEX = None  # None uses the system default input device
SAMPLE_RATE = 16000
CHUNK_SIZE = 512  # Silero VAD expects 512-sample chunks at 16 kHz
VAD_CONFIDENCE_THRESHOLD = 0.5
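
# To target a specific microphone, you can list the available audio devices
# first (run separately) and set MIC_DEVICE_INDEX to the desired input index:
#     import sounddevice as sd
#     print(sd.query_devices())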

# --- INITIALIZATION ---
print("Loading KittenTTS model...")
try:
    from kittentts import KittenTTS
    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
    print("KittenTTS model loaded.")
except Exception as e:
    print(f"Failed to load KittenTTS model: {e}")
    exit(1)

print("Loading Vosk STT model...")
if not os.path.exists(VOSK_MODEL_PATH):
    print(f"Vosk model not found at '{VOSK_MODEL_PATH}'.")
    exit(1)
vosk.SetLogLevel(-1)  # Silence Vosk's internal logging before loading the model
vosk_model = vosk.Model(VOSK_MODEL_PATH)
print("Vosk STT model loaded.")

# --- HELPER FUNCTIONS ---
def clean_text(text):
    """Removes emojis, symbols, and markdown asterisks so that only plain,
    speakable text reaches the TTS engine."""
    # Regex covering the most common emoji and symbol blocks
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # enclosed characters and misc symbols
        "]+",
        flags=re.UNICODE,
    )
    # Also strip markdown asterisks (bold/italic markers)
    text = text.replace('*', '')
    return emoji_pattern.sub('', text)
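
# For example, clean_text("Sure! 😀 *Here you go*") returns "Sure!  Here you go":
# the asterisks and the emoji are stripped, leaving plain text for the TTS.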

def speak(text):
    """Cleans the text and adds padding before synthesis so the last word
    is not cut off during playback."""
    cleaned_text = clean_text(text)
    # Trailing dots pad the audio so playback does not clip the final word
    text_to_speak = cleaned_text.strip() + " . . . ."
    print(f"🤖 AI: {cleaned_text}")
    try:
        audio_data = tts_model.generate(text_to_speak, voice=KITTEN_VOICE)
        sd.play(audio_data, samplerate=24000)  # playback at the model's 24 kHz rate
        sd.wait()  # block until playback finishes
    except Exception as e:
        print(f"Error during Text-to-Speech: {e}")

def get_ai_response(prompt):
    """Sends the prompt to Ollama, guided by a system prompt, streams the
    reply to the console, and then speaks it."""
    print("🧠 Thinking...")
    system_prompt = """
    You are a conversational AI voice assistant integrated into a live voice chat system.
    Your name is not important. You are speaking directly with your creator, Ali.
    Your environment is a real-time voice call. This means your responses must be tailored for spoken conversation.
    Follow these rules strictly:
    1. **Be Brief and Conversational:** Your primary goal is to be concise. Keep your answers short and to the point, like you're having a natural, spoken conversation. Avoid long monologues. Aim for one to three short sentences per response.
    2. **No Markdown or Formatting:** Do NOT use any markdown formatting. This includes asterisks for bolding or italics (`*like this*`), bullet points (`-` or `*`), or any other special characters. Your output must be plain, clean text suitable for a text-to-speech engine.
    3. **No Emojis:** Do not use any emojis.
    4. **Speak Naturally:** Use a friendly, direct, and helpful tone. Remember, you are talking, not writing a document.
    5. **Acknowledge Your Creator:** You are speaking with Ali. You can refer to him by name if it feels natural, but don't overdo it. You are aware that he built you.
    """
    try:
        # Stream the response so tokens print as they arrive
        stream = ollama.chat(
            model=OLLAMA_MODEL,
            messages=[
                {'role': 'system', 'content': system_prompt},
                {'role': 'user', 'content': prompt},
            ],
            stream=True,
        )
        full_response = ""
        for chunk in stream:
            content = chunk['message']['content']
            print(content, end='', flush=True)
            full_response += content
        print()
        # Speak only once the full response has been collected
        speak(full_response)
    except Exception as e:
        print(f"Error communicating with Ollama: {e}")

# --- MAIN LOOP ---
def main():
    print("Loading Silero VAD model...")
    vad_model, utils = torch.hub.load(
        repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=False
    )
    (get_speech_timestamps, _, read_audio, VADIterator, _) = utils
    vad_iterator = VADIterator(vad_model, threshold=VAD_CONFIDENCE_THRESHOLD)
    rec = vosk.KaldiRecognizer(vosk_model, SAMPLE_RATE)
    print("All models loaded.")
    print("\n🟢 Voice chat is active. Start speaking.")

    with sd.InputStream(samplerate=SAMPLE_RATE, blocksize=CHUNK_SIZE,
                        device=MIC_DEVICE_INDEX, channels=1,
                        dtype='float32') as stream:
        while True:
            try:
                print("... Listening for speech ...")
                audio_chunk_numpy, _ = stream.read(CHUNK_SIZE)
                audio_chunk_tensor = torch.from_numpy(audio_chunk_numpy.flatten())
                speech_dict = vad_iterator(audio_chunk_tensor, return_seconds=False)
                if speech_dict and 'start' in speech_dict:
                    print("🎤 Speech detected, recording...")
                    voice_buffer = [audio_chunk_tensor]
                    # Keep buffering audio until the VAD reports end of speech
                    while True:
                        audio_chunk_numpy, _ = stream.read(CHUNK_SIZE)
                        audio_chunk_tensor = torch.from_numpy(audio_chunk_numpy.flatten())
                        voice_buffer.append(audio_chunk_tensor)
                        end_speech_dict = vad_iterator(audio_chunk_tensor, return_seconds=False)
                        if end_speech_dict and 'end' in end_speech_dict:
                            print("🤫 Silence detected, processing...")
                            break
                    full_speech_tensor = torch.cat(voice_buffer)
                    vad_iterator.reset_states()
                    # Vosk expects 16-bit PCM, so convert from float32 in [-1, 1]
                    speech_audio_int16 = (full_speech_tensor * 32767).numpy().astype(np.int16)
                    rec.AcceptWaveform(speech_audio_int16.tobytes())
                    result = json.loads(rec.FinalResult())
                    prompt = result.get('text', '').strip()
                    if prompt:
                        print(f"👤 You: {prompt}")
                        get_ai_response(prompt)
                    else:
                        print("🤔 Hmm, I didn't catch that. Please try again.")
                    print("\n" + ("-" * 50))
            except KeyboardInterrupt:
                print("\n🔴 Exiting voice chat.")
                break
            except Exception as e:
                import traceback
                print(f"An error occurred in the main loop: {e}")
                traceback.print_exc()
                break


if __name__ == "__main__":
    main()
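
# Rough dependency list, assumed from the imports above (versions not pinned):
#     pip install ollama vosk sounddevice numpy torch soundfile
# kittentts may need to be installed from KittenML's release wheel rather than
# PyPI, and Ollama must be running locally with the chat model pulled:
#     ollama pull gemma3:1b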