Skip to content

Instantly share code, notes, and snippets.

@its3li
Created August 8, 2025 20:15
Show Gist options
  • Save its3li/4d7c477e988eac78727d115a952d588e to your computer and use it in GitHub Desktop.
Save its3li/4d7c477e988eac78727d115a952d588e to your computer and use it in GitHub Desktop.
Live chat using Ollama, KittenTTS, and a Vosk model for speech-to-text
import json
import os
import re
import sys

import numpy as np
import ollama
import sounddevice as sd
import soundfile as sf
import torch
import vosk
# --- CONFIGURATION ---
# Path checked for existence and then handed to vosk.Model().
VOSK_MODEL_PATH = "vosk-model-small-en-us-0.15"
# Model name passed to ollama.chat().
OLLAMA_MODEL = "gemma3:1b"
# Voice identifier passed to the KittenTTS generate() call.
KITTEN_VOICE = 'expr-voice-5-f'
# Passed as `device` to sd.InputStream; None presumably selects the
# system default input device -- confirm against the sounddevice docs.
MIC_DEVICE_INDEX = None
# Sample rate used for both the microphone stream and the Vosk recognizer.
SAMPLE_RATE = 16000
# Frames per microphone read; also the size of each chunk fed to the VAD.
CHUNK_SIZE = 512
# Passed as `threshold` to the Silero VADIterator.
VAD_CONFIDENCE_THRESHOLD = 0.5
# --- INITIALIZATION ---
# Both models are loaded at import time; the script exits with status 1 if
# either one is unavailable, since nothing else can run without them.
print("Loading KittenTTS model...")
try:
    from kittentts import KittenTTS
    tts_model = KittenTTS("KittenML/kitten-tts-nano-0.1")
    print("KittenTTS model loaded.")
except Exception as e:
    print(f"Failed to load KittenTTS model: {e}")
    # sys.exit rather than the bare exit() builtin, which is only installed
    # by the `site` module for interactive sessions.
    sys.exit(1)

print("Loading Vosk STT model...")
# Use os.path directly instead of reaching through vosk's own `os` import,
# which is an implementation detail of that package.
if not os.path.exists(VOSK_MODEL_PATH):
    print(f"Vosk model not found at '{VOSK_MODEL_PATH}'.")
    sys.exit(1)
# Set the log level before constructing the model so that the model-loading
# output is silenced as well.
vosk.SetLogLevel(-1)
vosk_model = vosk.Model(VOSK_MODEL_PATH)
print("Vosk STT model loaded.")
# --- HELPER FUNCTIONS ---
# Compiled once at import time; clean_text() runs on every AI reply.
# The ranges are limited to symbol/pictograph blocks so that ordinary letters
# in any script (including CJK, kana and Hangul) are left untouched.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics/ideographs
    "\U000024C2-\U000027BF"  # circled letters, box/geometric shapes, misc symbols, dingbats
    "\U00002B00-\U00002BFF"  # miscellaneous symbols and arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # emoji symbols in the CJK blocks
    "\U0000FE00-\U0000FE0F"  # variation selectors that trail emoji
    "\U0000200D"  # zero-width joiner used inside emoji sequences
    "]+",
    flags=re.UNICODE,
)


def clean_text(text):
    """
    Strip emojis, pictographic symbols and markdown asterisks from *text*.

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* with every ``*`` removed and every run of characters
        matched by ``_EMOJI_PATTERN`` deleted.  Surrounding whitespace is left
        as-is, so removing an emoji between two spaces leaves both spaces.
    """
    # Drop markdown emphasis markers first, then the emoji/symbol runs.
    without_markdown = text.replace('*', '')
    return _EMOJI_PATTERN.sub('', without_markdown)
def speak(text):
    """
    Say *text* aloud through KittenTTS and block until playback finishes.

    The text is first run through clean_text(); the cleaned version is echoed
    to the console.  A short run of trailing periods is appended to what is
    actually synthesised so the last word is not cut off.  Any exception
    raised while generating or playing the audio is printed and swallowed.
    """
    spoken = clean_text(text)
    print(f"🤖 AI: {spoken}")  # Show the cleaned text, not the raw reply
    # Padding so the synthesiser does not clip the final word.
    padded = f"{spoken.strip()} . . . ."
    try:
        waveform = tts_model.generate(padded, voice=KITTEN_VOICE)
        sd.play(waveform, samplerate=24000)
        sd.wait()
    except Exception as e:
        print(f"Error during Text-to-Speech: {e}")
def get_ai_response(prompt):
    """
    Send *prompt* to the Ollama model and speak the reply.

    A fixed system prompt is sent ahead of the user message.  The reply is
    requested as a stream: each piece is printed as it arrives, and once the
    stream is exhausted the concatenated reply is handed to speak().  Any
    exception raised along the way is printed and swallowed; nothing is
    returned.
    """
    print("🧠 Thinking...")
    # System prompt steering the model toward short, plain-text spoken replies.
    system_prompt = """
You are a conversational AI voice assistant integrated into a live voice chat system.
Your name is not important. You are speaking directly with your creator, Ali.
Your environment is a real-time voice call. This means your responses must be tailored for spoken conversation.
Follow these rules strictly:
1. **Be Brief and Conversational:** Your primary goal is to be concise. Keep your answers short and to the point, like you're having a natural, spoken conversation. Avoid long monologues. Aim for one to three short sentences per response.
2. **No Markdown or Formatting:** Do NOT use any markdown formatting. This includes asterisks for bolding or italics (`*like this*`), bullet points (`-` or `*`), or any other special characters. Your output must be plain, clean text suitable for a text-to-speech engine.
3. **No Emojis:** Do not use any emojis.
4. **Speak Naturally:** Use a friendly, direct, and helpful tone. Remember, you are talking, not writing a document.
5. **Acknowledge Your Creator:** You are speaking with Ali. You can refer to him by name if it feels natural, but don't overdo it. You are aware that he built you.
"""
    conversation = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': prompt},
    ]
    try:
        reply_stream = ollama.chat(
            model=OLLAMA_MODEL,
            messages=conversation,
            stream=True,
        )
        pieces = []
        for part in reply_stream:
            piece = part['message']['content']
            print(piece, end='', flush=True)  # Live echo while streaming
            pieces.append(piece)
        print()
        speak("".join(pieces))
    except Exception as e:
        print(f"Error communicating with Ollama: {e}")
# --- MAIN LOOP ---
def _read_chunk(stream):
    """Read CHUNK_SIZE frames from *stream* and return them as a 1-D tensor."""
    samples, _overflowed = stream.read(CHUNK_SIZE)
    return torch.from_numpy(samples.flatten())


def _transcribe(rec, speech_tensor):
    """Feed *speech_tensor* to the Vosk recognizer and return the stripped text."""
    # Clamp before scaling so out-of-range float samples cannot overflow int16.
    pcm16 = (speech_tensor.clamp(-1.0, 1.0) * 32767).numpy().astype(np.int16)
    rec.AcceptWaveform(pcm16.tobytes())
    result = json.loads(rec.FinalResult())
    return result.get('text', '').strip()


def main():
    """
    Run the interactive voice-chat loop until interrupted or an error occurs.

    Loads the Silero VAD model, opens the microphone, and then repeatedly:
    waits for the VAD to report the start of speech, buffers audio until it
    reports the end, transcribes the buffer with Vosk, and passes any
    recognised text to get_ai_response().  Ctrl+C exits cleanly; any other
    exception is printed with its traceback and also ends the loop.
    """
    import traceback

    print("Loading Silero VAD model...")
    vad_model, utils = torch.hub.load(repo_or_dir='snakers4/silero-vad', model='silero_vad', force_reload=False)
    # Only the iterator class is needed from the five utilities returned.
    _, _, _, VADIterator, _ = utils
    vad_iterator = VADIterator(vad_model, threshold=VAD_CONFIDENCE_THRESHOLD)
    rec = vosk.KaldiRecognizer(vosk_model, SAMPLE_RATE)
    print("All models loaded.")
    print("\n🟢 Voice chat is active. Start speaking.")
    with sd.InputStream(samplerate=SAMPLE_RATE, blocksize=CHUNK_SIZE, device=MIC_DEVICE_INDEX, channels=1, dtype='float32') as stream:
        # Print the "listening" banner once per turn rather than once per
        # chunk, which would flood the console many times per second.
        announce_listening = True
        while True:
            try:
                if announce_listening:
                    print("... Listening for speech ...")
                    announce_listening = False

                chunk = _read_chunk(stream)
                vad_event = vad_iterator(chunk, return_seconds=False)
                if not (vad_event and 'start' in vad_event):
                    continue

                print("🎤 Speech detected, recording...")
                voice_buffer = [chunk]
                while True:
                    chunk = _read_chunk(stream)
                    voice_buffer.append(chunk)
                    vad_event = vad_iterator(chunk, return_seconds=False)
                    if vad_event and 'end' in vad_event:
                        print("🤫 Silence detected, processing...")
                        break

                vad_iterator.reset_states()
                prompt = _transcribe(rec, torch.cat(voice_buffer))
                if prompt:
                    print(f"👤 You: {prompt}")
                    # Pause capture while the reply is generated and spoken so
                    # the assistant's own voice (and stale buffered audio) is
                    # not picked up as the next utterance.
                    stream.stop()
                    get_ai_response(prompt)
                    stream.start()
                else:
                    print("🤔 Hmm, I didn't catch that. Please try again.")
                print("\n" + ("-"*50))
                announce_listening = True
            except KeyboardInterrupt:
                print("\n🔴 Exiting voice chat.")
                break
            except Exception as e:
                print(f"An error occurred in the main loop: {e}")
                traceback.print_exc()
                break
# Start the voice chat only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment