speech to text to speech (Streamlit App) - example in the comments
""" To use: install Ollama (or LLM studio), clone OpenVoice, run this script in the OpenVoice directory | |
git clone https://github.com/myshell-ai/OpenVoice | |
cd OpenVoice | |
git clone https://huggingface.co/myshell-ai/OpenVoice | |
cp -r OpenVoice/* . | |
pip install whisper pynput pyaudio streamlit ollama | |
script source: https://x.com/Thom_Wolf/status/1758140066285658351?s=20 | |
""" | |
import os
import re
import time

import numpy as np
import ollama
import pyaudio
import streamlit as st
import torch
import whisper
from pynput import keyboard

# OpenVoice modules (available after cloning the repo)
import se_extractor
from api import BaseSpeakerTTS, ToneColorConverter
from utils import split_sentences_latin
SYSTEM_MESSAGE = "You are Arisa, an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
SPEAKER_WAV = None
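# NOTE (assumption): pointing SPEAKER_WAV at a short reference recording, e.g.
# SPEAKER_WAV = "my_voice_sample.wav"  (hypothetical path), makes se_extractor
# compute a target speaker embedding below, and play_audio() will then re-color
# the base TTS output toward that voice. With None, the default English speaker
# is used unchanged.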
llm_client = ollama.Client()

# Load the OpenVoice checkpoints: base English speaker TTS + tone color converter
tts_en_ckpt_base = os.path.join(os.path.dirname(__file__), "checkpoints/base_speakers/EN")
tts_ckpt_converter = os.path.join(os.path.dirname(__file__), "checkpoints/converter")
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
tts_model = BaseSpeakerTTS(f'{tts_en_ckpt_base}/config.json', device=device)
tts_model.load_ckpt(f'{tts_en_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else (None, None)
sampling_rate = tts_model.hps.data.sampling_rate
mark = tts_model.language_marks.get("english", None)
asr_model = whisper.load_model("base.en")
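# NOTE: "base.en" is Whisper's English-only base checkpoint; other sizes
# (e.g. "small.en" for better accuracy at higher latency) are drop-in swaps.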
def play_audio(text):
    """Synthesize `text` sentence by sentence with the base TTS model,
    optionally re-color it toward the target speaker, and play it back."""
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paFloat32, channels=1, rate=sampling_rate, output=True)
    texts = split_sentences_latin(text)
    for t in texts:
        audio_list = []
        t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)  # split camelCase runs for cleaner TTS
        t = f'[{mark}]{t}[{mark}]'
        stn_tst = tts_model.get_text(t, tts_model.hps, False)
        with torch.no_grad():
            x_tst = stn_tst.unsqueeze(0).to(tts_model.device)
            x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
            sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
            audio = tts_model.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6)[0][0, 0].data.cpu().float().numpy()
            if target_se is not None:
                audio = tone_color_converter.convert_from_tensor(audio=audio, src_se=en_source_default_se, tgt_se=target_se)
            audio_list.append(audio)
        data = tts_model.audio_numpy_concat(audio_list, sr=sampling_rate).tobytes()
        stream.write(data)
    stream.stop_stream()
    stream.close()
    p.terminate()
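# Hypothetical quick check: with the checkpoints in place, a one-off call like
#   play_audio("Hello from OpenVoice.")
# should synthesize and play one short sentence through the default output device.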
def record_and_transcribe_audio():
    """Push-to-talk: hold CTRL to record from the microphone, then transcribe
    the captured audio with Whisper and return the text."""
    recording = False
    def on_press(key):
        nonlocal recording
        if key == keyboard.Key.ctrl:
            recording = True
    def on_release(key):
        nonlocal recording
        if key == keyboard.Key.ctrl:
            recording = False
            return False  # stop the listener once CTRL is released
    with st.spinner("Recording..."):
        listener = keyboard.Listener(on_press=on_press, on_release=on_release)
        listener.start()
        print('Press CTRL to record...')
        while not recording:
            time.sleep(0.1)
        print('Start recording...')
        # show a warning banner while recording
        placeholder = st.empty()
        placeholder.warning("Recording...")
        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, frames_per_buffer=1024, input=True)
        frames = []
        while recording:
            data = stream.read(1024, exception_on_overflow=False)
            frames.append(np.frombuffer(data, dtype=np.int16))
        print('Finished recording')
        data = np.hstack(frames).astype(np.float32) / 32768.0  # int16 PCM -> float32 in [-1, 1]
        result = asr_model.transcribe(data)['text']
        stream.stop_stream()
        stream.close()
        p.terminate()
        placeholder.empty()
    return result
def conversation():
    """Main loop: record speech, transcribe, query the LLM, then display and speak the reply."""
    conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE}]
    while True:
        user_input = record_and_transcribe_audio()
        if user_input:
            conversation_history.append({'role': 'user', 'content': user_input})
            with st.chat_message(name="user"):
                st.write(user_input)
            response = llm_client.chat(model=OLLAMA_MODEL, messages=conversation_history)
            chatbot_response = response['message']['content']
            # display the chatbot reply
            with st.chat_message(name="assistant"):
                st.write(chatbot_response)
            conversation_history.append({'role': 'assistant', 'content': chatbot_response})
            print(f"Model: {OLLAMA_MODEL}")
            print(conversation_history)
            play_audio(chatbot_response)
        # trim history but keep the system message at index 0
        if len(conversation_history) > 20:
            conversation_history = [conversation_history[0]] + conversation_history[-19:]
def app():
    global OLLAMA_MODEL
    # header
    st.set_page_config(page_title="Speech to Text to Speech Chatbot", layout="wide")
    st.title("Speech to Text to Speech Chatbot")
    # choose model
    models = ollama.list()['models']
    models = [""] + [model['name'] for model in models]
    OLLAMA_MODEL = st.selectbox("Choose a model", models)
    # start conversation
    if OLLAMA_MODEL:
        st.info("Press CTRL and speak to talk to the chatbot...")
        with st.container(height=700, border=True):
            conversation()

if __name__ == "__main__":
    app()
Example (voice on)
Screen.Recording.2024-02-16.at.12.07.37.mov