-
-
Save thomwolf/e9c3f978d0f82600a7c24cb0bf80d606 to your computer and use it in GitHub Desktop.
""" To use: install LLM studio (or Ollama), clone OpenVoice, run this script in the OpenVoice directory | |
git clone https://github.com/myshell-ai/OpenVoice | |
cd OpenVoice | |
git clone https://huggingface.co/myshell-ai/OpenVoice | |
cp -r OpenVoice/* . | |
pip install openai-whisper pynput pyaudio
""" | |
from openai import OpenAI | |
import time | |
import pyaudio | |
import numpy as np | |
import torch | |
import os | |
import re | |
import se_extractor | |
import whisper | |
from pynput import keyboard | |
from api import BaseSpeakerTTS, ToneColorConverter | |
from utils import split_sentences_latin | |
# --- Configuration ---------------------------------------------------------
# Persona prompt; it is sent as the system message of every conversation.
SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
# Optional path to a reference recording. While it is None no target speaker
# embedding is extracted and play_audio skips the tone colour conversion.
SPEAKER_WAV = None

# OpenAI-compatible client pointed at a server listening on localhost:1234.
llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

# --- Checkpoint locations (resolved relative to this script) ---------------
_script_dir = os.path.dirname(__file__)
tts_en_ckpt_base = os.path.join(_script_dir, "checkpoints/base_speakers/EN")
tts_ckpt_converter = os.path.join(_script_dir, "checkpoints/converter")

# --- Device selection: CUDA first, then Apple MPS, otherwise CPU -----------
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# --- Text-to-speech model and tone colour converter ------------------------
tts_model = BaseSpeakerTTS(f'{tts_en_ckpt_base}/config.json', device=device)
tts_model.load_ckpt(f'{tts_en_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')

# Speaker embedding of the default English voice, used as the conversion source.
en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)

# Target speaker embedding; only extracted when a reference recording is set.
if SPEAKER_WAV:
    target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True)
else:
    target_se, _ = None, None

# Values read by play_audio on every call.
sampling_rate = tts_model.hps.data.sampling_rate
mark = tts_model.language_marks.get("english", None)

# --- Speech recognition ----------------------------------------------------
asr_model = whisper.load_model("base.en")
def play_audio(text):
    """Synthesize ``text`` sentence by sentence and play it through PyAudio.

    The text is split with ``split_sentences_latin``. Each sentence is
    synthesized with the base-speaker TTS model, converted with the tone
    colour converter when a target speaker embedding (``target_se``) is
    available, and written to the output stream straight away, so playback
    of the first sentence does not wait for the rest of the reply.

    Args:
        text: The text to speak.

    The output stream and the PyAudio instance are released even when
    synthesis or playback raises.
    """
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paFloat32, channels=1, rate=sampling_rate, output=True)
        try:
            for sentence in split_sentences_latin(text):
                # Insert a space at lower->upper case boundaries ("fooBar" -> "foo Bar").
                sentence = re.sub(r'([a-z])([A-Z])', r'\1 \2', sentence)
                # Wrap the sentence in the language mark of the TTS model.
                sentence = f'[{mark}]{sentence}[{mark}]'
                stn_tst = tts_model.get_text(sentence, tts_model.hps, False)
                with torch.no_grad():
                    x_tst = stn_tst.unsqueeze(0).to(tts_model.device)
                    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(tts_model.device)
                    sid = torch.LongTensor([tts_model.hps.speakers["default"]]).to(tts_model.device)
                    audio = tts_model.model.infer(
                        x_tst,
                        x_tst_lengths,
                        sid=sid,
                        noise_scale=0.667,
                        noise_scale_w=0.6,
                    )[0][0, 0].data.cpu().float().numpy()
                    if target_se is not None:
                        audio = tone_color_converter.convert_from_tensor(
                            audio=audio, src_se=en_source_default_se, tgt_se=target_se
                        )
                # audio_numpy_concat takes a list of segments; one sentence is
                # passed per call so each sentence is played as soon as it is ready.
                # NOTE(review): assumes the result is float32 to match paFloat32 - confirm.
                data = tts_model.audio_numpy_concat([audio], sr=sampling_rate).tobytes()
                stream.write(data)
            stream.stop_stream()
        finally:
            stream.close()
    finally:
        p.terminate()
def record_and_transcribe_audio():
    """Record from the microphone while Shift is held and return the transcript.

    Blocks until Shift is pressed, then captures 16 kHz mono 16-bit audio
    until Shift is released and passes the samples to ``asr_model``.

    Returns:
        The ``'text'`` entry of ``asr_model.transcribe``'s result, or an empty
        string when no audio was captured (Shift released before the first
        buffer was read).
    """
    started = False    # set once Shift has been pressed; never reset
    recording = False  # True only while Shift is held down

    def on_press(key):
        nonlocal started, recording
        if key == keyboard.Key.shift:
            started = True
            recording = True

    def on_release(key):
        nonlocal recording
        if key == keyboard.Key.shift:
            recording = False
            # Returning False from a pynput callback stops the listener.
            return False

    listener = keyboard.Listener(on_press=on_press, on_release=on_release)
    listener.start()
    frames = []
    try:
        print('Press shift to record...')
        # Wait on `started`, not `recording`: a tap shorter than the polling
        # interval flips `recording` back to False before it is observed, which
        # would otherwise leave this loop waiting forever on a stopped listener.
        while not started:
            time.sleep(0.1)
        print('Start recording...')
        p = pyaudio.PyAudio()
        try:
            stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, frames_per_buffer=1024, input=True)
            try:
                while recording:
                    data = stream.read(1024, exception_on_overflow=False)
                    frames.append(np.frombuffer(data, dtype=np.int16))
                stream.stop_stream()
            finally:
                stream.close()
        finally:
            p.terminate()
    finally:
        # Normally already stopped by on_release; this covers error paths.
        listener.stop()
    print('Finished recording')

    if not frames:
        # np.hstack raises on an empty list; nothing was recorded.
        return ''
    # Convert after stacking instead of using hstack's `dtype` argument, which
    # older NumPy releases do not accept. Dividing by 32768 scales the int16
    # samples to floats in [-1, 1).
    samples = np.hstack(frames).astype(np.float32) / 32768.0
    return asr_model.transcribe(samples)['text']
def conversation():
    """Run the voice-chat loop forever: listen, query the LLM, speak the reply.

    Each iteration records and transcribes one utterance, appends it to the
    chat history, requests a completion from ``llm_client``, prints the
    history and plays the reply with ``play_audio``.

    The history is capped at the system message plus the 20 most recent
    user/assistant messages. The system message is always kept so the
    persona in ``SYSTEM_MESSAGE`` is not lost in long conversations.
    Iterations whose transcript is empty are skipped without calling the LLM.
    """
    max_turn_messages = 20
    system_entry = {'role': 'system', 'content': SYSTEM_MESSAGE}
    conversation_history = [system_entry]
    while True:
        user_input = record_and_transcribe_audio()
        if not user_input or not user_input.strip():
            # Nothing was understood; do not send an empty message to the model.
            continue
        conversation_history.append({'role': 'user', 'content': user_input})
        response = llm_client.chat.completions.create(model="local-model", messages=conversation_history)
        chatbot_response = response.choices[0].message.content
        conversation_history.append({'role': 'assistant', 'content': chatbot_response})
        print(conversation_history)
        play_audio(chatbot_response)
        # Index 0 is the system message; trim only the turns that follow it.
        if len(conversation_history) - 1 > max_turn_messages:
            conversation_history = [system_entry] + conversation_history[-max_turn_messages:]


if __name__ == "__main__":
    conversation()
Beautiful.
Does anyone have it running on an M1 MacBook Pro? I was able to get over several hurdles, but I am stuck with a PyAudio error when it's trying to stream text to speech: ||PaMacCore (AUHAL)|| Error on line 2715: err=''what'', msg=Unspecified Audio Hardware Error. OSError: [Errno -9999] Unanticipated host error. Any help is greatly appreciated!
Meanwhile, here are some changes I had to make: 1. Change line 90 to `data = np.hstack(frames).astype(np.float32) / 32768.0`. 2. Add Terminal to Input Monitoring within Privacy & Security Settings.
@haseeb-heaven I solved this issue by forcing the device to the CPU:
tts_en_ckpt_base = os.path.join(os.path.dirname(__file__), "checkpoints/base_speakers/EN")
tts_ckpt_converter = os.path.join(os.path.dirname(__file__), "checkpoints/converter")
device = "cpu" # <<<
Remove the floating-point warning by disabling fp16
in the transcribe section:
result = asr_model.transcribe(data, fp16=False)['text']
stream.stop_stream()
stream.close()
p.terminate()
I got more errors:
python fast_speech_text.py Loaded checkpoint 'checkpoints/base_speakers/EN/checkpoint.pth' missing/unexpected keys: [] [] Loaded checkpoint 'checkpoints/converter/checkpoint.pth' missing/unexpected keys: [] [] Press shift to record... Start recording... Finished recording /opt/homebrew/Caskroom/miniforge/base/envs/hm_env/lib/python3.8/site-packages/whisper/transcribe.py:115: UserWarning: FP16 is not supported on CPU; using FP32 instead warnings.warn("FP16 is not supported on CPU; using FP32 instead") [{'role': 'system', 'content': 'You are Heaven an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL.'}, {'role': 'user', 'content': ' What are first 10 prime numbers?'}, {'role': 'assistant', 'content': ' The first ten prime numbers are 2, 3, 5, 7, 11, 13, 17, 19, 23, and 29.'}] ||PaMacCore (AUHAL)|| Error on line 2747: err=''what'', msg=Unspecified Audio Hardware Error Traceback (most recent call last): File "fast_speech_text.py", line 120, in <module> conversation() File "fast_speech_text.py", line 115, in conversation play_audio(chatbot_response) File "fast_speech_text.py", line 44, in play_audio stream = p.open(format=pyaudio.paFloat32, channels=1, rate=sampling_rate, output=True) File "/opt/homebrew/Caskroom/miniforge/base/envs/hm_env/lib/python3.8/site-packages/pyaudio/__init__.py", line 639, in open stream = PyAudio.Stream(self, *args, **kwargs) File "/opt/homebrew/Caskroom/miniforge/base/envs/hm_env/lib/python3.8/site-packages/pyaudio/__init__.py", line 447, in __init__ pa.start_stream(self._stream) OSError: [Errno -9986] Internal PortAudio error (hm_env) haseeb-mir@Haseebs-MacBook-Pro OpenVoice %
I wrote a port to Spanish using the facebook/tts_transformer-es-css10 model. You can see it here: https://gist.github.com/iddar/9a502842ea1867d9e849cd0c03384bf2
result = asr_model.transcribe(data, fp16=False)['text']
Thanks it worked like magic
Does anyone have it running on an M1 MacBook Pro? I was able to get over several hurdles, but I am stuck with a PyAudio error when it's trying to stream text to speech: ||PaMacCore (AUHAL)|| Error on line 2715: err=''what'', msg=Unspecified Audio Hardware Error. OSError: [Errno -9999] Unanticipated host error. Any help is greatly appreciated!
Meanwhile, here are some changes I had to make: 1. Change line 90 to `data = np.hstack(frames).astype(np.float32) / 32768.0`. 2. Add Terminal to Input Monitoring within Privacy & Security Settings.
Yes, it’s working on my device, and I have the same kind of machine (a MacBook M2 Pro). You need to make the changes described in the chat history above; I ran into the same issues and solved them, and now it’s working properly. You need to install Python 3.8 and then install all the dependencies, and it should run properly.
Has anyone tried to run this in Ubuntu on WSL and managed to get input from the keyboard?
Has anyone tried to run this in Ubuntu on WSL and managed to get input from the keyboard?
Try changing the logic to use some other form of human input, such as a newline on stdin.
For Mac users (M1/M2 Pro):
- Before installing pyaudio, make sure you run `brew install portaudio`.
- Use `np.hstack(frames).astype(np.float32)` - thanks to @ajram23 above.
- Make sure you have git-lfs installed; otherwise the checkpoints from Hugging Face won't be downloaded properly and you will end up getting a pkl error.
- Use requirements.txt from the OpenVoice GitHub folder to install all dependencies in one go, and then additionally run `pip install whisper pynput pyaudio`.
- I used LM Studio with Mistral as the backend; make sure to start the server there, under the local inference server section.
That's a lot of requirements; it seems like too complex a project.
But it is working on my MacBook M1 now, after so many complications.
Yes, it works on my system too. I will probably make changes to it and integrate SpeechBrain.
I got more errors: