Speech to text to speech, Spanish version
""" Para usar: instale LLM studio (o Ollama), clone OpenVoice, ejecute este script en el directorio OpenVoice | |
git clone https://github.com/myshell-ai/OpenVoice | |
cd OpenVoice | |
git clone https://huggingface.co/myshell-ai/OpenVoice | |
cp -r OpenVoice/* . | |
pip install whisper pynput pyaudio | |
este proyecto se base en el script compartido por thomwolf en https://gist.github.com/thomwolf/e9c3f978d0f82600a7c24cb0bf80d606 | |
""" | |
from openai import OpenAI
import time
import pyaudio
import numpy as np
import torch
import os
import se_extractor
import whisper
from pynput import keyboard
from api import BaseSpeakerTTS, ToneColorConverter
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
from playsound import playsound
from scipy.io.wavfile import write as write_wav
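# Download and load the pre-trained Spanish transformer TTS model (trained on
# CSS10) together with a HiFi-GAN vocoder from the Hugging Face Hub.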
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    "facebook/tts_transformer-es-css10",
    arg_overrides={"vocoder": "hifigan", "fp16": False}
)
model = models[0]
TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
generator = task.build_generator([model], cfg)
# SYSTEM_MESSAGE = "You are Bob an AI assistant. KEEP YOUR RESPONSES VERY SHORT AND CONVERSATIONAL."
# Spanish equivalent of the English system message above; kept in Spanish so the assistant replies in Spanish.
SYSTEM_MESSAGE_ES = "Eres Bob, un asistente de IA. MANTÉN TUS RESPUESTAS MUY CORTAS Y CONVERSACIONALES."
SPEAKER_WAV = None  # set to the path of a reference .wav to enable OpenVoice voice cloning
# LM Studio serves an OpenAI-compatible API on localhost:1234 by default; no real API key is needed.
llm_client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")
# OpenVoice checkpoints (copied into ./checkpoints by the setup steps above).
tts_en_ckpt_base = os.path.join(os.path.dirname(__file__), "checkpoints/base_speakers/EN")
tts_ckpt_converter = os.path.join(os.path.dirname(__file__), "checkpoints/converter")
device = "cpu"
tts_model = BaseSpeakerTTS(f'{tts_en_ckpt_base}/config.json', device=device)
tts_model.load_ckpt(f'{tts_en_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{tts_ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{tts_ckpt_converter}/checkpoint.pth')
en_source_default_se = torch.load(f"{tts_en_ckpt_base}/en_default_se.pth").to(device)
# Extract the target speaker embedding only if a reference wav was provided.
target_se, _ = se_extractor.get_se(SPEAKER_WAV, tone_color_converter, target_dir='processed', vad=True) if SPEAKER_WAV else (None, None)
sampling_rate = tts_model.hps.data.sampling_rate
mark = tts_model.language_marks.get("english", None)
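# Note: the OpenVoice models above are loaded but never used below; this
# Spanish version plays replies through the fairseq model instead. Below is a
# minimal sketch of how they could be wired in for voice cloning, assuming the
# BaseSpeakerTTS.tts(text, output_path, speaker, language, speed) and
# ToneColorConverter.convert(audio_src_path, src_se, tgt_se, output_path)
# signatures from the OpenVoice repo. The helper is hypothetical and is not
# called anywhere in this script.
def play_audio_cloned(text, output_path="cloned.wav"):
    base_path = "tmp_base.wav"
    # Synthesize with the English base speaker checkpoint...
    tts_model.tts(text, base_path, speaker="default", language="English")
    if target_se is not None:
        # ...then re-color the timbre toward the reference speaker embedding.
        tone_color_converter.convert(audio_src_path=base_path,
                                     src_se=en_source_default_se,
                                     tgt_se=target_se,
                                     output_path=output_path)
        playsound(output_path)
    else:
        playsound(base_path)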
# Whisper "small" is multilingual, so it transcribes Spanish speech directly.
asr_model = whisper.load_model("small")
def sound(wav, fs=8000):
    """Play a raw int16 numpy waveform through PyAudio (alternative playback helper, unused by default).

    Pass the waveform's actual sample rate as `fs`; the 8000 Hz default only suits 8 kHz audio.
    """
    p = pyaudio.PyAudio()
    channels = 1 if wav.ndim == 1 else wav.shape[1]
    stream = p.open(format=pyaudio.paInt16, channels=channels, rate=fs, output=True)
    stream.write(wav.tobytes())
    stream.stop_stream()
    stream.close()
    p.terminate()
def play_audio(text):
    """Synthesize `text` with the fairseq Spanish TTS model and play it back."""
    sample = TTSHubInterface.get_model_input(task, text)
    wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)
    wav_cpu = wav.to('cpu')
    write_wav("audio1.wav", rate, wav_cpu.numpy())
    playsound("audio1.wav")
def record_and_transcribe_audio():
    recording = False

    def on_press(key):
        nonlocal recording
        if key == keyboard.Key.shift:
            recording = True

    def on_release(key):
        nonlocal recording
        if key == keyboard.Key.shift:
            recording = False
            return False  # stop the listener once Shift is released

    listener = keyboard.Listener(on_press=on_press, on_release=on_release)
    listener.start()
    print('Press shift to record...')
    while not recording:
        time.sleep(0.1)
    print('Start recording...')
    p = pyaudio.PyAudio()
    stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, frames_per_buffer=1024, input=True)
    frames = []
    while recording:
        data = stream.read(1024, exception_on_overflow=False)
        frames.append(np.frombuffer(data, dtype=np.int16))
    print('Finished recording')
    stream.stop_stream()
    stream.close()
    p.terminate()
    # Whisper wants float32 samples in [-1, 1]; passing language='es' to
    # transcribe() could pin the decoder to Spanish, but autodetect also works.
    audio = np.hstack(frames).astype(np.float32) / 32768.0
    result = asr_model.transcribe(audio, fp16=False)['text']
    return result
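# Conversation loop: record a turn, transcribe it, send the running history to
# the local LLM, speak the reply, and keep the history bounded.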
def conversation():
    conversation_history = [{'role': 'system', 'content': SYSTEM_MESSAGE_ES}]
    while True:
        user_input = record_and_transcribe_audio()
        conversation_history.append({'role': 'user', 'content': user_input})
        # With LM Studio's local server the model name is typically ignored;
        # whichever model is loaded in the UI answers the request.
        response = llm_client.chat.completions.create(model="mistral", messages=conversation_history)
        chatbot_response = response.choices[0].message.content
        conversation_history.append({'role': 'assistant', 'content': chatbot_response})
        print(conversation_history)
        play_audio(chatbot_response)
        # Trim the history but always keep the system message in slot 0
        # (the original slice dropped it once the history grew past 20 entries).
        if len(conversation_history) > 20:
            conversation_history = [conversation_history[0]] + conversation_history[-19:]

conversation()
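# Usage: hold Shift to talk and release it to send a turn; stop with Ctrl+C.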