Last active
August 17, 2023 16:23
-
-
Save kun432/13e4f63e4b7a9c3d4034f42c8a18374b to your computer and use it in GitHub Desktop.
clone_voice.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/kun432/13e4f63e4b7a9c3d4034f42c8a18374b/clone_voice.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# bark-with-voice-clone sample\n", | |
| "\n", | |
| "refs: https://github.com/serp-ai/bark-with-voice-clone\n", | |
| "\n", | |
| "## NOTES:\n", | |
| "\n", | |
| "- need to enable GPU in notebook settings. (It seems you can also choose CPU, but I have not tried it.)" | |
| ], | |
| "metadata": { | |
| "id": "oECN8S6vLwS6" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# 1. prepare" | |
| ], | |
| "metadata": { | |
| "id": "LA8zqBCSM1yt" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "#!git clone https://github.com/serp-ai/bark-with-voice-clone\n", | |
| "#%cd bark-with-voice-clone\n", | |
| "\n", | |
| "!pip install git+https://github.com/suno-ai/bark.git && \\\n", | |
| " pip uninstall -y torch torchvision torchaudio && \\\n", | |
| " pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118" | |
| ], | |
| "metadata": { | |
| "id": "cinJm24JPm7w" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# 2. train" | |
| ], | |
| "metadata": { | |
| "id": "ub94XhWGNttn" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "lAbemdNKKsc4", | |
| "cellView": "form" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "#@title set text to record\n", | |
| "text = \"\\u3053\\u3093\\u306B\\u3061\\u306F\\u3002\\u4ECA\\u65E5\\u306F\\u3044\\u3044\\u304A\\u5929\\u6C17\\u3067\\u3059\\u306D\\u3002\\u3053\\u3093\\u306A\\u65E5\\u306F\\u5916\\u306B\\u51FA\\u304B\\u3051\\u305F\\u304F\\u306A\\u308A\\u307E\\u3059\\u304C\\u3001\\u4E88\\u5B9A\\u306F\\u3042\\u308A\\u307E\\u3059\\u304B\\uFF1F\" #@param {type:\"string\"}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "#@title record your voice (exec this and you will see record button to record your voice)\n", | |
| "# refs: https://zenn.dev/kurehajime/scraps/c3b3f0cdbeff0a\n", | |
| "\n", | |
| "from IPython.display import display, Javascript\n", | |
| "from google.colab.output import eval_js\n", | |
| "from base64 import b64decode\n", | |
| "\n", | |
| "audio_filename = \"audio\"\n", | |
| "\n", | |
| "def record_js(filename=f\"{audio_filename}.mp3\"):\n", | |
| " js = Javascript('''\n", | |
| " async function record() {\n", | |
| " let rec;\n", | |
| " let chanks;\n", | |
| "\n", | |
| " const div = document.createElement('div');\n", | |
| " const startRecord = document.createElement('button');\n", | |
| " startRecord.textContent = 'Rec';\n", | |
| " div.appendChild(startRecord);\n", | |
| "\n", | |
| " const stopRecord = document.createElement('button');\n", | |
| " stopRecord.textContent = 'Stop';\n", | |
| " stopRecord.style.display = 'none'\n", | |
| " div.appendChild(stopRecord);\n", | |
| "\n", | |
| " const audio = document.createElement('audio');\n", | |
| " div.appendChild(audio);\n", | |
| "\n", | |
| " document.body.appendChild(div);\n", | |
| "\n", | |
| " function handlerFunction(stream,resolve) {\n", | |
| " rec = new MediaRecorder(stream);\n", | |
| " rec.ondataavailable = e => {\n", | |
| " chanks.push(e.data);\n", | |
| " if (rec.state == \"inactive\") {\n", | |
| " let blob = new Blob(chanks, { type: 'audio/mpeg-3' });\n", | |
| " audio.src = URL.createObjectURL(blob);\n", | |
| " audio.controls = true;\n", | |
| " audio.autoplay = true;\n", | |
| " resolve();\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| "\n", | |
| " startRecord.onclick = e => {\n", | |
| " startRecord.style.display = 'none'\n", | |
| " stopRecord.style.display = 'block'\n", | |
| " chanks = [];\n", | |
| " rec.start();\n", | |
| " }\n", | |
| "\n", | |
| " stopRecord.onclick = e => {\n", | |
| " startRecord.style.display = 'block'\n", | |
| " stopRecord.style.display = 'none'\n", | |
| " rec.stop();\n", | |
| " }\n", | |
| "\n", | |
| " function blobToBase64(blob) {\n", | |
| " return new Promise((resolve, _) => {\n", | |
| " const reader = new FileReader();\n", | |
| " reader.onloadend = () => resolve(reader.result);\n", | |
| " reader.readAsDataURL(blob);\n", | |
| " });\n", | |
| " }\n", | |
| "\n", | |
| " await new Promise((resolve) => {\n", | |
| " navigator.mediaDevices.getUserMedia({ audio: true })\n", | |
| " .then(stream => { handlerFunction(stream,resolve) })\n", | |
| " });\n", | |
| " let blob = new Blob(chanks, { type: 'audio/mpeg-3' });\n", | |
| " return await blobToBase64(blob);\n", | |
| " }\n", | |
| " ''')\n", | |
| " display(js)\n", | |
| " data = eval_js('record()')\n", | |
| " binary = b64decode(data.split(',')[1])\n", | |
| " with open(filename, 'wb') as f:\n", | |
| " f.write(binary)\n", | |
| " return filename\n", | |
| "\n", | |
| "filename = record_js()" | |
| ], | |
| "metadata": { | |
| "id": "medDaIz0nqPW", | |
| "cellView": "form" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "train from your voice" | |
| ], | |
| "metadata": { | |
| "id": "uyJkfwAWv5So" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "a4ogAsDHKscz" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from bark.generation import load_codec_model, generate_text_semantic\n", | |
| "from encodec.utils import convert_audio\n", | |
| "\n", | |
| "import torchaudio\n", | |
| "import torch\n", | |
| "\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "\n", | |
| "model = load_codec_model(use_gpu=True)\n", | |
| "\n", | |
| "# Load and pre-process the audio waveform\n", | |
| "#audio_filepath = 'audio.mp3' # the audio you want to clone (will get truncated so 5-10 seconds is probably fine, existing samples that I checked are around 7 seconds)\n", | |
| "device = 'cuda' # or 'cpu'\n", | |
| "wav, sr = torchaudio.load(filename)\n", | |
| "\n", | |
| "print(wav.shape)\n", | |
| "print(sr)\n", | |
| "plt.plot(wav.t().numpy());\n", | |
| "\n", | |
| "wav = convert_audio(wav, sr, model.sample_rate, model.channels)\n", | |
| "wav = wav.unsqueeze(0).to(device)\n", | |
| "\n", | |
| "# Extract discrete codes from EnCodec\n", | |
| "with torch.no_grad():\n", | |
| " encoded_frames = model.encode(wav)\n", | |
| "codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]\n", | |
| "\n", | |
| "# get seconds of audio\n", | |
| "seconds = wav.shape[-1] / model.sample_rate\n", | |
| "# generate semantic tokens\n", | |
| "semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds, top_k=50, top_p=.95, temp=0.7)\n", | |
| "\n", | |
| "# move codes to cpu\n", | |
| "codes = codes.cpu().numpy()\n", | |
| "\n", | |
| "import sys\n", | |
| "import platform\n", | |
| "python_version=f\"{sys.version_info.major}.{sys.version_info.minor}\"\n", | |
| "\n", | |
| "import numpy as np\n", | |
| "voice_name = 'ja_speaker_9' # whatever you want the name of the voice to be\n", | |
| "output_path = f'/usr/local/lib/python{python_version}/dist-packages/bark/assets/prompts/' + voice_name + '.npz'\n", | |
| "#output_path = '/usr/local/lib/python3.9/dist-packages/bark/assets/prompts/' + voice_name + '.npz'\n", | |
| "np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "If succeeded, your voice model will be shown below:" | |
| ], | |
| "metadata": { | |
| "id": "k0H7TPNx3jSs" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "lwu8kKjxKsc7" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "!ls -lt {output_path}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'" | |
| ], | |
| "metadata": { | |
| "id": "m-AnYhLTqxDA" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## 3. Generate" | |
| ], | |
| "metadata": { | |
| "id": "uDVovdinv_BW" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "TBQfThM9Ksc7" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Here's the generation stuff copy-pasted for convenience" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "X_Swx_LuKsc8" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from bark.api import generate_audio\n", | |
| "from transformers import BertTokenizer\n", | |
| "from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic\n", | |
| "\n", | |
| "# load the tokenizer\n", | |
| "tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")\n", | |
| "\n", | |
| "# download and load all models\n", | |
| "preload_models(\n", | |
| " text_use_gpu=True,\n", | |
| " text_use_small=False,\n", | |
| " coarse_use_gpu=True,\n", | |
| " coarse_use_small=False,\n", | |
| " fine_use_gpu=True,\n", | |
| " fine_use_small=False,\n", | |
| " codec_use_gpu=True,\n", | |
| " force_reload=False\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "set your prompt" | |
| ], | |
| "metadata": { | |
| "id": "EYDcOX9b2nEJ" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Enter your prompt and speaker here\n", | |
| "text_prompt = \"\\u306F\\u3058\\u3081\\u307E\\u3057\\u3066\\u3002\\u3048\\u30FC\\u3068 ... \\u3068\\u308A\\u3042\\u3048\\u305A\\u3001\\u3044\\u308D\\u3044\\u308D\\u558B\\u3089\\u305B\\u3066\\u307F\\u3066\\u304F\\u3060\\u3055\\u3044\\u306D\\u3002\\u3088\\u308D\\u3057\\u304F\\u304A\\u306D\\u304C\\u3044\\u3057\\u307E\\u3059\\u3002[laughs]\" #@param {type:\"string\"}" | |
| ], | |
| "metadata": { | |
| "cellView": "form", | |
| "id": "vms3w_EM2sKL" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "generate audio with simple parameters" | |
| ], | |
| "metadata": { | |
| "id": "weJoWy2P3Cz1" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "8exAm-BHKsc-" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from IPython.display import Audio\n", | |
| "\n", | |
| "# simple generation\n", | |
| "audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)\n", | |
| "\n", | |
| "# play audio\n", | |
| "Audio(audio_array, rate=SAMPLE_RATE)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "Also you can generate audio with more control (takes more time than above)" | |
| ], | |
| "metadata": { | |
| "id": "dq5NcRpE3HmR" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "CGNSH9ScKsc-" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from IPython.display import Audio\n", | |
| "\n", | |
| "# generation with more control\n", | |
| "x_semantic = generate_text_semantic(\n", | |
| " text_prompt,\n", | |
| " history_prompt=voice_name,\n", | |
| " temp=0.7,\n", | |
| " top_k=50,\n", | |
| " top_p=0.95,\n", | |
| ")\n", | |
| "\n", | |
| "x_coarse_gen = generate_coarse(\n", | |
| " x_semantic,\n", | |
| " history_prompt=voice_name,\n", | |
| " temp=0.7,\n", | |
| " top_k=50,\n", | |
| " top_p=0.95,\n", | |
| ")\n", | |
| "x_fine_gen = generate_fine(\n", | |
| " x_coarse_gen,\n", | |
| " history_prompt=voice_name,\n", | |
| " temp=0.5,\n", | |
| ")\n", | |
| "audio_array = codec_decode(x_fine_gen)\n", | |
| "\n", | |
| "# play audio\n", | |
| "Audio(audio_array, rate=SAMPLE_RATE)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## 4. Download Audio" | |
| ], | |
| "metadata": { | |
| "id": "AdG_Uj3ky3Fv" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "id": "gvsAHmloKsc_" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from scipy.io.wavfile import write as write_wav\n", | |
| "import os\n", | |
| "\n", | |
| "# save audio\n", | |
| "output_dir=\"/content/output\"\n", | |
| "os.makedirs(output_dir, exist_ok=True)\n", | |
| "filepath = f\"{output_dir}/audio.wav\" # change this to your desired output path\n", | |
| "write_wav(filepath, SAMPLE_RATE, audio_array)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.10.8" | |
| }, | |
| "orig_nbformat": 4, | |
| "colab": { | |
| "provenance": [], | |
| "include_colab_link": true | |
| }, | |
| "accelerator": "GPU", | |
| "gpuClass": "standard" | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Same error here
Author
colaboratory seems change python version from 3.9 to 3.10. fixed.
not working
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Was training my voice but: