Skip to content

Instantly share code, notes, and snippets.

@kun432
Last active August 17, 2023 16:23
Show Gist options
  • Select an option

  • Save kun432/13e4f63e4b7a9c3d4034f42c8a18374b to your computer and use it in GitHub Desktop.

Select an option

Save kun432/13e4f63e4b7a9c3d4034f42c8a18374b to your computer and use it in GitHub Desktop.
clone_voice.ipynb
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/kun432/13e4f63e4b7a9c3d4034f42c8a18374b/clone_voice.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"# bark-with-voice-clone sample\n",
"\n",
"refs: https://github.com/serp-ai/bark-with-voice-clone\n",
"\n",
"## NOTES:\n",
"\n",
"- You need to enable the GPU in the notebook settings. (It seems you can also choose CPU, but I have not tried it.)"
],
"metadata": {
"id": "oECN8S6vLwS6"
}
},
{
"cell_type": "markdown",
"source": [
"# 1. prepare"
],
"metadata": {
"id": "LA8zqBCSM1yt"
}
},
{
"cell_type": "code",
"source": [
"# Install dependencies. %pip (rather than !pip) installs into the kernel's\n",
"# own environment, and each command runs as its own magic line (shell `&&`\n",
"# chaining does not apply to %pip).\n",
"# NOTE(review): the torch nightly wheels below are unpinned, so this cell is\n",
"# not reproducible over time.\n",
"%pip install git+https://github.com/suno-ai/bark.git\n",
"%pip uninstall -y torch torchvision torchaudio\n",
"%pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118"
],
"metadata": {
"id": "cinJm24JPm7w"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# 2. train"
],
"metadata": {
"id": "ub94XhWGNttn"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lAbemdNKKsc4",
"cellView": "form"
},
"outputs": [],
"source": [
"#@title set text to record\n",
"text = \"\\u3053\\u3093\\u306B\\u3061\\u306F\\u3002\\u4ECA\\u65E5\\u306F\\u3044\\u3044\\u304A\\u5929\\u6C17\\u3067\\u3059\\u306D\\u3002\\u3053\\u3093\\u306A\\u65E5\\u306F\\u5916\\u306B\\u51FA\\u304B\\u3051\\u305F\\u304F\\u306A\\u308A\\u307E\\u3059\\u304C\\u3001\\u4E88\\u5B9A\\u306F\\u3042\\u308A\\u307E\\u3059\\u304B\\uFF1F\" #@param {type:\"string\"}"
]
},
{
"cell_type": "code",
"source": [
"#@title record your voice (exec this and you will see record button to record your voice)\n",
"# refs: https://zenn.dev/kurehajime/scraps/c3b3f0cdbeff0a\n",
"\n",
"from IPython.display import display, Javascript\n",
"from google.colab.output import eval_js\n",
"from base64 import b64decode\n",
"\n",
"audio_filename = \"audio\"\n",
"\n",
"def record_js(filename=f\"{audio_filename}.mp3\"):\n",
" js = Javascript('''\n",
" async function record() {\n",
" let rec;\n",
" let chanks;\n",
"\n",
" const div = document.createElement('div');\n",
" const startRecord = document.createElement('button');\n",
" startRecord.textContent = 'Rec';\n",
" div.appendChild(startRecord);\n",
"\n",
" const stopRecord = document.createElement('button');\n",
" stopRecord.textContent = 'Stop';\n",
" stopRecord.style.display = 'none'\n",
" div.appendChild(stopRecord);\n",
"\n",
" const audio = document.createElement('audio');\n",
" div.appendChild(audio);\n",
"\n",
" document.body.appendChild(div);\n",
"\n",
" function handlerFunction(stream,resolve) {\n",
" rec = new MediaRecorder(stream);\n",
" rec.ondataavailable = e => {\n",
" chanks.push(e.data);\n",
" if (rec.state == \"inactive\") {\n",
" let blob = new Blob(chanks, { type: 'audio/mpeg-3' });\n",
" audio.src = URL.createObjectURL(blob);\n",
" audio.controls = true;\n",
" audio.autoplay = true;\n",
" resolve();\n",
" }\n",
" }\n",
" }\n",
"\n",
" startRecord.onclick = e => {\n",
" startRecord.style.display = 'none'\n",
" stopRecord.style.display = 'block'\n",
" chanks = [];\n",
" rec.start();\n",
" }\n",
"\n",
" stopRecord.onclick = e => {\n",
" startRecord.style.display = 'block'\n",
" stopRecord.style.display = 'none'\n",
" rec.stop();\n",
" }\n",
"\n",
" function blobToBase64(blob) {\n",
" return new Promise((resolve, _) => {\n",
" const reader = new FileReader();\n",
" reader.onloadend = () => resolve(reader.result);\n",
" reader.readAsDataURL(blob);\n",
" });\n",
" }\n",
"\n",
" await new Promise((resolve) => {\n",
" navigator.mediaDevices.getUserMedia({ audio: true })\n",
" .then(stream => { handlerFunction(stream,resolve) })\n",
" });\n",
" let blob = new Blob(chanks, { type: 'audio/mpeg-3' });\n",
" return await blobToBase64(blob);\n",
" }\n",
" ''')\n",
" display(js)\n",
" data = eval_js('record()')\n",
" binary = b64decode(data.split(',')[1])\n",
" with open(filename, 'wb') as f:\n",
" f.write(binary)\n",
" return filename\n",
"\n",
"filename = record_js()"
],
"metadata": {
"id": "medDaIz0nqPW",
"cellView": "form"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"train from your voice"
],
"metadata": {
"id": "uyJkfwAWv5So"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "a4ogAsDHKscz"
},
"outputs": [],
"source": [
"# Encode the recorded audio with EnCodec and save a bark voice prompt (.npz).\n",
"# Depends on `filename` (recorded audio) and `text` (its transcript) from the\n",
"# cells above.\n",
"import sys\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import torch\n",
"import torchaudio\n",
"\n",
"from bark.generation import load_codec_model, generate_text_semantic\n",
"from encodec.utils import convert_audio\n",
"\n",
"model = load_codec_model(use_gpu=True)\n",
"\n",
"# Load and pre-process the audio waveform (it gets truncated, so 5-10 seconds\n",
"# of audio is enough).\n",
"device = 'cuda' # or 'cpu'\n",
"wav, sr = torchaudio.load(filename)\n",
"\n",
"print(wav.shape)\n",
"print(sr)\n",
"plt.plot(wav.t().numpy());\n",
"\n",
"wav = convert_audio(wav, sr, model.sample_rate, model.channels)\n",
"wav = wav.unsqueeze(0).to(device)\n",
"\n",
"# Extract discrete codes from EnCodec\n",
"with torch.no_grad():\n",
"    encoded_frames = model.encode(wav)\n",
"codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze() # [n_q, T]\n",
"\n",
"# get seconds of audio\n",
"seconds = wav.shape[-1] / model.sample_rate\n",
"# generate semantic tokens from the transcript, capped at the audio duration\n",
"semantic_tokens = generate_text_semantic(text, max_gen_duration_s=seconds, top_k=50, top_p=.95, temp=0.7)\n",
"\n",
"# move codes to cpu\n",
"codes = codes.cpu().numpy()\n",
"\n",
"# Save the prompt into bark's installed assets dir. The path is built from the\n",
"# running interpreter version so it survives Colab Python upgrades\n",
"# (e.g. 3.9 -> 3.10).\n",
"python_version = f\"{sys.version_info.major}.{sys.version_info.minor}\"\n",
"voice_name = 'ja_speaker_9' # whatever you want the name of the voice to be\n",
"output_path = f'/usr/local/lib/python{python_version}/dist-packages/bark/assets/prompts/' + voice_name + '.npz'\n",
"np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)\n"
]
},
{
"cell_type": "markdown",
"source": [
"If it succeeded, your voice model file will be listed below:"
],
"metadata": {
"id": "k0H7TPNx3jSs"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "lwu8kKjxKsc7"
},
"outputs": [],
"source": [
"!ls -lt {output_path}"
]
},
{
"cell_type": "code",
"source": [
"# That's it! Now you can head over to the generate.ipynb and use your voice_name for the 'history_prompt'"
],
"metadata": {
"id": "m-AnYhLTqxDA"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"## 3. Generate"
],
"metadata": {
"id": "uDVovdinv_BW"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "TBQfThM9Ksc7"
},
"outputs": [],
"source": [
"# Heres the generation stuff copy-pasted for convenience"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "X_Swx_LuKsc8"
},
"outputs": [],
"source": [
"from bark.api import generate_audio\n",
"from transformers import BertTokenizer\n",
"from bark.generation import SAMPLE_RATE, preload_models, codec_decode, generate_coarse, generate_fine, generate_text_semantic\n",
"\n",
"# load the tokenizer\n",
"tokenizer = BertTokenizer.from_pretrained(\"bert-base-multilingual-cased\")\n",
"\n",
"# download and load all models\n",
"preload_models(\n",
" text_use_gpu=True,\n",
" text_use_small=False,\n",
" coarse_use_gpu=True,\n",
" coarse_use_small=False,\n",
" fine_use_gpu=True,\n",
" fine_use_small=False,\n",
" codec_use_gpu=True,\n",
" force_reload=False\n",
")"
]
},
{
"cell_type": "markdown",
"source": [
"set your prompt"
],
"metadata": {
"id": "EYDcOX9b2nEJ"
}
},
{
"cell_type": "code",
"source": [
"# Enter your prompt and speaker here\n",
"text_prompt = \"\\u306F\\u3058\\u3081\\u307E\\u3057\\u3066\\u3002\\u3048\\u30FC\\u3068 ... \\u3068\\u308A\\u3042\\u3048\\u305A\\u3001\\u3044\\u308D\\u3044\\u308D\\u558B\\u3089\\u305B\\u3066\\u307F\\u3066\\u304F\\u3060\\u3055\\u3044\\u306D\\u3002\\u3088\\u308D\\u3057\\u304F\\u304A\\u306D\\u304C\\u3044\\u3057\\u307E\\u3059\\u3002[laughs]\" #@param {type:\"string\"}"
],
"metadata": {
"cellView": "form",
"id": "vms3w_EM2sKL"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"generate audio with simple parameters"
],
"metadata": {
"id": "weJoWy2P3Cz1"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "8exAm-BHKsc-"
},
"outputs": [],
"source": [
"from IPython.display import Audio\n",
"\n",
"# simple generation\n",
"audio_array = generate_audio(text_prompt, history_prompt=voice_name, text_temp=0.7, waveform_temp=0.7)\n",
"\n",
"# play audio\n",
"Audio(audio_array, rate=SAMPLE_RATE)"
]
},
{
"cell_type": "markdown",
"source": [
"Also you can generate audio with more control (takes more time than above)"
],
"metadata": {
"id": "dq5NcRpE3HmR"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CGNSH9ScKsc-"
},
"outputs": [],
"source": [
"from IPython.display import Audio\n",
"\n",
"# generation with more control\n",
"x_semantic = generate_text_semantic(\n",
" text_prompt,\n",
" history_prompt=voice_name,\n",
" temp=0.7,\n",
" top_k=50,\n",
" top_p=0.95,\n",
")\n",
"\n",
"x_coarse_gen = generate_coarse(\n",
" x_semantic,\n",
" history_prompt=voice_name,\n",
" temp=0.7,\n",
" top_k=50,\n",
" top_p=0.95,\n",
")\n",
"x_fine_gen = generate_fine(\n",
" x_coarse_gen,\n",
" history_prompt=voice_name,\n",
" temp=0.5,\n",
")\n",
"audio_array = codec_decode(x_fine_gen)\n",
"\n",
"# play audio\n",
"Audio(audio_array, rate=SAMPLE_RATE)"
]
},
{
"cell_type": "markdown",
"source": [
"## 4. Download Audio"
],
"metadata": {
"id": "AdG_Uj3ky3Fv"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "gvsAHmloKsc_"
},
"outputs": [],
"source": [
"from scipy.io.wavfile import write as write_wav\n",
"import os\n",
"\n",
"# save audio\n",
"output_dir=\"/content/output\"\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"filepath = f\"{output_dir}/audio.wav\" # change this to your desired output path\n",
"write_wav(filepath, SAMPLE_RATE, audio_array)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
},
"orig_nbformat": 4,
"colab": {
"provenance": [],
"include_colab_link": true
},
"accelerator": "GPU",
"gpuClass": "standard"
},
"nbformat": 4,
"nbformat_minor": 0
}
@edgenull
Copy link
Copy Markdown

edgenull commented May 1, 2023

Was training my voice but:

---------------------------------------------------------------------------

FileNotFoundError                         Traceback (most recent call last)

[<ipython-input-7-a50bd504d688>](https://localhost:8080/#) in <cell line: 39>()
     37 voice_name = 'ja_speaker_9' # whatever you want the name of the voice to be
     38 output_path = '/usr/local/lib/python3.9/dist-packages/bark/assets/prompts/' + voice_name + '.npz'
---> 39 np.savez(output_path, fine_prompt=codes, coarse_prompt=codes[:2, :], semantic_prompt=semantic_tokens)

4 frames

/usr/local/lib/python3.10/dist-packages/numpy/core/overrides.py in savez(*args, **kwargs)

[/usr/local/lib/python3.10/dist-packages/numpy/lib/npyio.py](https://localhost:8080/#) in savez(file, *args, **kwds)
    610 
    611     """
--> 612     _savez(file, args, kwds, False)
    613 
    614 

[/usr/local/lib/python3.10/dist-packages/numpy/lib/npyio.py](https://localhost:8080/#) in _savez(file, args, kwds, compress, allow_pickle, pickle_kwargs)
    707         compression = zipfile.ZIP_STORED
    708 
--> 709     zipf = zipfile_factory(file, mode="w", compression=compression)
    710 
    711     for key, val in namedict.items():

[/usr/local/lib/python3.10/dist-packages/numpy/lib/npyio.py](https://localhost:8080/#) in zipfile_factory(file, *args, **kwargs)
     99     import zipfile
    100     kwargs['allowZip64'] = True
--> 101     return zipfile.ZipFile(file, *args, **kwargs)
    102 
    103 

[/usr/lib/python3.10/zipfile.py](https://localhost:8080/#) in __init__(self, file, mode, compression, allowZip64, compresslevel, strict_timestamps)
   1249             while True:
   1250                 try:
-> 1251                     self.fp = io.open(file, filemode)
   1252                 except OSError:
   1253                     if filemode in modeDict:

FileNotFoundError: [Errno 2] No such file or directory: '/usr/local/lib/python3.9/dist-packages/bark/assets/prompts/ja_speaker_9.npz'

@inki53
Copy link
Copy Markdown

inki53 commented May 4, 2023

Same error here

@kun432
Copy link
Copy Markdown
Author

kun432 commented May 7, 2023

Colaboratory seems to have changed the Python version from 3.9 to 3.10. Fixed.

@revolutionarybukhari
Copy link
Copy Markdown

not working

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment