xeron56 · October 8, 2019 11:22
diff --git a/tts_example.ipynb b/tts_example.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "TTS_example.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/erogol/97516ad65b44dbddb8cd694953187c5b/tts_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "cjD0xW0cEMVT",
        "colab_type": "text"
      },
      "source": [
        "## Hands-on example for TTS  [https://github.com/mozilla/TTS](https://github.com/mozilla/TTS)\n",
        "\n",
        "This notebook trains a Tacotron model on the LJSpeech dataset."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab_type": "code",
        "id": "XGiNTMShZYvj",
        "colab": {}
      },
      "source": [
        "# download LJSpeech dataset\n",
        "!wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2\n",
        "# decompress\n",
        "!tar -xvjf LJSpeech-1.1.tar.bz2"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "__k0BrbfLQ-F",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# create train-val splits\n",
        "!shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv\n",
        "!head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv\n",
        "!tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "pyJwcU9pDUE-",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# get TTS to your local\n",
        "!git clone https://github.com/mozilla/TTS"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zV-vHTWyirQv",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# install the espeak backend, used when training on phonemes instead of raw characters\n",
        "# -y pre-answers apt-get's confirmation prompt so the cell can finish without user input\n",
        "!sudo apt-get install -y espeak\n",
        "!pip install soundfile"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xwvg3-nVDL5t",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# enter the cloned repository; written as an explicit %cd magic so it does not rely on automagic\n",
        "%cd TTS"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "G1OnsNyJJtem",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# install TTS requirements\n",
        "!pip install -r requirements.txt"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "y7_Xao7uNOvX",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "%%writefile config.json\n",
        "// config.json for LJSpeech (the %%writefile magic has to be the first line of the cell, so this note lives inside the file)\n",
        "{\n",
        "        \"run_name\": \"mozilla-tacotron-tagent-bn\",\n",
        "        \"run_description\": \"compare the attention with gst model which does not align with the same config\",\n",
        "    \n",
        "        \"audio\":{\n",
        "            // Audio processing parameters\n",
        "            \"num_mels\": 80,         // size of the mel spec frame. \n",
        "            \"num_freq\": 1025,       // number of stft frequency levels. Size of the linear spectrogram frame.\n",
        "            \"sample_rate\": 22050,   // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.\n",
        "            \"frame_length_ms\": 50,  // stft window length in ms.\n",
        "            \"frame_shift_ms\": 12.5, // stft window hop-length in ms.\n",
        "            \"preemphasis\": 0.98,    // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.\n",
        "            \"min_level_db\": -100,   // normalization range\n",
        "            \"ref_level_db\": 20,     // reference level db, theoretically 20db is the sound of air.\n",
        "            \"power\": 1.5,           // value to sharpen wav signals after GL algorithm.\n",
        "            \"griffin_lim_iters\": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.\n",
        "            // Normalization parameters\n",
        "            \"signal_norm\": true,    // normalize the spec values in range [0, 1]\n",
        "            \"symmetric_norm\": false, // move normalization to range [-1, 1]\n",
        "            \"max_norm\": 1,          // scale normalization to range [-max_norm, max_norm] or [0, max_norm]\n",
        "            \"clip_norm\": true,      // clip normalized values into the range.\n",
        "            \"mel_fmin\": 0.0,         // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!\n",
        "            \"mel_fmax\": 8000.0,        // maximum freq level for mel-spec. Tune for dataset!!\n",
        "            \"do_trim_silence\": true  // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)\n",
        "        },\n",
        "    \n",
        "        \"distributed\":{\n",
        "            \"backend\": \"nccl\",\n",
        "            \"url\": \"tcp:\\/\\/localhost:54321\"\n",
        "        },\n",
        "    \n",
        "        \"reinit_layers\": [],\n",
        "    \n",
        "        \"model\": \"Tacotron\",          // one of the model in models/    \n",
        "        \"grad_clip\": 1,                // upper limit for gradients for clipping.\n",
        "        \"epochs\": 1000,                // total number of epochs to train.\n",
        "        \"lr\": 0.0001,                  // Initial learning rate. If Noam decay is active, maximum learning rate.\n",
        "        \"lr_decay\": false,             // if true, Noam learning rate decaying is applied through training.\n",
        "        \"warmup_steps\": 4000,          // Noam decay steps to increase the learning rate from 0 to \"lr\"\n",
        "        \"windowing\": false,            // Enables attention windowing. Used only in eval mode.\n",
        "        \"memory_size\": 5,              // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. \n",
        "        \"attention_norm\": \"sigmoid\",   // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.\n",
        "        \"prenet_type\": \"original\",     // \"original\" or \"bn\".\n",
        "        \"prenet_dropout\": true,        // enable/disable dropout at prenet. \n",
        "        \"use_forward_attn\": true,      // if it uses forward attention. In general, it aligns faster.\n",
        "        \"forward_attn_mask\": false,    // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well.\n",
        "        \"transition_agent\": true,     // enable/disable transition agent of forward attention.\n",
        "        \"location_attn\": false,        // enable/disable location sensitive attention. It is enabled for TACOTRON by default.\n",
        "        \"loss_masking\": true,         // enable / disable loss masking against the sequence padding.\n",
        "        \"enable_eos_bos_chars\": false, // enable/disable beginning of sentence and end of sentence chars.\n",
        "        \"stopnet\": true,               // Train stopnet predicting the end of synthesis. \n",
        "        \"separate_stopnet\": true,     // Train stopnet separately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.\n",
        "        \"tb_model_param_stats\": false,     // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. \n",
        "        \n",
        "        \"batch_size\": 32,       // Batch size for training. Lower values than 32 might cause hard to learn attention.\n",
        "        \"eval_batch_size\":16,   \n",
        "        \"r\": 5,                 // Number of frames to predict for step.\n",
        "        \"wd\": 0.000001,         // Weight decay weight.\n",
        "        \"checkpoint\": true,     // If true, it saves checkpoints per \"save_step\"\n",
        "        \"save_step\": 1000,      // Number of training steps expected to save training stats and checkpoints.\n",
        "        \"print_step\": 10,       // Number of steps to log training on console.\n",
        "        \"batch_group_size\": 0,  //Number of batches to shuffle after bucketing.\n",
        "\n",
        "      \"test_delay_epochs\":1,\n",
        "      \"run_eval\": true,\n",
        "      \"test_sentences_file\": null, \n",
        "      \"data_path\": \"../../Data/LJSpeech-1.1/\",  // can be overwritten from command argument\n",
        "      \"meta_file_train\": \"metadata_train.csv\",      // metafile for training dataloader\n",
        "      \"meta_file_val\": \"metadata_val.csv\",    // metafile for validation dataloader\n",
        "      \"dataset\": \"ljspeech\",     // one of TTS.dataset.preprocessors, only valid if dataloader == \"TTSDataset\", rest uses \"tts_cache\" by default.\n",
        "      \"min_seq_len\": 0,       // DATASET-RELATED: minimum text length to use in training\n",
        "      \"max_seq_len\": 300,     // DATASET-RELATED: maximum text length\n",
        "      \"output_path\": \"../keep/\",\n",
        "      \"num_loader_workers\": 2,\n",
        "      \"num_val_loader_workers\": 2,\n",
        "       \"phoneme_cache_path\": \"ljspeech_phonemes\",  // phoneme computation is slow, therefore, it caches results in the given folder.\n",
        "      \"use_phonemes\": true,           // use phonemes instead of raw characters. It is suggested for better pronunciation.\n",
        "      \"phoneme_language\": \"en-us\",     // depending on your target language, pick one from  https://github.com/bootphon/phonemizer#languages\n",
        "      \"text_cleaner\": \"phoneme_cleaners\"\n",
        "    }\n",
        "    "
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8L3JjJOBErxq",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# pull the trigger\n",
        "!python train.py --config_path config.json --data_path ../LJSpeech-1.1/ | tee training.log"
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "TTS_example.ipynb",
	"version": "0.3.2",
	"provenance": [],
	"collapsed_sections": [],
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"accelerator": "GPU"
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/erogol/97516ad65b44dbddb8cd694953187c5b/tts_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "cjD0xW0cEMVT",
	"colab_type": "text"
	},
	"source": [
	"## Hands-on example for TTS [https://github.com/mozilla/TTS](https://github.com/mozilla/TTS)\n",
	"\n",
	"This notebook trains a Tacotron model on the LJSpeech dataset."
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab_type": "code",
	"id": "XGiNTMShZYvj",
	"colab": {}
	},
	"source": [
	"# download LJSpeech dataset\n",
	"!wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2\n",
	"# decompress\n",
	"!tar -xvjf LJSpeech-1.1.tar.bz2"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "__k0BrbfLQ-F",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# create train-val splits\n",
	"!shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv\n",
	"!head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv\n",
	"!tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "pyJwcU9pDUE-",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# get TTS to your local\n",
	"!git clone https://github.com/mozilla/TTS"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "zV-vHTWyirQv",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# install the espeak backend, used when training on phonemes instead of raw characters\n",
	"# -y pre-answers apt-get's confirmation prompt so the cell can finish without user input\n",
	"!sudo apt-get install -y espeak\n",
	"!pip install soundfile"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "xwvg3-nVDL5t",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# enter the cloned repository; written as an explicit %cd magic so it does not rely on automagic\n",
	"%cd TTS"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "G1OnsNyJJtem",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# install TTS requirements\n",
	"!pip install -r requirements.txt"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "y7_Xao7uNOvX",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"%%writefile config.json\n",
	"// config.json for LJSpeech (the %%writefile magic has to be the first line of the cell, so this note lives inside the file)\n",
	"{\n",
	" \"run_name\": \"mozilla-tacotron-tagent-bn\",\n",
	" \"run_description\": \"compare the attention with gst model which does not align with the same config\",\n",
	" \n",
	" \"audio\":{\n",
	" // Audio processing parameters\n",
	" \"num_mels\": 80, // size of the mel spec frame. \n",
	" \"num_freq\": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.\n",
	" \"sample_rate\": 22050, // DATASET-RELATED: wav sample-rate. If different than the original data, it is resampled.\n",
	" \"frame_length_ms\": 50, // stft window length in ms.\n",
	" \"frame_shift_ms\": 12.5, // stft window hop-length in ms.\n",
	" \"preemphasis\": 0.98, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no pre-emphasis.\n",
	" \"min_level_db\": -100, // normalization range\n",
	" \"ref_level_db\": 20, // reference level db, theoretically 20db is the sound of air.\n",
	" \"power\": 1.5, // value to sharpen wav signals after GL algorithm.\n",
	" \"griffin_lim_iters\": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.\n",
	" // Normalization parameters\n",
	" \"signal_norm\": true, // normalize the spec values in range [0, 1]\n",
	" \"symmetric_norm\": false, // move normalization to range [-1, 1]\n",
	" \"max_norm\": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]\n",
	" \"clip_norm\": true, // clip normalized values into the range.\n",
	" \"mel_fmin\": 0.0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!\n",
	" \"mel_fmax\": 8000.0, // maximum freq level for mel-spec. Tune for dataset!!\n",
	" \"do_trim_silence\": true // enable trimming of silence of audio as you load it. LJspeech (false), TWEB (false), Nancy (true)\n",
	" },\n",
	" \n",
	" \"distributed\":{\n",
	" \"backend\": \"nccl\",\n",
	" \"url\": \"tcp:\\/\\/localhost:54321\"\n",
	" },\n",
	" \n",
	" \"reinit_layers\": [],\n",
	" \n",
	" \"model\": \"Tacotron\", // one of the model in models/ \n",
	" \"grad_clip\": 1, // upper limit for gradients for clipping.\n",
	" \"epochs\": 1000, // total number of epochs to train.\n",
	" \"lr\": 0.0001, // Initial learning rate. If Noam decay is active, maximum learning rate.\n",
	" \"lr_decay\": false, // if true, Noam learning rate decaying is applied through training.\n",
	" \"warmup_steps\": 4000, // Noam decay steps to increase the learning rate from 0 to \"lr\"\n",
	" \"windowing\": false, // Enables attention windowing. Used only in eval mode.\n",
	" \"memory_size\": 5, // ONLY TACOTRON - memory queue size used to queue network predictions to feed autoregressive connection. Useful if r < 5. \n",
	" \"attention_norm\": \"sigmoid\", // softmax or sigmoid. Suggested to use softmax for Tacotron2 and sigmoid for Tacotron.\n",
	" \"prenet_type\": \"original\", // \"original\" or \"bn\".\n",
	" \"prenet_dropout\": true, // enable/disable dropout at prenet. \n",
	" \"use_forward_attn\": true, // if it uses forward attention. In general, it aligns faster.\n",
	" \"forward_attn_mask\": false, // Apply forward attention mask at inference to prevent bad modes. Try it if your model does not align well.\n",
	" \"transition_agent\": true, // enable/disable transition agent of forward attention.\n",
	" \"location_attn\": false, // enable/disable location sensitive attention. It is enabled for TACOTRON by default.\n",
	" \"loss_masking\": true, // enable / disable loss masking against the sequence padding.\n",
	" \"enable_eos_bos_chars\": false, // enable/disable beginning of sentence and end of sentence chars.\n",
	" \"stopnet\": true, // Train stopnet predicting the end of synthesis. \n",
	" \"separate_stopnet\": true, // Train stopnet separately if 'stopnet==true'. It prevents stopnet loss to influence the rest of the model. It causes a better model, but it trains SLOWER.\n",
	" \"tb_model_param_stats\": false, // true, plots param stats per layer on tensorboard. Might be memory consuming, but good for debugging. \n",
	" \n",
	" \"batch_size\": 32, // Batch size for training. Lower values than 32 might cause hard to learn attention.\n",
	" \"eval_batch_size\":16, \n",
	" \"r\": 5, // Number of frames to predict for step.\n",
	" \"wd\": 0.000001, // Weight decay weight.\n",
	" \"checkpoint\": true, // If true, it saves checkpoints per \"save_step\"\n",
	" \"save_step\": 1000, // Number of training steps expected to save training stats and checkpoints.\n",
	" \"print_step\": 10, // Number of steps to log training on console.\n",
	" \"batch_group_size\": 0, //Number of batches to shuffle after bucketing.\n",
	"\n",
	" \"test_delay_epochs\":1,\n",
	" \"run_eval\": true,\n",
	" \"test_sentences_file\": null, \n",
	" \"data_path\": \"../../Data/LJSpeech-1.1/\", // can be overwritten from command argument\n",
	" \"meta_file_train\": \"metadata_train.csv\", // metafile for training dataloader\n",
	" \"meta_file_val\": \"metadata_val.csv\", // metafile for validation dataloader\n",
	" \"dataset\": \"ljspeech\", // one of TTS.dataset.preprocessors, only valid if dataloader == \"TTSDataset\", rest uses \"tts_cache\" by default.\n",
	" \"min_seq_len\": 0, // DATASET-RELATED: minimum text length to use in training\n",
	" \"max_seq_len\": 300, // DATASET-RELATED: maximum text length\n",
	" \"output_path\": \"../keep/\",\n",
	" \"num_loader_workers\": 2,\n",
	" \"num_val_loader_workers\": 2,\n",
	" \"phoneme_cache_path\": \"ljspeech_phonemes\", // phoneme computation is slow, therefore, it caches results in the given folder.\n",
	" \"use_phonemes\": true, // use phonemes instead of raw characters. It is suggested for better pronunciation.\n",
	" \"phoneme_language\": \"en-us\", // depending on your target language, pick one from https://github.com/bootphon/phonemizer#languages\n",
	" \"text_cleaner\": \"phoneme_cleaners\"\n",
	" }\n",
	" "
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "8L3JjJOBErxq",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# pull the trigger: start training and mirror the console output into training.log\n",
	"!python train.py --config_path config.json --data_path ../LJSpeech-1.1/ | tee training.log"
	],
	"execution_count": 0,
	"outputs": []
	}
	]
	}