
@naotokui
Created August 1, 2018 07:47
drumkit sound classification model used in https://codepen.io/naotokui/pen/rrGGNJ
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# data pre-process\n",
"\n",
"The drumkit sound dataset used for the training is available here:\n",
"https://s3-ap-northeast-1.amazonaws.com/codepen-dev/drumkit_dataset.zip"
]
},
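{
"cell_type": "markdown",
"metadata": {},
"source": [
"A convenience sketch (added for illustration): fetch and unpack the dataset with shell escapes. The extraction layout is an assumption; adjust it so the class folders end up under `./selected_drums/`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# run once; assumes wget and unzip are available on the system\n",
"# !wget https://s3-ap-northeast-1.amazonaws.com/codepen-dev/drumkit_dataset.zip\n",
"# !unzip -q drumkit_dataset.zip"
]
},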
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['0_kick', '1_snare', '2_hihat_closed', '3_hihat_open', '4_tom_low', '5_tom_mid', '6_tom_high', '7_clap', '8_rim']\n"
]
}
],
"source": [
"import librosa\n",
"import librosa.display\n",
"import numpy as np\n",
"from glob import glob\n",
"%matplotlib inline\n",
"\n",
"N_FFT = 1024\n",
"HOP_LENGTH = 256 \n",
"SR = 16000\n",
"MELSPEC_SIZE = 128;\n",
"\n",
"len_src = 3. \n",
"ref_n_src = int(SR * len_src)\n",
"\n",
"drum_dirs = [r.split('/')[-1] for r in sorted(glob('./selected_drums/*'))]\n",
"NB_CLASS = len(drum_dirs)\n",
"\n",
"print drum_dirs\n",
"\n",
"def get_melspec(filepath, hop_length=HOP_LENGTH, n_mels=128):\n",
"\n",
" y_tmp = np.zeros(ref_n_src)\n",
" \n",
" y, sr = librosa.core.load(filepath, sr = SR, mono=True)\n",
" y = y[:ref_n_src]\n",
" y_tmp[:len(y)] = y[:ref_n_src]\n",
" \n",
" # sfft -> mel conversion\n",
" melspec = librosa.feature.melspectrogram(y=y_tmp, sr=sr,\n",
" n_fft=N_FFT, hop_length=hop_length, n_mels=n_mels)\n",
" S = librosa.power_to_db(melspec, np.max) \n",
" \n",
" return S"
]
},
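{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (added for illustration, not part of the original run): compute the mel spectrogram of one training file and plot it. The glob pattern assumes the `0_kick` folder from the printed class list."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"sample_path = glob('./selected_drums/0_kick/*')[0]\n",
"S = get_melspec(sample_path)\n",
"print S.shape  # (n_mels, n_frames)\n",
"\n",
"librosa.display.specshow(S, sr=SR, hop_length=HOP_LENGTH, x_axis='time', y_axis='mel')\n",
"plt.colorbar(format='%+2.0f dB')\n",
"plt.title(sample_path)"
]
},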
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\r",
" 0%| | 0/27069 [00:00<?, ?it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"27069\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 27069/27069 [3:13:11<00:00, 2.34it/s] \n"
]
}
],
"source": [
"from glob import glob\n",
"from tqdm import tqdm\n",
"from random import shuffle\n",
"\n",
"\n",
"filepaths = glob(\"./selected_drums/*/*\")\n",
"# filepaths2 = glob(\"./augmented/*/*\")\n",
"# filepaths.extend(filepaths2)\n",
"print len(filepaths)\n",
"\n",
"\n",
"shuffle(filepaths)\n",
"\n",
"drum_genres = []\n",
"drum_melspecs=[]\n",
"NB_CLASS = len(drum_dirs)\n",
"\n",
"\n",
"for filepath in tqdm(filepaths):\n",
" dir_ = filepath.split(\"/\")[-2]\n",
" genre = drum_dirs.index(dir_)\n",
" \n",
" try:\n",
" melspec = get_melspec(filepath, HOP_LENGTH, MELSPEC_SIZE)\n",
"\n",
" if melspec.shape[1] > MELSPEC_SIZE:\n",
" melspec = melspec[:,:MELSPEC_SIZE]\n",
" else:\n",
" melspec.resize((MELSPEC_SIZE,MELSPEC_SIZE)) \n",
"\n",
" drum_genres.append(genre)\n",
" drum_melspecs.append(melspec)\n",
" except:\n",
" print (\"error\", filepath)"
]
},
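{
"cell_type": "markdown",
"metadata": {},
"source": [
"A small check (added for illustration): inspect the per-class sample counts before one-hot encoding. The dataset is heavily imbalanced, which motivates the class weights computed in the training section below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# drum_genres is still a flat list of class indices at this point\n",
"counts = np.bincount(np.array(drum_genres), minlength=NB_CLASS)\n",
"for name, count in zip(drum_dirs, counts):\n",
"    print name, count"
]
},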
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(27069,)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(27069, 9)\n",
"(27069, 128, 128, 1)\n"
]
}
],
"source": [
"drum_genres = np.array(drum_genres)\n",
"print(drum_genres.shape)\n",
"\n",
"from keras.utils import to_categorical\n",
"\n",
"drum_genres = to_categorical(drum_genres, NB_CLASS)\n",
"print(drum_genres.shape)\n",
"\n",
"drum_melspecs = np.array(drum_melspecs)\n",
"drum_melspecs = np.expand_dims(drum_melspecs, 3)\n",
"print(drum_melspecs.shape)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"np.savez(\"drum_data_128.npz\", melspecs=drum_melspecs, genres=drum_genres)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"## loading from pre-processed npz file\n",
"# drum_melspecs = np.load(\"drum_data_128.npz\")['melspecs']\n",
"# drum_genres = np.load(\"drum_data_128.npz\")['genres']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# training"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import os\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
"\n",
"from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten\n",
"from keras.layers import BatchNormalization,Activation\n",
"from keras.layers.advanced_activations import ELU\n",
"\n",
"from keras.models import Model\n",
"from keras import backend as K\n",
"\n",
"SIZE = MELSPEC_SIZE\n",
"\n",
"input_img = Input(shape=(SIZE, SIZE, 1)) # normalized, 128 x 128\n",
"\n",
"x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal')(input_img) #nb_filter, nb_row, nb_col\n",
"x = BatchNormalization(axis=1)(x)\n",
"x = ELU(alpha=1.0)(x)\n",
"x = MaxPooling2D((4, 4))(x)\n",
"\n",
"x = Conv2D(64, (3, 3), padding='same',kernel_initializer='he_normal')(x)\n",
"x = BatchNormalization(axis=1)(x)\n",
"x = ELU(alpha=1.0)(x)\n",
"x = MaxPooling2D((2, 2))(x)\n",
"\n",
"\n",
"# x = Conv2D(64, (3, 3), activation='relu', padding='same', kernel_initializer='he_normal')(x)\n",
"# x = BatchNormalization(axis=1)(x)\n",
"# x = ELU(alpha=1.0)(x)\n",
"# x = MaxPooling2D((2, 2), padding='same')(x)\n",
"# print K.int_shape(x)\n",
"\n",
"x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal')(x)\n",
"x = BatchNormalization(axis=1)(x)\n",
"x = ELU(alpha=1.0)(x)\n",
"x = MaxPooling2D((2, 4))(x)\n",
"\n",
"x = Conv2D(32, (3, 3), padding='same', kernel_initializer='he_normal')(x)\n",
"x = BatchNormalization(axis=1)(x)\n",
"x = ELU(alpha=1.0)(x)\n",
"x = MaxPooling2D((2, 2))(x)\n",
"x = Flatten()(x)\n",
"x = Dense(NB_CLASS)(x)\n",
"y = Activation(\"softmax\")(x)\n",
"\n",
"model = Model(input_img, y)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"_________________________________________________________________\n",
"Layer (type) Output Shape Param # \n",
"=================================================================\n",
"input_1 (InputLayer) (None, 128, 128, 1) 0 \n",
"_________________________________________________________________\n",
"conv2d_1 (Conv2D) (None, 128, 128, 32) 320 \n",
"_________________________________________________________________\n",
"batch_normalization_1 (Batch (None, 128, 128, 32) 512 \n",
"_________________________________________________________________\n",
"elu_1 (ELU) (None, 128, 128, 32) 0 \n",
"_________________________________________________________________\n",
"max_pooling2d_1 (MaxPooling2 (None, 32, 32, 32) 0 \n",
"_________________________________________________________________\n",
"conv2d_2 (Conv2D) (None, 32, 32, 64) 18496 \n",
"_________________________________________________________________\n",
"batch_normalization_2 (Batch (None, 32, 32, 64) 128 \n",
"_________________________________________________________________\n",
"elu_2 (ELU) (None, 32, 32, 64) 0 \n",
"_________________________________________________________________\n",
"max_pooling2d_2 (MaxPooling2 (None, 16, 16, 64) 0 \n",
"_________________________________________________________________\n",
"conv2d_3 (Conv2D) (None, 16, 16, 32) 18464 \n",
"_________________________________________________________________\n",
"batch_normalization_3 (Batch (None, 16, 16, 32) 64 \n",
"_________________________________________________________________\n",
"elu_3 (ELU) (None, 16, 16, 32) 0 \n",
"_________________________________________________________________\n",
"max_pooling2d_3 (MaxPooling2 (None, 8, 4, 32) 0 \n",
"_________________________________________________________________\n",
"conv2d_4 (Conv2D) (None, 8, 4, 32) 9248 \n",
"_________________________________________________________________\n",
"batch_normalization_4 (Batch (None, 8, 4, 32) 32 \n",
"_________________________________________________________________\n",
"elu_4 (ELU) (None, 8, 4, 32) 0 \n",
"_________________________________________________________________\n",
"max_pooling2d_4 (MaxPooling2 (None, 4, 2, 32) 0 \n",
"_________________________________________________________________\n",
"flatten_1 (Flatten) (None, 256) 0 \n",
"_________________________________________________________________\n",
"dense_1 (Dense) (None, 9) 2313 \n",
"_________________________________________________________________\n",
"activation_1 (Activation) (None, 9) 0 \n",
"=================================================================\n",
"Total params: 49,577\n",
"Trainable params: 49,209\n",
"Non-trainable params: 368\n",
"_________________________________________________________________\n"
]
}
],
"source": [
"model.summary()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{0: 3.808244231851435, 1: 3.4399542508577965, 2: 13.159455517744288, 3: 8.788636363636364, 4: 13.089458413926499, 5: 17.703727926749508, 6: 13.747587607922803, 7: 21.18075117370892, 8: 243.86486486486487}\n"
]
}
],
"source": [
"class_weight = {}\n",
"total = drum_genres.shape[0]\n",
"for i in range(NB_CLASS):\n",
" nb = np.sum(np.argmax(drum_genres, axis=1) == i)\n",
" class_weight[i] = total / float(nb) \n",
"print class_weight\n",
" "
]
},
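{
"cell_type": "markdown",
"metadata": {},
"source": [
"An alternative sketch (not used in the run above): scikit-learn's `'balanced'` heuristic produces the same ratios scaled down by the number of classes, i.e. `total / (NB_CLASS * nb)`, which keeps the weighted loss on a scale closer to the unweighted one. Note that newer scikit-learn versions require keyword arguments here."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.utils.class_weight import compute_class_weight\n",
"\n",
"labels = np.argmax(drum_genres, axis=1)\n",
"weights = compute_class_weight('balanced', np.arange(NB_CLASS), labels)\n",
"print dict(enumerate(weights))"
]
},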
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"27069 24362\n"
]
}
],
"source": [
"nb_total = drum_melspecs.shape[0]\n",
"nb_train = int(nb_total * 0.9)\n",
"print nb_total, nb_train\n",
"\n",
"train_melspecs = drum_melspecs[:nb_train]\n",
"train_genres = drum_genres[:nb_train]\n",
"\n",
"val_melspecs = drum_melspecs[nb_train:]\n",
"val_genres = drum_genres[nb_train:]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(24362, 9)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_melspecs.shape\n",
"train_genres.shape\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Train on 24362 samples, validate on 2707 samples\n",
"Epoch 1/100\n",
"24362/24362 [==============================] - 15s 620us/step - loss: 9.6032 - acc: 0.5979 - val_loss: 1.0786 - val_acc: 0.6151\n",
"Epoch 2/100\n",
"24362/24362 [==============================] - 14s 591us/step - loss: 7.6791 - acc: 0.7147 - val_loss: 0.7552 - val_acc: 0.7577\n",
"Epoch 3/100\n",
"24362/24362 [==============================] - 14s 588us/step - loss: 6.9066 - acc: 0.7466 - val_loss: 0.7529 - val_acc: 0.7639\n",
"Epoch 4/100\n",
"24362/24362 [==============================] - 14s 584us/step - loss: 6.4276 - acc: 0.7641 - val_loss: 0.6297 - val_acc: 0.7917\n",
"Epoch 5/100\n",
"24362/24362 [==============================] - 14s 589us/step - loss: 5.9326 - acc: 0.7826 - val_loss: 0.6280 - val_acc: 0.7883\n",
"Epoch 6/100\n",
"24362/24362 [==============================] - 14s 589us/step - loss: 5.6260 - acc: 0.7909 - val_loss: 0.5761 - val_acc: 0.8038\n",
"Epoch 7/100\n",
"24362/24362 [==============================] - 14s 581us/step - loss: 5.2758 - acc: 0.8036 - val_loss: 0.5437 - val_acc: 0.8138\n",
"Epoch 8/100\n",
"24362/24362 [==============================] - 14s 589us/step - loss: 4.9793 - acc: 0.8125 - val_loss: 0.5270 - val_acc: 0.8109\n",
"Epoch 9/100\n",
"24362/24362 [==============================] - 14s 580us/step - loss: 4.6689 - acc: 0.8223 - val_loss: 0.5058 - val_acc: 0.8245\n",
"Epoch 10/100\n",
"24362/24362 [==============================] - 14s 586us/step - loss: 4.4284 - acc: 0.8323 - val_loss: 0.5059 - val_acc: 0.8168\n",
"Epoch 11/100\n",
"24362/24362 [==============================] - 14s 591us/step - loss: 4.1443 - acc: 0.8411 - val_loss: 0.4714 - val_acc: 0.8312\n",
"Epoch 12/100\n",
"24362/24362 [==============================] - 14s 586us/step - loss: 3.8199 - acc: 0.8530 - val_loss: 0.4355 - val_acc: 0.8400\n",
"Epoch 13/100\n",
"24362/24362 [==============================] - 14s 592us/step - loss: 3.5505 - acc: 0.8624 - val_loss: 0.4160 - val_acc: 0.8437\n",
"Epoch 14/100\n",
"24362/24362 [==============================] - 14s 590us/step - loss: 3.2395 - acc: 0.8756 - val_loss: 0.3993 - val_acc: 0.8537\n",
"Epoch 15/100\n",
"24362/24362 [==============================] - 14s 590us/step - loss: 2.9702 - acc: 0.8843 - val_loss: 0.3749 - val_acc: 0.8655\n",
"Epoch 16/100\n",
"24362/24362 [==============================] - 14s 593us/step - loss: 2.7573 - acc: 0.8922 - val_loss: 0.3710 - val_acc: 0.8633\n",
"Epoch 17/100\n",
"24362/24362 [==============================] - 14s 587us/step - loss: 2.4935 - acc: 0.9005 - val_loss: 0.3590 - val_acc: 0.8696\n",
"Epoch 18/100\n",
"24362/24362 [==============================] - 14s 594us/step - loss: 2.2897 - acc: 0.9097 - val_loss: 0.3597 - val_acc: 0.8733\n",
"Epoch 19/100\n",
"24362/24362 [==============================] - 14s 565us/step - loss: 2.0280 - acc: 0.9197 - val_loss: 0.3834 - val_acc: 0.8633\n",
"Epoch 20/100\n",
"24362/24362 [==============================] - 14s 561us/step - loss: 1.8911 - acc: 0.9221 - val_loss: 0.3902 - val_acc: 0.8626\n",
"Epoch 21/100\n",
"24362/24362 [==============================] - 14s 556us/step - loss: 2.1735 - acc: 0.9143 - val_loss: 0.4030 - val_acc: 0.8659\n",
"Epoch 22/100\n",
"24362/24362 [==============================] - 14s 572us/step - loss: 1.7177 - acc: 0.9296 - val_loss: 0.3557 - val_acc: 0.8814\n",
"Epoch 23/100\n",
"24362/24362 [==============================] - 14s 588us/step - loss: 1.3327 - acc: 0.9449 - val_loss: 0.3461 - val_acc: 0.8881\n",
"Epoch 24/100\n",
"24362/24362 [==============================] - 14s 594us/step - loss: 1.2125 - acc: 0.9498 - val_loss: 0.3828 - val_acc: 0.8818\n",
"Epoch 25/100\n",
"24362/24362 [==============================] - 14s 587us/step - loss: 1.1479 - acc: 0.9516 - val_loss: 0.3617 - val_acc: 0.8855\n",
"Epoch 26/100\n",
"24362/24362 [==============================] - 14s 592us/step - loss: 1.0277 - acc: 0.9575 - val_loss: 0.3443 - val_acc: 0.8947\n",
"Epoch 27/100\n",
"24362/24362 [==============================] - 14s 594us/step - loss: 0.9107 - acc: 0.9624 - val_loss: 0.3447 - val_acc: 0.8958\n",
"Epoch 28/100\n",
"24362/24362 [==============================] - 14s 591us/step - loss: 0.9255 - acc: 0.9601 - val_loss: 0.3940 - val_acc: 0.8833\n",
"Epoch 29/100\n",
"24362/24362 [==============================] - 14s 592us/step - loss: 0.8915 - acc: 0.9615 - val_loss: 0.4088 - val_acc: 0.8907\n",
"Epoch 30/100\n",
"24362/24362 [==============================] - 14s 585us/step - loss: 0.8569 - acc: 0.9626 - val_loss: 0.3988 - val_acc: 0.8840\n",
"Epoch 31/100\n",
"24362/24362 [==============================] - 14s 594us/step - loss: 0.8020 - acc: 0.9649 - val_loss: 0.3974 - val_acc: 0.8914\n",
"Epoch 00031: early stopping\n"
]
},
{
"data": {
"text/plain": [
"<keras.callbacks.History at 0x7fc058f91c50>"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from keras.callbacks import EarlyStopping\n",
"es = EarlyStopping(verbose=1, patience=5)\n",
"\n",
"model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['acc'])\n",
"\n",
"model.fit(train_melspecs, train_genres, batch_size=64, \n",
" epochs=100, verbose=1, \n",
" shuffle=False,validation_data = (val_melspecs, val_genres), class_weight=class_weight, callbacks=[es])\n"
]
},
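{
"cell_type": "markdown",
"metadata": {},
"source": [
"A follow-up sketch (added for illustration): evaluate the trained model on the validation split and run a single prediction, mapping the argmax back to a class name."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"loss, acc = model.evaluate(val_melspecs, val_genres, verbose=0)\n",
"print loss, acc\n",
"\n",
"# predict a single spectrogram (batch of one)\n",
"probs = model.predict(val_melspecs[:1])\n",
"print drum_dirs[np.argmax(probs[0])]"
]
},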
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"model.save(\"model/drum_spec_model_128.h5\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"then you can conver keras model into tensorflow.js model with the following command \n",
"\n",
"```$ tensorflowjs_converter --input_format keras \\\n",
" path/to/my_model.h5 \\\n",
" path/to/tfjs_target_dir```"
]
},
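{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, the `tensorflowjs` pip package exposes the converter as a Python API (a sketch; the output directory is a placeholder):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pip install tensorflowjs\n",
"import tensorflowjs as tfjs\n",
"\n",
"tfjs.converters.save_keras_model(model, \"model/tfjs_model\")"
]
},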
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@domyounglee

Thanks for sharing the code, Naotokui. :)
But I found an issue in your code:
in the BatchNormalization layers, the axis should be -1 when the channel is the last dimension (if you run on TensorFlow-based Keras), e.g.:
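```python
# channels-last tensors are shaped (batch, height, width, channels),
# so batch norm should normalize over the last axis:
x = BatchNormalization(axis=-1)(x)
```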

And could you tell me where the augmented dataset is?
The instance count of the shared dataset (https://s3-ap-northeast-1.amazonaws.com/codepen-dev/drumkit_dataset.zip)
differs from the result shown in the notebook:
I get only 2,574 samples, while your notebook output shows 27,069 instances being trained.
