Skip to content

Instantly share code, notes, and snippets.

@pierrelouisbescond
Created June 13, 2020 11:46
Show Gist options
  • Save pierrelouisbescond/6b0914dcf47f4f74c79fc497d3f17cd2 to your computer and use it in GitHub Desktop.
Save pierrelouisbescond/6b0914dcf47f4f74c79fc497d3f17cd2 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "augmented_data.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "kgrZuKtll4st",
"colab_type": "code",
"colab": {}
},
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.preprocessing import normalize\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"from tensorflow.keras import models\n",
"from tensorflow.keras import layers\n",
"from tensorflow.keras.layers import Dense\n",
"from tensorflow.keras.layers import Input\n",
"from tensorflow.keras.layers import Activation\n",
"from tensorflow.keras.layers import BatchNormalization\n",
"from tensorflow.keras.layers import Dropout\n",
"\n",
"from google.colab import drive"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "i58dfbBAtJAD",
"colab_type": "code",
"outputId": "d0d8b55a-916a-4df6-e29e-11abb83990a4",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"drive.mount('/content/drive')"
],
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"text": [
"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "v59Om7CB2aCh",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "cc0b5577-223f-41f6-b487-af08efd2ea9d"
},
"source": [
"df_initial = pd.read_excel(\"/content/drive/My Drive/Medium/aug_data_initial_dataset.xls\", index_col=0)\n",
"df_initial.shape"
],
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"(251, 52)"
]
},
"metadata": {
"tags": []
},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "tSxEFgGK28dz",
"colab_type": "code",
"colab": {}
},
"source": [
"X = df_initial.drop(\"y\", axis=1)\n",
"y = df_initial[\"y\"]"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "xadSBgTgmVUP",
"colab_type": "code",
"colab": {}
},
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "HYSP7ZjjUlcZ",
"colab_type": "code",
"colab": {}
},
"source": [
"# The piecewise function will decrease the learning rate according to the number of epochs\n",
"def piecewise_lr(epoch):\n",
" if epoch < 100:\n",
" return 0.1\n",
" elif epoch < 250:\n",
" return 0.01\n",
" else:\n",
" return 0.001\n",
"\n",
"lr_callback = tf.keras.callbacks.LearningRateScheduler(piecewise_lr, verbose=True)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "62m4-De9nXwo",
"colab_type": "code",
"colab": {}
},
"source": [
"# Model architecture creation into the form of a function\n",
"def build_model():\n",
" model = keras.Sequential([\n",
" layers.Dense(df_initial.shape[1], activation='relu', input_shape=[X.shape[1]]),\n",
" layers.BatchNormalization(),\n",
" layers.Dense(12, activation='relu'),\n",
" layers.BatchNormalization(),\n",
" layers.Dense(1)\n",
" ])\n",
"\n",
" optimizer = tf.keras.optimizers.SGD(0.1)\n",
"\n",
" model.compile(loss='mae',\n",
" optimizer=optimizer,\n",
" metrics=['mae'])\n",
" return model"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "1bKYMlPVokLU",
"colab_type": "code",
"colab": {}
},
"source": [
"initial_model = build_model()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "RoDZPQ0Vo_D0",
"colab_type": "code",
"colab": {}
},
"source": [
"early_stop = keras.callbacks.ModelCheckpoint('best_initial_model2.h5',\n",
" monitor='val_loss',\n",
" verbose=1,\n",
" save_best_only=True,\n",
" save_weights_only=False,\n",
" mode='auto',\n",
" save_freq='epoch')\n",
"\n",
"history = initial_model.fit(X_train, y_train,\n",
" epochs = 500,\n",
" batch_size=10, \n",
" validation_data=(X_test, y_test),\n",
" callbacks=[early_stop, lr_callback],\n",
" verbose=2,\n",
" );"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "fZOoCuXh-9lS",
"colab_type": "text"
},
"source": [
"Best MAE obtained with initial_model: 0.04838."
]
},
{
"cell_type": "code",
"metadata": {
"id": "TcH-2UaRKv3G",
"colab_type": "code",
"colab": {}
},
"source": [
"# Data Augmentation factors\n",
"replication_factor = 2\n",
"sensor_accuracy_pct = 1\n",
"\n",
"# X_train and y_train are converted to DataFrame to ease their replication\n",
"X_train_df = pd.DataFrame(X_train, columns=df_initial.columns[:-1])\n",
"y_train_df = pd.DataFrame(y_train, columns=[\"y\"])\n",
"\n",
"# Augmented data creation, replicated from the original dataset\n",
"augmented_X_train = pd.concat([X_train_df]*replication_factor, ignore_index=True)\n",
"augmented_y_train = pd.concat([y_train_df]*replication_factor, ignore_index=True)\n",
"\n",
"# y values are modified randomly within the accuracy range \n",
"augmented_y_train[\"y\"] = augmented_y_train[\"y\"]*( (100-sensor_accuracy_pct)/100 + 2 * sensor_accuracy_pct * np.random.random()/100)\n",
"\n",
"# Concatenation of initial and augmented datasets\n",
"augmented_X_train = pd.concat([X_train_df,augmented_X_train], ignore_index=True)\n",
"augmented_y_train = pd.concat([y_train_df,augmented_y_train], ignore_index=True)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "8U5nvNJkO9uO",
"colab_type": "code",
"colab": {}
},
"source": [
"augmented_model = build_model()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "48tsXu0xOm13",
"colab_type": "code",
"colab": {}
},
"source": [
"early_stop = keras.callbacks.ModelCheckpoint('best_augmented_model2.h5',\n",
" monitor='val_loss',\n",
" verbose=1,\n",
" save_best_only=True,\n",
" save_weights_only=False,\n",
" mode='auto',\n",
" save_freq='epoch')\n",
"\n",
"history = augmented_model.fit(augmented_X_train, augmented_y_train,\n",
" epochs = 500,\n",
" batch_size=10, \n",
" validation_data=(X_test, y_test),\n",
" callbacks=[early_stop, lr_callback],\n",
" verbose=2,\n",
" );"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Nxkf2y1T_H5O",
"colab_type": "text"
},
"source": [
"Best MAE obtained with initial_model: **0.04838.**\n",
"\n",
"Best MAE obtained with augmented_model (replication: 1): **0.04521.**\n",
"\n",
"Best MAE obtained with augmented_model (replication: 2): **0.04198.**\n",
"\n",
"Best MAE obtained with augmented_model (replication: 5): **0.04029.**\n",
"\n",
"Best MAE obtained with augmented_model (replication:10): **0.04620.**"
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment