pierrelouisbescond · June 13, 2020 11:46
diff --git a/augmented_data.ipynb b/augmented_data.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "augmented_data.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "toc_visible": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "metadata": {
        "id": "kgrZuKtll4st",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "import pandas as pd\n",
        "import numpy as np\n",
        "\n",
        "from sklearn.preprocessing import normalize\n",
        "from sklearn.model_selection import train_test_split\n",
        "\n",
        "import tensorflow as tf\n",
        "from tensorflow import keras\n",
        "from tensorflow.keras import models\n",
        "from tensorflow.keras import layers\n",
        "from tensorflow.keras.layers import Dense\n",
        "from tensorflow.keras.layers import Input\n",
        "from tensorflow.keras.layers import Activation\n",
        "from tensorflow.keras.layers import BatchNormalization\n",
        "from tensorflow.keras.layers import Dropout\n",
        "\n",
        "from google.colab import drive"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "i58dfbBAtJAD",
        "colab_type": "code",
        "outputId": "d0d8b55a-916a-4df6-e29e-11abb83990a4",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "drive.mount('/content/drive')"
      ],
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "v59Om7CB2aCh",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "cc0b5577-223f-41f6-b487-af08efd2ea9d"
      },
      "source": [
        "df_initial = pd.read_excel(\"/content/drive/My Drive/Medium/aug_data_initial_dataset.xls\", index_col=0)\n",
        "df_initial.shape"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "(251, 52)"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 3
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "tSxEFgGK28dz",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "X = df_initial.drop(\"y\", axis=1)\n",
        "y = df_initial[\"y\"]"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xadSBgTgmVUP",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Hold out 20% of the rows for validation.\n",
        "# The seed is fixed so that the split, and therefore the validation MAE reported below,\n",
        "# is the same after every kernel restart.\n",
        "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HYSP7ZjjUlcZ",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def piecewise_lr(epoch):\n",
        "  \"\"\"Step-wise learning-rate schedule: 0.1 below epoch 100, 0.01 below epoch 250, 0.001 afterwards.\"\"\"\n",
        "  # (exclusive upper epoch bound, learning rate) pairs, checked in order\n",
        "  for upper_bound, rate in ((100, 0.1), (250, 0.01)):\n",
        "    if epoch < upper_bound:\n",
        "      return rate\n",
        "  return 0.001\n",
        "\n",
        "# Callback handing piecewise_lr to Keras; verbose=True makes it report the rate it applies\n",
        "lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule=piecewise_lr, verbose=True)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "62m4-De9nXwo",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "def build_model():\n",
        "  \"\"\"Create and compile a fresh, untrained regression network.\n",
        "\n",
        "  Layout: Dense(df_initial.shape[1], relu) -> BatchNorm -> Dense(12, relu) -> BatchNorm -> Dense(1).\n",
        "  The input width is X.shape[1]; both `df_initial` and `X` are read from the notebook's globals.\n",
        "  Returns the model compiled with SGD (initial rate 0.1) and MAE as loss and metric.\n",
        "  \"\"\"\n",
        "  model = keras.Sequential()\n",
        "  model.add(layers.Dense(df_initial.shape[1], activation='relu', input_shape=[X.shape[1]]))\n",
        "  model.add(layers.BatchNormalization())\n",
        "  model.add(layers.Dense(12, activation='relu'))\n",
        "  model.add(layers.BatchNormalization())\n",
        "  model.add(layers.Dense(1))\n",
        "\n",
        "  model.compile(optimizer=tf.keras.optimizers.SGD(0.1),\n",
        "                loss='mae',\n",
        "                metrics=['mae'])\n",
        "  return model"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1bKYMlPVokLU",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "initial_model = build_model()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "RoDZPQ0Vo_D0",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Despite its name, `early_stop` is a checkpoint: training is not interrupted, the full model\n",
        "# is written to disk each time the validation loss reaches a new best value\n",
        "early_stop = keras.callbacks.ModelCheckpoint(\n",
        "    filepath='best_initial_model2.h5',\n",
        "    monitor='val_loss',\n",
        "    mode='auto',\n",
        "    save_best_only=True,\n",
        "    save_weights_only=False,\n",
        "    save_freq='epoch',\n",
        "    verbose=1,\n",
        ")\n",
        "\n",
        "# Baseline run: original training samples only, validated on the held-out split\n",
        "history = initial_model.fit(\n",
        "    x=X_train,\n",
        "    y=y_train,\n",
        "    validation_data=(X_test, y_test),\n",
        "    batch_size=10,\n",
        "    epochs=500,\n",
        "    callbacks=[early_stop, lr_callback],\n",
        "    verbose=2,\n",
        ")"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fZOoCuXh-9lS",
        "colab_type": "text"
      },
      "source": [
        "Best MAE obtained with initial_model: 0.04838."
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TcH-2UaRKv3G",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Data Augmentation factors\n",
        "replication_factor = 2    # number of noisy copies of the training set to generate\n",
        "sensor_accuracy_pct = 1   # half-width of the noise band applied to y, in percent\n",
        "\n",
        "# X_train and y_train are converted to DataFrame to ease their replication.\n",
        "# Column labels come from X itself rather than from df_initial.columns[:-1], which\n",
        "# silently assumed that \"y\" is the last column of the dataset.\n",
        "X_train_df = pd.DataFrame(X_train, columns=X.columns)\n",
        "y_train_df = pd.DataFrame(y_train, columns=[\"y\"])\n",
        "\n",
        "# Augmented data creation, replicated from the original dataset\n",
        "augmented_X_train = pd.concat([X_train_df]*replication_factor, ignore_index=True)\n",
        "augmented_y_train = pd.concat([y_train_df]*replication_factor, ignore_index=True)\n",
        "\n",
        "# y values are modified randomly within the accuracy range: one independent factor per\n",
        "# replicated row, uniform in [1 - pct/100, 1 + pct/100). A bare np.random.random() call\n",
        "# returns a single scalar, which would rescale every target by the same amount.\n",
        "noise_factors = (100-sensor_accuracy_pct)/100 + 2 * sensor_accuracy_pct * np.random.random(len(augmented_y_train))/100\n",
        "augmented_y_train[\"y\"] = augmented_y_train[\"y\"] * noise_factors\n",
        "\n",
        "# Concatenation of initial and augmented datasets\n",
        "augmented_X_train = pd.concat([X_train_df,augmented_X_train], ignore_index=True)\n",
        "augmented_y_train = pd.concat([y_train_df,augmented_y_train], ignore_index=True)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8U5nvNJkO9uO",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "augmented_model = build_model()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "48tsXu0xOm13",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "# Despite its name, `early_stop` is a checkpoint: training is not interrupted, the full model\n",
        "# is written to disk each time the validation loss reaches a new best value\n",
        "early_stop = keras.callbacks.ModelCheckpoint(\n",
        "    filepath='best_augmented_model2.h5',\n",
        "    monitor='val_loss',\n",
        "    mode='auto',\n",
        "    save_best_only=True,\n",
        "    save_weights_only=False,\n",
        "    save_freq='epoch',\n",
        "    verbose=1,\n",
        ")\n",
        "\n",
        "# Same settings as the baseline run, but trained on the original + augmented samples;\n",
        "# validation still uses the untouched held-out split\n",
        "history = augmented_model.fit(\n",
        "    x=augmented_X_train,\n",
        "    y=augmented_y_train,\n",
        "    validation_data=(X_test, y_test),\n",
        "    batch_size=10,\n",
        "    epochs=500,\n",
        "    callbacks=[early_stop, lr_callback],\n",
        "    verbose=2,\n",
        ")"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Nxkf2y1T_H5O",
        "colab_type": "text"
      },
      "source": [
        "Best MAE obtained with initial_model: **0.04838.**\n",
        "\n",
        "Best MAE obtained with augmented_model (replication: 1): **0.04521.**\n",
        "\n",
        "Best MAE obtained with augmented_model (replication: 2): **0.04198.**\n",
        "\n",
        "Best MAE obtained with augmented_model (replication: 5): **0.04029.**\n",
        "\n",
        "Best MAE obtained with augmented_model (replication:10): **0.04620.**"
      ]
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "augmented_data.ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"toc_visible": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	}
	},
	"cells": [
	{
	"cell_type": "code",
	"metadata": {
	"id": "kgrZuKtll4st",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"import pandas as pd\n",
	"import numpy as np\n",
	"\n",
	"from sklearn.preprocessing import normalize\n",
	"from sklearn.model_selection import train_test_split\n",
	"\n",
	"import tensorflow as tf\n",
	"from tensorflow import keras\n",
	"from tensorflow.keras import models\n",
	"from tensorflow.keras import layers\n",
	"from tensorflow.keras.layers import Dense\n",
	"from tensorflow.keras.layers import Input\n",
	"from tensorflow.keras.layers import Activation\n",
	"from tensorflow.keras.layers import BatchNormalization\n",
	"from tensorflow.keras.layers import Dropout\n",
	"\n",
	"from google.colab import drive"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "i58dfbBAtJAD",
	"colab_type": "code",
	"outputId": "d0d8b55a-916a-4df6-e29e-11abb83990a4",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 34
	}
	},
	"source": [
	"drive.mount('/content/drive')"
	],
	"execution_count": 2,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "v59Om7CB2aCh",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 34
	},
	"outputId": "cc0b5577-223f-41f6-b487-af08efd2ea9d"
	},
	"source": [
	"df_initial = pd.read_excel(\"/content/drive/My Drive/Medium/aug_data_initial_dataset.xls\", index_col=0)\n",
	"df_initial.shape"
	],
	"execution_count": 3,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"(251, 52)"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 3
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "tSxEFgGK28dz",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"X = df_initial.drop(\"y\", axis=1)\n",
	"y = df_initial[\"y\"]"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "xadSBgTgmVUP",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "HYSP7ZjjUlcZ",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# The piecewise function will decrease the learning rate according to the number of epochs\n",
	"def piecewise_lr(epoch):\n",
	"  \"\"\"Return 0.1 below epoch 100, 0.01 below epoch 250 and 0.001 afterwards.\"\"\"\n",
	"  if epoch < 100:\n",
	"    return 0.1\n",
	"  elif epoch < 250:\n",
	"    return 0.01\n",
	"  else:\n",
	"    return 0.001\n",
	"\n",
	"# Callback handing piecewise_lr to Keras; verbose=True makes it report the rate it applies\n",
	"lr_callback = tf.keras.callbacks.LearningRateScheduler(piecewise_lr, verbose=True)"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "62m4-De9nXwo",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"def build_model():\n",
	"  \"\"\"Create and compile a fresh, untrained regression network.\n",
	"\n",
	"  Layout: Dense(df_initial.shape[1], relu) -> BatchNorm -> Dense(12, relu) -> BatchNorm -> Dense(1).\n",
	"  The input width is X.shape[1]; both `df_initial` and `X` are read from the notebook's globals.\n",
	"  Returns the model compiled with SGD (initial rate 0.1) and MAE as loss and metric.\n",
	"  \"\"\"\n",
	"  model = keras.Sequential()\n",
	"  model.add(layers.Dense(df_initial.shape[1], activation='relu', input_shape=[X.shape[1]]))\n",
	"  model.add(layers.BatchNormalization())\n",
	"  model.add(layers.Dense(12, activation='relu'))\n",
	"  model.add(layers.BatchNormalization())\n",
	"  model.add(layers.Dense(1))\n",
	"\n",
	"  model.compile(optimizer=tf.keras.optimizers.SGD(0.1),\n",
	"                loss='mae',\n",
	"                metrics=['mae'])\n",
	"  return model"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "1bKYMlPVokLU",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"initial_model = build_model()"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "RoDZPQ0Vo_D0",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"early_stop = keras.callbacks.ModelCheckpoint('best_initial_model2.h5',\n",
	" monitor='val_loss',\n",
	" verbose=1,\n",
	" save_best_only=True,\n",
	" save_weights_only=False,\n",
	" mode='auto',\n",
	" save_freq='epoch')\n",
	"\n",
	"history = initial_model.fit(X_train, y_train,\n",
	" epochs = 500,\n",
	" batch_size=10, \n",
	" validation_data=(X_test, y_test),\n",
	" callbacks=[early_stop, lr_callback],\n",
	" verbose=2,\n",
	" );"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "fZOoCuXh-9lS",
	"colab_type": "text"
	},
	"source": [
	"Best MAE obtained with initial_model: 0.04838."
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "TcH-2UaRKv3G",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"# Data Augmentation factors\n",
	"replication_factor = 2    # number of noisy copies of the training set to generate\n",
	"sensor_accuracy_pct = 1   # half-width of the noise band applied to y, in percent\n",
	"\n",
	"# X_train and y_train are converted to DataFrame to ease their replication.\n",
	"# Column labels come from X itself rather than from df_initial.columns[:-1], which\n",
	"# silently assumed that \"y\" is the last column of the dataset.\n",
	"X_train_df = pd.DataFrame(X_train, columns=X.columns)\n",
	"y_train_df = pd.DataFrame(y_train, columns=[\"y\"])\n",
	"\n",
	"# Augmented data creation, replicated from the original dataset\n",
	"augmented_X_train = pd.concat([X_train_df]*replication_factor, ignore_index=True)\n",
	"augmented_y_train = pd.concat([y_train_df]*replication_factor, ignore_index=True)\n",
	"\n",
	"# y values are modified randomly within the accuracy range: one independent factor per\n",
	"# replicated row, uniform in [1 - pct/100, 1 + pct/100). A bare np.random.random() call\n",
	"# returns a single scalar, which would rescale every target by the same amount.\n",
	"noise_factors = (100-sensor_accuracy_pct)/100 + 2 * sensor_accuracy_pct * np.random.random(len(augmented_y_train))/100\n",
	"augmented_y_train[\"y\"] = augmented_y_train[\"y\"] * noise_factors\n",
	"\n",
	"# Concatenation of initial and augmented datasets\n",
	"augmented_X_train = pd.concat([X_train_df,augmented_X_train], ignore_index=True)\n",
	"augmented_y_train = pd.concat([y_train_df,augmented_y_train], ignore_index=True)"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "8U5nvNJkO9uO",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"augmented_model = build_model()"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "48tsXu0xOm13",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"early_stop = keras.callbacks.ModelCheckpoint('best_augmented_model2.h5',\n",
	" monitor='val_loss',\n",
	" verbose=1,\n",
	" save_best_only=True,\n",
	" save_weights_only=False,\n",
	" mode='auto',\n",
	" save_freq='epoch')\n",
	"\n",
	"history = augmented_model.fit(augmented_X_train, augmented_y_train,\n",
	" epochs = 500,\n",
	" batch_size=10, \n",
	" validation_data=(X_test, y_test),\n",
	" callbacks=[early_stop, lr_callback],\n",
	" verbose=2,\n",
	" );"
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "Nxkf2y1T_H5O",
	"colab_type": "text"
	},
	"source": [
	"Best MAE obtained with initial_model: **0.04838.**\n",
	"\n",
	"Best MAE obtained with augmented_model (replication: 1): **0.04521.**\n",
	"\n",
	"Best MAE obtained with augmented_model (replication: 2): **0.04198.**\n",
	"\n",
	"Best MAE obtained with augmented_model (replication: 5): **0.04029.**\n",
	"\n",
	"Best MAE obtained with augmented_model (replication:10): **0.04620.**"
	]
	}
	]
	}