Created
February 14, 2021 17:12
-
-
Save sofianhamiti/e860c17fdc3e04730cfe074dc624c645 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Train a scikit-learn random forest on the Boston housing dataset and predict" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Setup Environment" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "!pip install -q -U pip\n", | |
| "!pip install -q scikit-learn==0.24.1\n", | |
| "!pip install -q joblib" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import logging\n", | |
| "import joblib\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "from sklearn.datasets import load_boston\n", | |
| "from sklearn.ensemble import RandomForestRegressor\n", | |
| "from sklearn.model_selection import train_test_split\n", | |
| "\n", | |
| "logging.basicConfig(level=logging.INFO)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Prepare data\n", | |
| "We load the Boston housing dataset from scikit-learn and split it into train (75%) and test (25%) sets." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# we use the Boston housing dataset \n", | |
| "data = load_boston()\n", | |
| "\n", | |
| "X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=42)\n", | |
| "\n", | |
| "trainX = pd.DataFrame(X_train, columns=data.feature_names)\n", | |
| "trainX['target'] = y_train\n", | |
| "\n", | |
| "testX = pd.DataFrame(X_test, columns=data.feature_names)\n", | |
| "testX['target'] = y_test" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Train" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def generate_model(filename, n_estimators, min_samples_leaf):\n", | |
| " logging.info('preparing train and test datasets')\n", | |
| " features = 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT'\n", | |
| " X_train = trainX[features.split()]\n", | |
| " X_test = testX[features.split()]\n", | |
| " y_train = trainX['target']\n", | |
| " y_test = testX['target']\n", | |
| "\n", | |
| " # TRAIN\n", | |
| " logging.info('training model')\n", | |
| " model = RandomForestRegressor(\n", | |
| " n_estimators=n_estimators,\n", | |
| " min_samples_leaf=min_samples_leaf,\n", | |
| " n_jobs=-1\n", | |
| " )\n", | |
| "\n", | |
| " model.fit(X_train, y_train)\n", | |
| "\n", | |
| " # ABS ERROR AND COUPLE PERF METRICS\n", | |
| " logging.info('evaluating model')\n", | |
| " abs_err = np.abs(model.predict(X_test) - y_test)\n", | |
| "\n", | |
| " for q in [10, 50, 90]:\n", | |
| " logging.info(f'AE-at-{q}th-percentile: {np.percentile(a=abs_err, q=q)}')\n", | |
| "\n", | |
| " # SAVE MODEL\n", | |
| " logging.info(f'saving model binary: {filename}.pkl')\n", | |
| " joblib.dump(model, f'{filename}.pkl')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# train model\n", | |
| "generate_model('random_forest', 100, 3)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Predict" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def model_predict(filename, data):\n", | |
| " # LOAD MODEL\n", | |
| " model = joblib.load(filename)\n", | |
| " # PREDICT\n", | |
| " predictions = model.predict([data])\n", | |
| " logging.info(f'predictions: {predictions}')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Load test data and predict on first row\n", | |
| "features = 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT'\n", | |
| "X_test = testX[features.split()]\n", | |
| "\n", | |
| "model_predict('random_forest.pkl', X_test.iloc[0])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "conda_python3", | |
| "language": "python", | |
| "name": "conda_python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.10" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment