Skip to content

Instantly share code, notes, and snippets.

@sofianhamiti
Created February 14, 2021 17:12
Show Gist options
  • Save sofianhamiti/e860c17fdc3e04730cfe074dc624c645 to your computer and use it in GitHub Desktop.
Save sofianhamiti/e860c17fdc3e04730cfe074dc624c645 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Train a scikit-learn random forest model on the Boston housing dataset and predict"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Setup Environment"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Use %pip (not !pip) so packages are installed into the running kernel's environment.\n",
"%pip install -q -U pip\n",
"%pip install -q scikit-learn==0.24.1\n",
"# Pinned so the saved model is written and read with a known joblib release.\n",
"%pip install -q joblib==1.0.1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import joblib\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.datasets import load_boston\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"logging.basicConfig(level=logging.INFO)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare data\n",
"We load the Boston housing dataset from scikit-learn and split it into train (75%) and test (25%) sets."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Boston housing data: 75% of the rows for training, 25% held out (fixed seed).\n",
"data = load_boston()\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    data.data,\n",
"    data.target,\n",
"    test_size=0.25,\n",
"    random_state=42,\n",
")\n",
"\n",
"# Feature frames with the label attached as an extra 'target' column.\n",
"trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)\n",
"testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def generate_model(filename, n_estimators, min_samples_leaf, random_state=42):\n",
"    \"\"\"Train a random forest on the train split, log test-set errors and save the model.\n",
"\n",
"    Reads the notebook-level ``trainX`` and ``testX`` DataFrames; both must contain the\n",
"    13 Boston feature columns plus a ``target`` column.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    filename : str\n",
"        Output path without extension; the model is written to ``{filename}.pkl``.\n",
"    n_estimators : int\n",
"        Passed through to ``RandomForestRegressor(n_estimators=...)``.\n",
"    min_samples_leaf : int or float\n",
"        Passed through to ``RandomForestRegressor(min_samples_leaf=...)``.\n",
"    random_state : int or None, default 42\n",
"        Seed for the forest so repeated runs produce the same model and metrics.\n",
"        Pass ``None`` to get the previous unseeded behaviour.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"        Results are reported through ``logging`` and the saved file.\n",
"    \"\"\"\n",
"    logging.info('preparing train and test datasets')\n",
"    # Split the column list once and reuse it for both frames.\n",
"    feature_names = 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT'.split()\n",
"    X_train = trainX[feature_names]\n",
"    X_test = testX[feature_names]\n",
"    y_train = trainX['target']\n",
"    y_test = testX['target']\n",
"\n",
"    # TRAIN\n",
"    logging.info('training model')\n",
"    model = RandomForestRegressor(\n",
"        n_estimators=n_estimators,\n",
"        min_samples_leaf=min_samples_leaf,\n",
"        n_jobs=-1,\n",
"        random_state=random_state,\n",
"    )\n",
"\n",
"    model.fit(X_train, y_train)\n",
"\n",
"    # ABS ERROR AND COUPLE PERF METRICS\n",
"    logging.info('evaluating model')\n",
"    abs_err = np.abs(model.predict(X_test) - y_test)\n",
"\n",
"    for q in [10, 50, 90]:\n",
"        logging.info(f'AE-at-{q}th-percentile: {np.percentile(a=abs_err, q=q)}')\n",
"\n",
"    # SAVE MODEL\n",
"    logging.info(f'saving model binary: {filename}.pkl')\n",
"    joblib.dump(model, f'{filename}.pkl')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fit the forest and write it to random_forest.pkl.\n",
"generate_model(filename='random_forest', n_estimators=100, min_samples_leaf=3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Predict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def model_predict(filename, data):\n",
"    \"\"\"Load a saved model and return its predictions for ``data``.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    filename : str\n",
"        Path of the joblib file to load.\n",
"    data : array-like\n",
"        Either one sample (1-D: list, array or pandas Series of feature values)\n",
"        or a batch of samples (2-D, one row per sample).\n",
"\n",
"    Returns\n",
"    -------\n",
"    The value returned by ``model.predict`` for the given sample(s); it is also logged.\n",
"    \"\"\"\n",
"    # LOAD MODEL\n",
"    # NOTE: joblib.load unpickles the file, which can run arbitrary code - only load trusted files.\n",
"    model = joblib.load(filename)\n",
"    # PREDICT\n",
"    # A 1-D sample becomes a one-row batch (what the former [data] wrapping did);\n",
"    # 2-D input is passed through unchanged instead of being nested one level too deep.\n",
"    samples = np.atleast_2d(np.asarray(data))\n",
"    predictions = model.predict(samples)\n",
"    logging.info(f'predictions: {predictions}')\n",
"    return predictions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select the feature columns of the test frame and score its first row with the saved model.\n",
"features = 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT'\n",
"X_test = testX.loc[:, features.split()]\n",
"\n",
"model_predict(filename='random_forest.pkl', data=X_test.iloc[0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "conda_python3",
"language": "python",
"name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment