Skip to content

Instantly share code, notes, and snippets.

@bmmalone
Created August 15, 2018 08:39
Show Gist options
  • Save bmmalone/db368c73e1c8c915f3d9f11056aba26e to your computer and use it in GitHub Desktop.
Save bmmalone/db368c73e1c8c915f3d9f11056aba26e to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import joblib\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"import sklearn.datasets\n",
"import sklearn.linear_model\n",
"import sklearn.metrics\n",
"import sklearn.model_selection\n",
"import sklearn.pipeline\n",
"import sklearn.preprocessing\n",
"\n",
"# extras\n",
"\n",
"# progress bar\n",
"import tqdm\n",
"\n",
"# make things look a little nicer\n",
"import seaborn as sns; sns.set(style='white', color_codes=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load the data"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((506, 13), (506,))"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X, y = sklearn.datasets.load_boston(return_X_y=True)\n",
"\n",
"X.shape, y.shape"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,\n",
" 6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,\n",
" 1.5300e+01, 3.9690e+02, 4.9800e+00],\n",
" [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,\n",
" 6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,\n",
" 1.7800e+01, 3.9690e+02, 9.1400e+00],\n",
" [2.7290e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,\n",
" 7.1850e+00, 6.1100e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,\n",
" 1.7800e+01, 3.9283e+02, 4.0300e+00],\n",
" [3.2370e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,\n",
" 6.9980e+00, 4.5800e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,\n",
" 1.8700e+01, 3.9463e+02, 2.9400e+00],\n",
" [6.9050e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,\n",
" 7.1470e+00, 5.4200e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,\n",
" 1.8700e+01, 3.9690e+02, 5.3300e+00]])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X[:5]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([24. , 21.6, 34.7, 33.4, 36.2])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create a helper to return our pipeline.\n",
"\n",
"We will use the same type of pipeline for all folds."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def _keep_columns(X, columns_to_keep):\n",
"    \"\"\"Return `X` restricted to the columns (second axis) in `columns_to_keep`.\"\"\"\n",
"    selected = X[:, columns_to_keep]\n",
"    return selected\n",
"\n",
"def get_pipeline(columns_to_keep):\n",
"    \"\"\"Build an unfitted pipeline: column selection, robust scaling, Bayesian ridge.\n",
"\n",
"    `columns_to_keep` is forwarded to `_keep_columns` as a keyword argument\n",
"    by the first (`FunctionTransformer`) step.\n",
"    \"\"\"\n",
"    selector_kwargs = {'columns_to_keep': columns_to_keep}\n",
"    steps = [\n",
"        (\n",
"            'column_selector',\n",
"            sklearn.preprocessing.FunctionTransformer(\n",
"                _keep_columns, kw_args=selector_kwargs\n",
"            ),\n",
"        ),\n",
"        ('scaler', sklearn.preprocessing.RobustScaler()),\n",
"        ('estimator', sklearn.linear_model.BayesianRidge()),\n",
"    ]\n",
"    return sklearn.pipeline.Pipeline(steps)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Split the data into 10 folds using cross-validation, train, and make predictions\n",
"\n",
"We will not use a validation set to optimize hyperparameters. We could further split the training set if we desired to create a validation set.\n",
"\n",
"For classification, we should use stratified cross-validation. This is a regression problem, though, and the sklearn implementation of stratified cross-validation (`StratifiedKFold`) does not work with continuous targets."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"num_folds = 10"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def get_fname(fold):\n",
"    \"\"\"Return the joblib pickle filename used for the given fold.\"\"\"\n",
"    return \"model-and-predictions.fold-{}.jpkl\".format(fold)\n",
"\n",
"def train_fold(X_train, X_test, y_train, y_test, fold, columns_to_keep=None):\n",
"    \"\"\"Fit the pipeline on one fold and save the model with its test predictions.\n",
"\n",
"    The fitted pipeline, the predictions for `X_test`, and `y_test` are\n",
"    written as one tuple to the joblib file named by `get_fname(fold)`.\n",
"    Nothing is returned.\n",
"\n",
"    `columns_to_keep` selects the feature columns passed to the pipeline;\n",
"    when omitted, the original fixed subset [1, 3, 5, 7, 9, 11] is used.\n",
"    \"\"\"\n",
"    if columns_to_keep is None:\n",
"        # default feature subset; we could read this from a file or something\n",
"        columns_to_keep = np.array([1, 3, 5, 7, 9, 11])\n",
"\n",
"    p = get_pipeline(columns_to_keep)\n",
"    p.fit(X_train, y_train)\n",
"    y_pred = p.predict(X_test)\n",
"\n",
"    # we could also compute summary statistics here, but I will just\n",
"    # save the results so we can do this later.\n",
"    fname = get_fname(fold)\n",
"\n",
"    # we could also extract other things and save it here. For example,\n",
"    # if column 0 in X contains some identity index, we could exclude it\n",
"    # using the `columns_to_keep` but still save it in the joblib pickle\n",
"    # file here\n",
"\n",
"    # right now, save the trained model, the predictions, and the true values\n",
"    joblib.dump((p, y_pred, y_test), fname)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 10/10 [00:00<00:00, 130.27it/s]\n"
]
}
],
"source": [
"# sequentially train models for each fold\n",
"#\n",
"# `shuffle=True` is required for `random_state` to have any effect; without it\n",
"# the seed is ignored (and recent scikit-learn versions raise a ValueError).\n",
"kf = sklearn.model_selection.KFold(\n",
"    n_splits=num_folds, shuffle=True, random_state=8675309\n",
")\n",
"for fold, (train, test) in tqdm.tqdm(enumerate(kf.split(X)), total=num_folds):\n",
"    X_train, X_test = X[train], X[test]\n",
"    y_train, y_test = y[train], y[test]\n",
"\n",
"    train_fold(X_train, X_test, y_train, y_test, fold)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Evaluate the results\n",
"\n",
"In particular, we want to look at the predictions for all instances in the dataset. However, we only want to look at a particular instance when it was in the testing set.\n",
"\n",
"We saved the predictions and true values in the pickle files, so we can just read them back in.\n",
"\n",
"In principle, we could also inspect other things, such as the properties of the learned models."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 10/10 [00:00<00:00, 321.83it/s]\n"
]
}
],
"source": [
"# gather the held-out predictions and the matching true values from every fold\n",
"all_y_pred = []\n",
"all_y_true = []\n",
"\n",
"for fold in tqdm.trange(num_folds):\n",
"    fname = get_fname(fold)\n",
"    # each pickle holds (fitted pipeline, predictions, true values)\n",
"    p, y_pred, y_test = joblib.load(fname)\n",
"\n",
"    all_y_pred += list(y_pred)\n",
"    all_y_true += list(y_test)\n",
"\n",
"# convert the accumulated lists to arrays for the metric and plot cells\n",
"all_y_pred = np.array(all_y_pred)\n",
"all_y_true = np.array(all_y_true)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"6.414604257746351"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# now, calculate RMSE\n",
"\n",
"# importantly, the lists include predictions for *each* sample,\n",
"# but only when that sample was in the test set\n",
"np.sqrt(sklearn.metrics.mean_squared_error(all_y_true, all_y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# scatter the held-out predictions against the true values\n",
"\n",
"# i just looked at the data to pick these axis limits\n",
"min_ = 0\n",
"max_ = 55\n",
"\n",
"fig, ax = plt.subplots()\n",
"\n",
"ax.scatter(all_y_pred, all_y_true, color='b', alpha=0.4)\n",
"\n",
"# dashed y = x reference line: points on it are predicted exactly\n",
"ax.plot((min_, max_), (min_, max_), ls='--', c='0.5')\n",
"\n",
"# same limits and equal aspect on both axes keep the reference line at 45 degrees\n",
"ax.set_xlim(min_, max_)\n",
"ax.set_ylim(min_, max_)\n",
"ax.set_aspect('equal')\n",
"\n",
"ax.set_xlabel(\"Predicted\", fontsize=20)\n",
"ax.set_ylabel(\"True\", fontsize=20)\n",
"\n",
"sns.despine()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "automl",
"language": "python",
"name": "automl"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment