Last active
March 3, 2018 08:07
-
-
Save DahlitzFlorian/f555aa1b9725a206a29da6883c97fe82 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Logistic Regression #" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "heading_collapsed": true | |
| }, | |
| "source": [ | |
| "## Data preparation and analysis ##" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Import the needed libraries and configure the plot styling for later visualization." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "\n", | |
| "from sklearn import preprocessing\n", | |
| "from sklearn.linear_model import LogisticRegression\n", | |
| "from sklearn.model_selection import train_test_split\n", | |
| "\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "import seaborn as sns\n", | |
| "\n", | |
| "plt.rc(\"font\", size=14)\n", | |
| "sns.set(style=\"white\")\n", | |
| "sns.set(style=\"whitegrid\", color_codes=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Read the sample data from the CSV file, drop rows with missing values, and print the shape (rows, columns) and the column names." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "(41188, 21)\n", | |
| "['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp_var_rate', 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "data = pd.read_csv(\"src/banking.csv\", header=0)\n", | |
| "data = data.dropna()\n", | |
| "print(data.shape)\n", | |
| "print(list(data.columns))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Show the first five rows of the dataframe." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 35, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " age job marital education default housing loan \\\n", | |
| "0 44 blue-collar married basic.4y unknown yes no \n", | |
| "1 53 technician married unknown no no no \n", | |
| "2 28 management single university.degree no yes no \n", | |
| "3 39 services married high.school no no no \n", | |
| "4 55 retired married basic.4y no yes no \n", | |
| "\n", | |
| " contact month day_of_week ... campaign pdays previous poutcome \\\n", | |
| "0 cellular aug thu ... 1 999 0 nonexistent \n", | |
| "1 cellular nov fri ... 1 999 0 nonexistent \n", | |
| "2 cellular jun thu ... 3 6 2 success \n", | |
| "3 cellular apr fri ... 2 999 0 nonexistent \n", | |
| "4 cellular aug fri ... 1 3 1 success \n", | |
| "\n", | |
| " emp_var_rate cons_price_idx cons_conf_idx euribor3m nr_employed y \n", | |
| "0 1.4 93.444 -36.1 4.963 5228.1 0 \n", | |
| "1 -0.1 93.200 -42.0 4.021 5195.8 0 \n", | |
| "2 -1.7 94.055 -39.8 0.729 4991.6 1 \n", | |
| "3 -1.8 93.075 -47.1 1.405 5099.1 0 \n", | |
| "4 -2.9 92.201 -31.4 0.869 5076.2 1 \n", | |
| "\n", | |
| "[5 rows x 21 columns]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(data.head())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Show the unique values of the column \"education\"." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 36, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array(['basic.4y', 'unknown', 'university.degree', 'high.school',\n", | |
| " 'basic.9y', 'professional.course', 'basic.6y', 'illiterate'],\n", | |
| " dtype=object)" | |
| ] | |
| }, | |
| "execution_count": 36, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data[\"education\"].unique()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "The column \"education\" has three values that are quite similar:\n", | |
| "\n", | |
| " - basic.9y\n", | |
| " - basic.6y\n", | |
| " - basic.4y\n", | |
| "\n", | |
| "The following lines replace them with the value <i>Basic</i> to group them." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 37, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "data['education']=np.where(data['education'] =='basic.9y', 'Basic', data['education'])\n", | |
| "data['education']=np.where(data['education'] =='basic.6y', 'Basic', data['education'])\n", | |
| "data['education']=np.where(data['education'] =='basic.4y', 'Basic', data['education'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Having another look at the unique values of the \"education\" column to see the changes." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array(['Basic', 'unknown', 'university.degree', 'high.school',\n", | |
| " 'professional.course', 'illiterate'], dtype=object)" | |
| ] | |
| }, | |
| "execution_count": 38, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data[\"education\"].unique()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Count the values of the \"y\" column (in this case the dependent variable) and display them." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 39, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0 36548\n", | |
| "1 4640\n", | |
| "Name: y, dtype: int64" | |
| ] | |
| }, | |
| "execution_count": 39, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data[\"y\"].value_counts()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Visualize the class counts from the previous cell using a count plot (bar chart)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 40, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<matplotlib.axes._subplots.AxesSubplot at 0x116d4ffd0>" | |
| ] | |
| }, | |
| "execution_count": 40, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| }, | |
| { | |
| "data": { | |
| "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAEFCAYAAAAfRLtkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAFjpJREFUeJzt3X9MVff9x/HXuRfcLJdbdmPWSawKzsagUwM3sH2DpJnaa00a7abDH6GLmM6aiiFbDUrlosOKxgwTxV/hL7Nl01IXQ7KlXSU6AjqYN0EDZm3/cKwKbWeIkXvbiXDO94/FuzGhfgjeexGej7+8x8+F901O7vOewz33Wo7jOAIAwIAr0QMAAJ4eRAMAYIxoAACMEQ0AgDGiAQAwlpToAWItFAolegQAeCrl5OQ8sm3CR0Ma/oEDAEY20gtuTk8BAIwRDQCAMaIBADBGNAAAxogGAMAY0QAAGCMaAABjRAMAYIxoAACMTYorwsfq6vY3Ej0Cxhn/kZOJHgFICI40AADGiAYAwBjRAAAYIxoAAGNEAwBgLGbvnhocHNTu3bt18+ZNWZalvXv3amBgQFu2bNHs2bMlSevXr9fKlStVW1urS5cuKSkpSeXl5Vq4cKG6urq0c+dOWZaluXPnqrKyUi6Xa9i1AID4iFk0Ll68KEk6c+aMWltbdfjwYf3whz/Upk2bVFxcHF3X2dmptrY21dfXq6enRyUlJTp37pyqq6tVWlqqvLw8BYNBNTY2Kj09fdi1AID4iFk0li1bphdffFGS1N3dLa/Xq46ODt28eVONjY2aNWuWysvLFQqFlJ+fL8uylJ6ersHBQfX29qqzs1O5ubmSpIKCArW0tCgjI2PYtT6f72tn4Stf8aSxT2GyiunFfUlJSSorK9OHH36oI0eO6PPPP9fatWu1YMECnThxQseOHVNqaqrS0tKi90lJSVFfX58cx5FlWUO2hcPhYdc+Lhpj/brXq6frxnR/TDx8hTAmuoR93evBgwf1wQcfqKKiQvn5+VqwYIEkafny5bpx44Y8Ho8ikUh0fSQSUWpqqlwu15BtXq93xLUAgPiIWTTOnz+vU6dOSZKmTp0qy7K0bds2Xb9+XZJ05coVzZ8/X9nZ2WpubpZt2+ru7pZt2/L5fMrKylJra6skqampSX6/f8S1AID4iNnpqZdeekm7du3Sxo0bNTAwoPLyck2fPl1VVVVKTk7WtGnTVFVVJY/HI7/fr8LCQtm2rWAwKEkqKytTRUWFampqlJmZqUAgILfbPexaAEB8WI7jOIkeIpZCodDY/6bBBxbif/CBhZjoRnru5OI+AIAxogEAMEY0AADGiAYAwBjRAAAYIxoAAGNEAwBgjGgAAIwRDQCAMaIBADBGNAAAxogGAMAY0QAAGCMaAABjRAMAYIxoAACMEQ0AgDGiAQAwRjQAAMaIBgDAWFKsfvDg4KB2796tmzdvyrIs7d27V9/4xje0c+dOWZaluXPnqrKyUi6XS7W1tbp06ZKSkpJUXl6uhQsXqqury3gtACA+YhaNixcvSpLOnDmj1tZWHT58WI7jqLS0VHl5eQoGg2psbFR6erra2tpUX1+vnp4elZSU6Ny5c6qurjZeCwCIj5hFY9myZXrxxRclSd3d3fJ6vbp8+bJyc3MlSQUFBWppaVFGRoby8/NlWZbS09M1ODio3t5edXZ2Gq/1+XyxehgAgP8Ss2hIUlJSksrKyvThhx/qyJEjamlpkWVZkqSUlBT19fUpHA4rLS0tep+H2x3HMV77uGiEQqEYPDpMZuxTmKxiGg1JOnjwoN566y395Cc/0f3796PbI5GIvF6vPB6PIpHIkO2pqalyuVzGax8nJydnTI/h6um6Md0fE89Y9ylgvBvphVHM3j11/vx5nTp1SpI0depUWZalBQsWqLW1VZLU1NQkv9+v7OxsNTc3y7ZtdXd3y7Zt+Xw+ZWVlGa8FAMRHzI40
XnrpJe3atUsbN27UwMCAysvLNWfOHFVUVKimpkaZmZkKBAJyu93y+/0qLCyUbdsKBoOSpLKyMuO1AID4sBzHcRI9RCyFQqGxn57a/sYTmgYThf/IyUSPAMTUSM+dXNwHADBGNAAAxogGAMAY0QAAGCMaAABjRAMAYIxoAACMEQ0AgDGiAQAwRjQAAMaIBgDAGNEAABgjGgAAY0QDAGCMaAAAjBENAIAxogEAMEY0AADGiAYAwBjRAAAYS4rFD33w4IHKy8t1+/Zt9ff3a+vWrZo+fbq2bNmi2bNnS5LWr1+vlStXqra2VpcuXVJSUpLKy8u1cOFCdXV1aefOnbIsS3PnzlVlZaVcLtewawEA8ROTaDQ0NCgtLU2HDh3S3bt3tXr1ar355pvatGmTiouLo+s6OzvV1tam+vp69fT0qKSkROfOnVN1dbVKS0uVl5enYDCoxsZGpaenD7sWABA/MYnGihUrFAgEJEmO48jtdqujo0M3b95UY2OjZs2apfLycoVCIeXn58uyLKWnp2twcFC9vb3q7OxUbm6uJKmgoEAtLS3KyMgYdq3P54vFQwAADCMm0UhJSZEkhcNhbd++XaWlperv79fatWu1YMECnThxQseOHVNqaqrS0tKG3K+vr0+O48iyrCHbwuHwsGtNohEKhZ7wI8Rkxz6FySom0ZCknp4evfnmm9qwYYNeeeUV3bt3T16vV5K0fPlyVVVVaenSpYpEItH7RCIRpaamyuVyDdnm9Xrl8XiGXWsiJydnTI/l6um6Md0fE89Y9ylgvBvphVFM3j11584dFRcXa8eOHVqzZo0kafPmzbp+/bok6cqVK5o/f76ys7PV3Nws27bV3d0t27bl8/mUlZWl1tZWSVJTU5P8fv+IawEA8ROTI42TJ0/q3r17On78uI4fPy5J2rlzp/bv36/k5GRNmzZNVVVV8ng88vv9KiwslG3bCgaDkqSysjJVVFSopqZGmZmZCgQCcrvdw64FAMSP5TiOk+ghYikUCo399NT2N57QNJgo/EdOJnoEIKZGeu7k4j4AgDGiAQAwRjQAAMaIBgDAGNEAABgjGgAAY0QDAGCMaAAAjBENAIAxogEAMEY0AADGiAYAwBjRAAAYIxoAAGNEAwBgjGgAAIwRDQCAMaNoVFVVPbKtrKzsiQ8DABjfvvY7wt9++219+umn6ujo0CeffBLdPjAwoL6+vpgPBwAYX742Glu3btXt27f1zjvvaNu2bdHtbrdbc+bMiflwAIDx5WujMWPGDM2YMUMNDQ0Kh8Pq6+uT4ziSpC+//FJpaWlxGRIAMD58bTQeOnXqlE6dOjUkEpZlqbGxcdj1Dx48UHl5uW7fvq3+/n5t3bpV3/3ud7Vz505ZlqW5c+eqsrJSLpdLtbW1unTpkpKSklReXq6FCxeqq6vLeC0AIH6MolFfX68LFy7I5/MZ/dCGhgalpaXp0KFDunv3rlavXq158+aptLRUeXl5CgaDamxsVHp6utra2lRfX6+enh6VlJTo3Llzqq6uNl4LAIgfo2hMnz5dzz77rPEPXbFihQKBgCTJcRy53W51dnYqNzdXklRQUKCWlhZlZGQoPz9flmUpPT1dg4OD6u3tHdVak5CFQiHj2QET7FOYrIyiMXv2bG3YsEF5eXmaMmVKdPt//3H8v6WkpEiSwuGwtm/frtLSUh08eFCWZUX/v6+vT+FweMgpr4fbHccxXmsSjZycHJOHOaKrp+vGdH9MPGPdp4DxbqQXRkbXaTz33HNasmTJkGA8Tk9Pj1577TWtWrVKr7zyilyu//yqSCQir9crj8ejSCQyZHtqauqo1gIA4sfoSGOkI4qR3LlzR8XFxQoGg/rBD34gScrKylJra6vy8vLU1NSk73//+5o5c6YOHTqkzZs367PPPpNt2/L5fKNaCwCIH6NozJs3L3q66KFvf/vb+vOf/zzs+pMnT+revXs6fvy4jh8/LunfFwru27dPNTU1yszMVCAQkNvtlt/v
V2FhoWzbVjAYlPTvq80rKiqM1gIA4sdyHl54YejBgwe6cOGC2tvbtWvXrljN9cSEQqGx/01j+xtPaBpMFP4jJxM9AhBTIz13jvoDC5OTk/Xyyy/rL3/5yxMZDADw9DA6PXX+/Pnovx3H0SeffKLk5OSYDQUAGJ+MotHa2jrk9re+9S0dPnw4JgMBAMYvo2hUV1frwYMHunnzpgYHBzV37lwlJRndFQAwgRg983d0dGj79u1KS0uTbdu6c+eOjh07pkWLFsV6PgDAOGIUjX379unw4cPRSLS3t6uqqkrvvfdeTIcDAIwvRu+e+vLLL4ccVSxevFj379+P2VAAgPHJKBrPPvusLly4EL194cIFvksDACYho9NTVVVV2rJli95+++3otjNnzsRsKADA+GR0pNHU1KSpU6fq4sWLOn36tHw+n9ra2mI9GwBgnDGKxrvvvqvf/e53euaZZzRv3jz9/ve/129+85tYzwYAGGeMovHgwYMhV4BzNTgATE5Gf9NYtmyZfvrTn+rll1+WJP3pT3/S0qVLYzoYAGD8MYrGjh079P777+uvf/2rkpKS9Nprr2nZsmWxng0AMM4YfxbIihUrtGLFiljOAgAY50b90egAgMmLaAAAjBENAIAxogEAMEY0AADGYhqNa9euqaioSJJ048YNLVmyREVFRSoqKtIf//hHSVJtba3WrFmjdevW6fr165Kkrq4urV+/Xhs2bFBlZaVs2x5xLQAgfmL29Xt1dXVqaGjQ1KlTJUmdnZ3atGmTiouLo2s6OzvV1tam+vp69fT0qKSkROfOnVN1dbVKS0uVl5enYDCoxsZGpaenD7sWABA/MTvSmDlzpo4ePRq93dHRoUuXLmnjxo0qLy9XOBxWKBRSfn6+LMtSenq6BgcH1dvbq87OTuXm5kqSCgoKdPny5RHXAgDiJ2ZHGoFAQLdu3YreXrhwodauXasFCxboxIkTOnbsmFJTU4d8L0dKSor6+vrkOI4syxqyLRwOD7vW5/M9dpZQKPQEHxnAPoXJK2bR+F/Lly+X1+uN/ruqqkpLly5VJBKJrolEIkpNTZXL5Rqyzev1yuPxDLvWRE5Ozphmv3q6bkz3x8Qz1n0KGO9GemEUt3dPbd68OfrH6ytXrmj+/PnKzs5Wc3OzbNtWd3e3bNuWz+dTVlaWWltbJf37uzz8fv+IawEA8RO3I409e/aoqqpKycnJmjZtmqqqquTxeOT3+1VYWCjbthUMBiVJZWVlqqioUE1NjTIzMxUIBOR2u4ddCwCIH8txHCfRQ8RSKBQa++mp7W88oWkwUfiPnEz0CEBMjfTcycV9AABjRAMAYIxoAACMEQ0AgDGiAQAwRjQAAMaIBgDAGNEAABgjGgAAY0QDAGCMaAAAjBENAIAxogEAMEY0AADGiAYAwBjRAAAYIxoAAGNEAwBgjGgAAIwRDQCAsZhG49q1ayoqKpIkdXV1af369dqwYYMqKytl27Ykqba2VmvWrNG6det0/fr1Ua8FAMRPzKJRV1en3bt36/79+5Kk6upqlZaW6re//a0cx1FjY6M6OzvV1tam+vp61dTUaO/evaNeCwCIn5hFY+bMmTp69Gj0dmdnp3JzcyVJBQUFunz5skKhkPLz82VZltLT0zU4OKje3t5RrQUAxE9SrH5wIBDQrVu3orcdx5FlWZKklJQU9fX1KRwOKy0tLbrm4fbRrPX5fI+dJRQKPamHBUhin8LkFbNo/C+X6z8HNZFIRF6vVx6PR5FIZMj21NTUUa01kZOTM6bZr56uG9P9MfGMdZ8CxruRXhjF7d1TWVlZam1tlSQ1NTXJ7/crOztbzc3Nsm1b3d3dsm1bPp9vVGsBAPETtyONsrIyVVRUqKamRpmZmQoEAnK73fL7/SosLJRt2woGg6NeCwCIH8txHCfRQ8RSKBQa++mp7W88oWkwUfiPnEz0CEBMjfTcycV9AABjRAMAYIxoAACMEQ0AgDGiAQAwRjQAAMaIBgDAGNEA
ABgjGgAAY0QDAGCMaAAAjBENAIAxogEAMEY0AADGiAYAwBjRAAAYIxoAAGNEAwBgjGgAAIwRDQCAMaIBADCWFO9f+Oqrr8rj8UiSZsyYocLCQr3zzjtyu93Kz8/Xtm3bZNu29uzZo48++khTpkzRvn37NGvWLLW3tz+yFgAQP3GNxv379+U4jn79619Ht61atUpHjx7V888/r5/97Ge6ceOGbt26pf7+fp09e1bt7e06cOCATpw4ocrKykfWZmVlxfMhAMCkFtdo/O1vf9NXX32l4uJiDQwMqKSkRP39/Zo5c6YkKT8/X5cvX9Y///lPLVmyRJK0ePFidXR0KBwOD7vWJBqhUCh2DwqTEvsUJqu4RuOb3/ymNm/erLVr1+rvf/+7Xn/9dXm93uj/p6Sk6NNPP1U4HI6ewpIkt9v9yLaHa03k5OSMae6rp+vGdH9MPGPdp4DxbqQXRnGNRkZGhmbNmiXLspSRkaHU1FTdvXs3+v+RSERer1f/+te/FIlEottt25bH4xmy7eFaAED8xDUa7733nj7++GPt2bNHn3/+ub766is988wz+sc//qHnn39ezc3N2rZtmz777DNdvHhRK1euVHt7u1544QV5PB4lJyc/shaYzN64fDXRI2AcOvl//pj97LhGY82aNdq1a5fWr18vy7K0f/9+uVwuvfXWWxocHFR+fr4WLVqk733ve2ppadG6devkOI72798vSdq7d+8jawEA8RPXaEyZMkW/+tWvHtn+7rvvDrntcrn0y1/+8pF1ixcvfmQtACB+uLgPAGCMaAAAjBENAIAxogEAMEY0AADGiAYAwBjRAAAYIxoAAGNEAwBgjGgAAIwRDQCAMaIBADBGNAAAxogGAMAY0QAAGCMaAABjRAMAYIxoAACMEQ0AgDGiAQAwlpToAUbLtm3t2bNHH330kaZMmaJ9+/Zp1qxZiR4LACaFp+5I48KFC+rv79fZs2f1i1/8QgcOHEj0SAAwaTx10QiFQlqyZIkkafHixero6EjwRAAweTx1p6fC4bA8Hk/0ttvt1sDAgJKSRn4ooVBoTL/T+unrY7o/Jp6x7lNPyuvfsBI9AsahWO6fT100PB6PIpFI9LZt218bjJycnHiMBQCTwlN3eio7O1tNTU2SpPb2dr3wwgsJnggAJg/LcRwn0UOMxsN3T3388cdyHEf79+/XnDlzEj0WAEwKT100AACJ89SdngIAJA7RAAAYIxoAAGNEA49l27aCwaAKCwtVVFSkrq6uRI8EDHHt2jUVFRUleoxJ4am7TgPx998f3dLe3q4DBw7oxIkTiR4LkCTV1dWpoaFBU6dOTfQokwJHGngsProF49nMmTN19OjRRI8xaRANPNZIH90CjAeBQOBrPxUCTxbRwGON9qNbAExcRAOPxUe3AHiIl4t4rOXLl6ulpUXr1q2LfnQLgMmJjxEBABjj9BQAwBjRAAAYIxoAAGNEAwBgjGgAAIwRDQCAMaIBADBGNIA427Fjh86ePRu9XVRUpGvXriVwIsAc0QDi7Mc//rEaGhokSbdv31Zvb68WLVqU4KkAM0QDiLO8vDx98cUXunXrls6fP69Vq1YleiTAGNEA4syyLK1evVp/+MMf9P777xMNPFWIBpAAP/rRj3TmzBl95zvf0XPPPZfocQBjRANIgOnTp2v69Ol69dVXEz0KMCp8NDoQZ47j6IsvvtCdO3e0bNmyRI8DjApHGkCcffDBB1q1apV+/vOfa8qUKYkeBxgVvk8DAGCMIw0AgDGiAQAwRjQAAMaIBgDAGNEAABj7f3VxLgrSmnlHAAAAAElFTkSuQmCC\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x1169bdf98>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "sns.countplot(x=\"y\", data=data, palette=\"hls\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Group the rows by the value of the \"y\" column (0, 1) and calculate the mean of each numeric column." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>age</th>\n", | |
| " <th>duration</th>\n", | |
| " <th>campaign</th>\n", | |
| " <th>pdays</th>\n", | |
| " <th>previous</th>\n", | |
| " <th>emp_var_rate</th>\n", | |
| " <th>cons_price_idx</th>\n", | |
| " <th>cons_conf_idx</th>\n", | |
| " <th>euribor3m</th>\n", | |
| " <th>nr_employed</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>y</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>39.911185</td>\n", | |
| " <td>220.844807</td>\n", | |
| " <td>2.633085</td>\n", | |
| " <td>984.113878</td>\n", | |
| " <td>0.132374</td>\n", | |
| " <td>0.248875</td>\n", | |
| " <td>93.603757</td>\n", | |
| " <td>-40.593097</td>\n", | |
| " <td>3.811491</td>\n", | |
| " <td>5176.166600</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>40.913147</td>\n", | |
| " <td>553.191164</td>\n", | |
| " <td>2.051724</td>\n", | |
| " <td>792.035560</td>\n", | |
| " <td>0.492672</td>\n", | |
| " <td>-1.233448</td>\n", | |
| " <td>93.354386</td>\n", | |
| " <td>-39.789784</td>\n", | |
| " <td>2.123135</td>\n", | |
| " <td>5095.115991</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " age duration campaign pdays previous emp_var_rate \\\n", | |
| "y \n", | |
| "0 39.911185 220.844807 2.633085 984.113878 0.132374 0.248875 \n", | |
| "1 40.913147 553.191164 2.051724 792.035560 0.492672 -1.233448 \n", | |
| "\n", | |
| " cons_price_idx cons_conf_idx euribor3m nr_employed \n", | |
| "y \n", | |
| "0 93.603757 -40.593097 3.811491 5176.166600 \n", | |
| "1 93.354386 -39.789784 2.123135 5095.115991 " | |
| ] | |
| }, | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data.groupby(\"y\").mean()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Do the same as before, but this time group by the \"job\" column instead of the \"y\" column." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 42, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>age</th>\n", | |
| " <th>duration</th>\n", | |
| " <th>campaign</th>\n", | |
| " <th>pdays</th>\n", | |
| " <th>previous</th>\n", | |
| " <th>emp_var_rate</th>\n", | |
| " <th>cons_price_idx</th>\n", | |
| " <th>cons_conf_idx</th>\n", | |
| " <th>euribor3m</th>\n", | |
| " <th>nr_employed</th>\n", | |
| " <th>y</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>job</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>admin.</th>\n", | |
| " <td>38.187296</td>\n", | |
| " <td>254.312128</td>\n", | |
| " <td>2.623489</td>\n", | |
| " <td>954.319229</td>\n", | |
| " <td>0.189023</td>\n", | |
| " <td>0.015563</td>\n", | |
| " <td>93.534054</td>\n", | |
| " <td>-40.245433</td>\n", | |
| " <td>3.550274</td>\n", | |
| " <td>5164.125350</td>\n", | |
| " <td>0.129726</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>blue-collar</th>\n", | |
| " <td>39.555760</td>\n", | |
| " <td>264.542360</td>\n", | |
| " <td>2.558461</td>\n", | |
| " <td>985.160363</td>\n", | |
| " <td>0.122542</td>\n", | |
| " <td>0.248995</td>\n", | |
| " <td>93.656656</td>\n", | |
| " <td>-41.375816</td>\n", | |
| " <td>3.771996</td>\n", | |
| " <td>5175.615150</td>\n", | |
| " <td>0.068943</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>entrepreneur</th>\n", | |
| " <td>41.723214</td>\n", | |
| " <td>263.267857</td>\n", | |
| " <td>2.535714</td>\n", | |
| " <td>981.267170</td>\n", | |
| " <td>0.138736</td>\n", | |
| " <td>0.158723</td>\n", | |
| " <td>93.605372</td>\n", | |
| " <td>-41.283654</td>\n", | |
| " <td>3.791120</td>\n", | |
| " <td>5176.313530</td>\n", | |
| " <td>0.085165</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>housemaid</th>\n", | |
| " <td>45.500000</td>\n", | |
| " <td>250.454717</td>\n", | |
| " <td>2.639623</td>\n", | |
| " <td>960.579245</td>\n", | |
| " <td>0.137736</td>\n", | |
| " <td>0.433396</td>\n", | |
| " <td>93.676576</td>\n", | |
| " <td>-39.495283</td>\n", | |
| " <td>4.009645</td>\n", | |
| " <td>5179.529623</td>\n", | |
| " <td>0.100000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>management</th>\n", | |
| " <td>42.362859</td>\n", | |
| " <td>257.058140</td>\n", | |
| " <td>2.476060</td>\n", | |
| " <td>962.647059</td>\n", | |
| " <td>0.185021</td>\n", | |
| " <td>-0.012688</td>\n", | |
| " <td>93.522755</td>\n", | |
| " <td>-40.489466</td>\n", | |
| " <td>3.611316</td>\n", | |
| " <td>5166.650513</td>\n", | |
| " <td>0.112175</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>retired</th>\n", | |
| " <td>62.027326</td>\n", | |
| " <td>273.712209</td>\n", | |
| " <td>2.476744</td>\n", | |
| " <td>897.936047</td>\n", | |
| " <td>0.327326</td>\n", | |
| " <td>-0.698314</td>\n", | |
| " <td>93.430786</td>\n", | |
| " <td>-38.573081</td>\n", | |
| " <td>2.770066</td>\n", | |
| " <td>5122.262151</td>\n", | |
| " <td>0.252326</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>self-employed</th>\n", | |
| " <td>39.949331</td>\n", | |
| " <td>264.142153</td>\n", | |
| " <td>2.660802</td>\n", | |
| " <td>976.621393</td>\n", | |
| " <td>0.143561</td>\n", | |
| " <td>0.094159</td>\n", | |
| " <td>93.559982</td>\n", | |
| " <td>-40.488107</td>\n", | |
| " <td>3.689376</td>\n", | |
| " <td>5170.674384</td>\n", | |
| " <td>0.104856</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>services</th>\n", | |
| " <td>37.926430</td>\n", | |
| " <td>258.398085</td>\n", | |
| " <td>2.587805</td>\n", | |
| " <td>979.974049</td>\n", | |
| " <td>0.154951</td>\n", | |
| " <td>0.175359</td>\n", | |
| " <td>93.634659</td>\n", | |
| " <td>-41.290048</td>\n", | |
| " <td>3.699187</td>\n", | |
| " <td>5171.600126</td>\n", | |
| " <td>0.081381</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>student</th>\n", | |
| " <td>25.894857</td>\n", | |
| " <td>283.683429</td>\n", | |
| " <td>2.104000</td>\n", | |
| " <td>840.217143</td>\n", | |
| " <td>0.524571</td>\n", | |
| " <td>-1.408000</td>\n", | |
| " <td>93.331613</td>\n", | |
| " <td>-40.187543</td>\n", | |
| " <td>1.884224</td>\n", | |
| " <td>5085.939086</td>\n", | |
| " <td>0.314286</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>technician</th>\n", | |
| " <td>38.507638</td>\n", | |
| " <td>250.232241</td>\n", | |
| " <td>2.577339</td>\n", | |
| " <td>964.408127</td>\n", | |
| " <td>0.153789</td>\n", | |
| " <td>0.274566</td>\n", | |
| " <td>93.561471</td>\n", | |
| " <td>-39.927569</td>\n", | |
| " <td>3.820401</td>\n", | |
| " <td>5175.648391</td>\n", | |
| " <td>0.108260</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>unemployed</th>\n", | |
| " <td>39.733728</td>\n", | |
| " <td>249.451677</td>\n", | |
| " <td>2.564103</td>\n", | |
| " <td>935.316568</td>\n", | |
| " <td>0.199211</td>\n", | |
| " <td>-0.111736</td>\n", | |
| " <td>93.563781</td>\n", | |
| " <td>-40.007594</td>\n", | |
| " <td>3.466583</td>\n", | |
| " <td>5157.156509</td>\n", | |
| " <td>0.142012</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>unknown</th>\n", | |
| " <td>45.563636</td>\n", | |
| " <td>239.675758</td>\n", | |
| " <td>2.648485</td>\n", | |
| " <td>938.727273</td>\n", | |
| " <td>0.154545</td>\n", | |
| " <td>0.357879</td>\n", | |
| " <td>93.718942</td>\n", | |
| " <td>-38.797879</td>\n", | |
| " <td>3.949033</td>\n", | |
| " <td>5172.931818</td>\n", | |
| " <td>0.112121</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " age duration campaign pdays previous \\\n", | |
| "job \n", | |
| "admin. 38.187296 254.312128 2.623489 954.319229 0.189023 \n", | |
| "blue-collar 39.555760 264.542360 2.558461 985.160363 0.122542 \n", | |
| "entrepreneur 41.723214 263.267857 2.535714 981.267170 0.138736 \n", | |
| "housemaid 45.500000 250.454717 2.639623 960.579245 0.137736 \n", | |
| "management 42.362859 257.058140 2.476060 962.647059 0.185021 \n", | |
| "retired 62.027326 273.712209 2.476744 897.936047 0.327326 \n", | |
| "self-employed 39.949331 264.142153 2.660802 976.621393 0.143561 \n", | |
| "services 37.926430 258.398085 2.587805 979.974049 0.154951 \n", | |
| "student 25.894857 283.683429 2.104000 840.217143 0.524571 \n", | |
| "technician 38.507638 250.232241 2.577339 964.408127 0.153789 \n", | |
| "unemployed 39.733728 249.451677 2.564103 935.316568 0.199211 \n", | |
| "unknown 45.563636 239.675758 2.648485 938.727273 0.154545 \n", | |
| "\n", | |
| " emp_var_rate cons_price_idx cons_conf_idx euribor3m \\\n", | |
| "job \n", | |
| "admin. 0.015563 93.534054 -40.245433 3.550274 \n", | |
| "blue-collar 0.248995 93.656656 -41.375816 3.771996 \n", | |
| "entrepreneur 0.158723 93.605372 -41.283654 3.791120 \n", | |
| "housemaid 0.433396 93.676576 -39.495283 4.009645 \n", | |
| "management -0.012688 93.522755 -40.489466 3.611316 \n", | |
| "retired -0.698314 93.430786 -38.573081 2.770066 \n", | |
| "self-employed 0.094159 93.559982 -40.488107 3.689376 \n", | |
| "services 0.175359 93.634659 -41.290048 3.699187 \n", | |
| "student -1.408000 93.331613 -40.187543 1.884224 \n", | |
| "technician 0.274566 93.561471 -39.927569 3.820401 \n", | |
| "unemployed -0.111736 93.563781 -40.007594 3.466583 \n", | |
| "unknown 0.357879 93.718942 -38.797879 3.949033 \n", | |
| "\n", | |
| " nr_employed y \n", | |
| "job \n", | |
| "admin. 5164.125350 0.129726 \n", | |
| "blue-collar 5175.615150 0.068943 \n", | |
| "entrepreneur 5176.313530 0.085165 \n", | |
| "housemaid 5179.529623 0.100000 \n", | |
| "management 5166.650513 0.112175 \n", | |
| "retired 5122.262151 0.252326 \n", | |
| "self-employed 5170.674384 0.104856 \n", | |
| "services 5171.600126 0.081381 \n", | |
| "student 5085.939086 0.314286 \n", | |
| "technician 5175.648391 0.108260 \n", | |
| "unemployed 5157.156509 0.142012 \n", | |
| "unknown 5172.931818 0.112121 " | |
| ] | |
| }, | |
| "execution_count": 42, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data.groupby(\"job\").mean()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "heading_collapsed": true | |
| }, | |
| "source": [ | |
| "## Data Visualization ##" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Display, for each job, how many customers did (y = 1) and did not (y = 0) purchase a term deposit." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 43, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x116d5f518>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "%matplotlib inline\n", | |
| "pd.crosstab(data.job,data.y).plot(kind='bar')\n", | |
| "plt.title('Purchase Frequency for Job Title')\n", | |
| "plt.xlabel('Job')\n", | |
| "plt.ylabel('Frequency of Purchase')\n", | |
| "plt.savefig('purchase_fre_job')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "The frequency of purchase of the deposit depends a great deal on the job title. Thus, the job title can be a good predictor of the outcome variable." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "The next few lines compare the marital status values with the purchase outcome in a stacked bar chart." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 44, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x1167572b0>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "table=pd.crosstab(data.marital,data.y)\n", | |
| "table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)\n", | |
| "plt.title('Stacked Bar Chart of Marital Status vs Purchase')\n", | |
| "plt.xlabel('Marital Status')\n", | |
| "plt.ylabel('Proportion of Customers')\n", | |
| "plt.savefig('mariral_vs_pur_stack')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "The marital status does not seem to be a strong predictor for the outcome variable." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "The following lines create a bar chart comparing education and purchase." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 45, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x1166fb9b0>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "table=pd.crosstab(data.education,data.y)\n", | |
| "table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)\n", | |
| "plt.title('Stacked Bar Chart of Education vs Purchase')\n", | |
| "plt.xlabel('Education')\n", | |
| "plt.ylabel('Proportion of Customers')\n", | |
| "plt.savefig('edu_vs_pur_stack')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Education seems to be a good predictor of the outcome variable." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Now, let's check whether the day of the week has an impact on our dependent variable y." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 46, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x11938f5c0>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "pd.crosstab(data.day_of_week,data.y).plot(kind='bar')\n", | |
| "plt.title('Purchase Frequency for Day of Week')\n", | |
| "plt.xlabel('Day of Week')\n", | |
| "plt.ylabel('Frequency of Purchase')\n", | |
| "plt.savefig('pur_dayofweek_bar')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Day of week may not be a good predictor of the outcome." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "After looking at the day of order, let's have a look at the month." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 47, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x11643d160>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "pd.crosstab(data.month,data.y).plot(kind='bar')\n", | |
| "plt.title('Purchase Frequency for Month')\n", | |
| "plt.xlabel('Month')\n", | |
| "plt.ylabel('Frequency of Purchase')\n", | |
| "plt.savefig('pur_fre_month_bar')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Month might be a good predictor of the outcome variable." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "The following lines create a histogram of the customers' age, showing which age groups occur most frequently in the dataset." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 48, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x116538f28>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "data.age.hist()\n", | |
| "plt.title('Histogram of Age')\n", | |
| "plt.xlabel('Age')\n", | |
| "plt.ylabel('Frequency')\n", | |
| "plt.savefig('hist_age')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Most of the customers of the bank in this dataset are in the age range of 30–40." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Last but not least we have a look at the purchase frequency for the previous outcome (from the last campaign)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "metadata": { | |
| "hidden": true, | |
| "scrolled": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAE+CAYAAABx+UwKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzt3XmYHFW9//H3ZGMNKOsFwiLbB8UryygEAROWhIusFxGQRUURhXAFjY+owI8IyAUE7gVkDUQWDVvYFIhEZEkMS6QlSFi+SJDtgsoihDXr/P6oM9KMM9PVk3R19/Tn9TzzTFX1qapvdc30t885VafaOjo6MDMzy2NAvQMwM7Pm4aRhZma5OWmYmVluThpmZpabk4aZmeXmpGFmZrkNqncA1hgkrQfMBh4tW9wGnBMRE5bQPi4HZkXEmUtie1XstwOYBSwsW/xQRBxWZBxFkXQc8E3gdxFxaB+3cTkwCngF6AAGk/19fCMi/t7HbX4MODMivtCX9a0xOGlYufciYvPOGUlrAbMkPRQRf6pjXEvCDhHxar2DKMjXgQMj4veLuZ3/KU/wks4CLgD27eP21gW0mDFZnTlpWI8i4v8k/RnYWNKWwL4RsTuApK92zqdvpSsBGwC3AicD5wHbAguAm4Hj0mY/K+k+YHWyb/8HRsQ7kr5G9u14SNrWaRFxoaR/A64EVknr3xYRJ6QYvg4cSdbM+hpwVEQ8Wc0xSpoL3AJsBhwEvAOcA6wMDATO7axpSToplXkVmAZ8OiJGdq1Blc+nxPszYB2yb+vXRMSpqWb3O+B2YOt0zMdFxLWSBgFnALun9+8+YAzwJ+C/ImJK2s/4tJ9zyo7nWmAYcJmk/wdMBy4E1iOrOV4RET9N+58GPJFeGxERL1d4u36X4kLSpum4ViariZwVEVdKGgn8LCI+mcqNTOU2Ay4F1pJ0R0TsIml34BSy8/cO8K2IeETS3sCJ6f2fA3w3ImZIGkf2N7YBsCbwIDAF+ArwMeD7EXF12u9xwBfStp8FjoyIlyocn+XgPg3rkaRtgA3J/jkrWTYiNo2IY4GTgKWBjwObkyWPEancWsDOwMZkH277SFoe+Abw+YjYAtif9OGUlj8TEVsC2wMbSVpR0giyD4vt0zpnADf2Et/dkmaW/ayWlg8Bfh0RAmYCk4AfRER7ivl7koZL2ofsQ2hzYDvgEzneE4CrgAlpe1sBO0vaL722PnBHRGwFHFt2zEcC7WQftJ8EhgL7kX34HwYgaQVgL+CK8p1FxP7AS8BBEXEt8Evg7oj4d7LzcLCkA1LxYcDJEbFxpYQhaRngy2Tv4yDgV8B5EfEpYFfg1PT30q2IWJhin50SxurAL4Cvpm38FDhN0ibARcAX0vL/B9ySjhey935Xsr+tUcAnIuJzwFHAj1OsXwb+Hdgq1ZxvJ0tYtgS4pmHllpE0M00PIvtGfVBEvCBVbFUobwrZmezb4UKyfoQR8M/ayc0R8W6anwWsFhFvp2+du0naiOyDefm0rd8At0taB7iT7AP9TUm7kSW0+8piW0nSShHxejfx9dY8NS393pjsW+yEsm0uA2wBbArcGBFvpdgvAY7p7Q2RtFw69pUknZwWL5+ObwYwn+wDDeCPZLUNyN6/qyLivTS/f9reR4ATJa1K1kR0a0S8UWH/2wKjAdL7djnZh+4DZLWY+3s5hO9IOjhNDwLuBX5I9j4tHRE3pu2+JOkG4D+Au3t7T8psS1ZLmpm2cSNwo6QjyfpinknL75L0d7IkCnBnRLyZju8lsr8PyPpbOt+/3ckS9EPpPA4Els0Zl1XgpGHlPtSn0UUHWfNGpyFdXn+7bHpBKg+ApLWBd9Ps/K7blDSM7MPrErLkM4nsH5+I+EPqQN0Z2BGYkZovBpJ9sB6b9jGArMniH/kOtdvYBwJvdOnXWR14k6zJrfz453U9jrL5zvdmYFr+2bJEuQrwPllz27yIWNTN
Nrq+f6sDAyLiZUnXAwcDB5I1WfVmQJe4OpcNTtNzI2JBL+t/qE+jLJ7uWig6t1vp76RT12NsI6sd9LZtgLldXpvPvxoInB4RF6ZtLwV8tIc4rEpunrK8XgE+KWnp1DyxRy9l7wS+ImlA+oedxAfNU935dNr+KRFxBylhSBoo6TTghIi4GTgaeIzsm+4U4EuS1kjb+BZZm/viCOD9zm/XKdnNIvuWexuwn6SPpg/NL5et90o6hs6ksD1ARMwh+0b/3fTaR8j6GPaqEMedwIGSlkr7uhD4UnrtfODbZElkRq8Hk9WKHiAlF0krprh/W2H/lQQwLzXZIWlNsqa735K9F+tIWi0lgr3L1lvABx/+DwIfT30jkL0nvwDuAkZLWj9te0dgbfI1kXa6AzisrEnrJLJmQlsCnDQsrylkzRNPkjXnPNpL2R+TfRN/BHgYuL2zKaOXbb8IhKSHyTqNXyFrfvpfYPPUlPUQ8Bfg6pRcTgd+K+lPZN+894mIPg/bHBHzyD68DkvbnEKWsKZHxD1kHeS/J2taGly26nnAGpKCrA/hnrLXDgSGS3qU7IPv6oj4ZYVQLgZK6edR4GXg3BTjI2S1qYtyHtZBwE5p/zOAG4DLc67brYiYT5YMjk7v053ASRFxd0Q8nuJ/iCxhlfeVPAYslDQD+HuK7YrUJPpd4IC0/pFkTVWzgNOAPTqbpHK6lOyCjAckPQZ8Cvhqnw/YPqTNQ6ObVU/SvmRXa40seL8bkCUldTZ5mRXJNQ2zJpEu+Z0OfM8Jw+rFNQ0zM8vNNQ0zM8vNScPMzHJz0jAzs9z6/c19pVLJnTZmZn3Q3t7e9ebQ/p80ANrb2ysXalKlUqlfH19/5nPX3Pr7+SuVSt0ud/OUmZnl5qRhZma5OWmYmVluThpmZpabk4aZmeXmpGFmZrk5aZiZWW5OGmZmlltL3NxnlsceY28pdH/jDhxW6P7MlgTXNMzMGtzYsWO55557AJg9ezaHH3543WJx0jAza3Bf/OIXuemmmwCYNGkS++67b91icdIwM2twW2+9NbNnz+b1119n+vTp7LDDDnWLxUnDzKzBtbW1seeee3LKKaew7bbbMnjw4LrF4o5wM7MmsM8++zBy5EhuuaXYCza6ck3DzKwJLFy4kPb2djbYYIO6xuGkYWbW4KZMmcJhhx3Gt7/97XqH4uYpM7NGN3r0aEaPHl3vMADXNMzMrApOGmZmlpuThpmZ5eakYWZmudWsI1zSQGA8IKAD+BbwPnB5mp8FjImIRZJOBHYDFgDHRMQMSRvmLVurYzAzW5KW9KCYvz5rr15fX7RoEePGjSMiGDJkCKeccgrrrrvuYu2zljWNPQAiYlvgeOAnwNnA8RGxPdAG7CVpS2AEsDVwAHB+Wr+asmZm1sWdd97JvHnzuPbaaxk7diynnXbaYm+zZkkjIm4GOodiXBd4A2gH7k3LJgM7A9sBUyKiIyKeBwZJWrXKsmZm1kWpVGL77bcHYPPNN2fWrFmLvc2a3qcREQskXQH8J7AvMCoiOtLLbwErAisAr5Wt1rm8rYqyr/QWR6lUWswjaWz9/fj6M5+75lbv81dp/88++yyrrbbaP8stXLiQGTNmMHDgwD7vs+Y390XEVyQdCzwILFP20lCy2secNN11+aIqyvaqvb29T7E3g1Kp1K+Pr1ATXyx8lz53zatP/3tL+G+s0v7XW2891lhjjX+WGzRoEFtttVWubfeUkGrWPCXpEEk/TLPvkiWBhySNTMt2BaYB04FdJA2QtA4wICJeBR6uoqyZmXWx5ZZbMnXqVABmzpzJxhtvvNjbrGVN40bg55KmAoOBY4AngPGShqTpSRGxUNI04H6yJDYmrT+2irJmZtbFqFGjmD59OgcccAAdHR2ceuqpi73NmiWNiHgH2K+bl0Z0U3YcMK7LsqfyljUzawaVLpFd0gYMGMBJJ520ZLe5RLdmZmb9mpOGmZnl5qRhZma5OWmYmVluThpmZpabk4aZmeXm
x72amRVkv2uPWKLbu27/C3OVe+SRRzjzzDO56qqrFnufThpmZv3Y+PHj+dWvfsUyyyxTuXAObp4yM+vH1llnHc4777wltj0nDTOzfmyXXXZh0KAl16jkpGFmZrk5aZiZWW5OGmZmlpuvnjIzK0jeS2SXtGHDhnHdddctkW25pmFmZrk5aZiZWW5OGmZmlpuThpmZ5eakYWZmuTlpmJlZbk4aZmaWm5OGmZnlluvmPknLARsAjwLLRsQ7NY3KzMwaUsWahqSdgEeAW4B/A56VNLrWgZmZWePJU9M4FdgOmBwRL0saAVwNTOlpBUmDgQnAesBSwCnAC8CtwJ9TsQsj4lpJJwK7AQuAYyJihqQNgcuBDmAWMCYiFnVXtsrjNTOzxZCnT2NARPy1cyYiHs+xzsHAaxGxPfAfwM+AduDsiBiZfq6VtCUwAtgaOAA4P61/NnB8Wr8N2KuXsmZmVpA8NY0XJe0OdEj6CDAGeL7COtcDk9J0G1nNoB2QpL3IahvHkNVgpkREB/C8pEGSVk1l703rTwZGA9Fd2Yh4pdIBlEqlHIfZvPr78fVnPnfNrRXPX56k8U3gHGBtYDZwF3B4bytExNsAkoaSJY/jyZqpLo2IkqTjgBOBN4DXylZ9C1gRaEvJoXzZCj2UrZg02tvbKxVpWqVSqV8fX6Emvlj4Ln3umld//9/rKSFWTBoR8XfgSwCSVgSGRcTLldaTtDZwE3BBREyU9JGIeCO9fBNwHlnn+tCy1YaSJZJF3Syb00NZMzMrSJ6rpw6TNCE1Gz0GTJJ0SoV1VifrKD82IiakxXdI2ipN7wSUgOnALpIGSFqHrP/kVeBhSSNT2V2Bab2UNTOzguRpnjoCGEXWuX0LcDTwAFmTU09+BHwUOEHSCWnZd4H/kTQf+CtweETMkTQNuJ8sgY1JZccC4yUNAZ4AJkXEwh7KmplZQXLd3BcRr0v6PHBuRCyQtEyF8keTJZeutu2m7DhgXJdlT5FdKVWxrJmZFSfPJbePSboVWB+4U9J1wEO1DcvMzBpRnqTxNeAMYHhEzAOuSsvMzKzF5GmeWonsvokRktqAgcAXgS/XMjAzM2s8eWoaNwKbk3WELwfsyYcviTUzsxaRJ2msEhFfAX5NlkBGApvWMigzM2tMeZLGP9LvADaLiDeBwbULyczMGlWePo27JF0PfA+YkgYOfL+2YZmZWSOqWNOIiOOAH0TEc2TDiQTwn7UOzMzMGk+eYUQGAxtLOgT4JNmggaNqHZiZmTWePM1T1wNrkA3n0TnybAdwZa2CMjOzxpQnaWwSEZvUPBIzM2t4ea6emp1GlTUzsxbXY01D0t1kzVCrAY9KeoTsCXxtQEdE7FhMiGZm1ih6a54aV1QQZmbWHHpsnoqIeyPiXuBp4PNp+nng68CTBcVnZmYNJE+fxi+AZ9L0S2RP0buqZhGZmVnDypM0VoqIiwEiYm5EjAdWqW1YZmbWiPIkjfck7do5I2kn4J3ahWRmZo0qz30a3wR+KamzSeoFsmHSzcysxeRJGttExCclrQzMj4g5tQ7KzMwaU56kcRRwUUS8VutgzMysseVJGi9Iugt4EHivc2FEnFSzqMzMrCHlSRoPlE231SoQMzNrfBWTRkT8uNqNpuHUJwDrAUsBpwCPA5eTDU0yCxgTEYsknQjsRjZEyTERMUPShnnLVhubmZn1XcWkIWkRHwyJ3umliFi7l9UOBl6LiEMkrQTMTD/HR8Q9ki4C9pL0HDAC2BpYG7gB+AxwdhVlzcysIHlqGv+8lyPVIPYGtqmw2vXApDTdRlYzaAfuTcsmA6PJngI4JSI6gOclDZK0ajVlI+KVyodpZmZLQp4+jX+KiPnA9ZKOq1DubQBJQ8mSx/HAmekDH+AtYEVgBbInAdJleVsVZSsmjVKpVKlIU+vvx9ef+dw1t1Y8f3map75cNtsGbArMy7He2sBNwAURMVHSGWUvDwXeAOak6a7LF1VRtqL29vY8xZpSqVTq18dX
qIkvFr5Ln7vm1d//93pKiHmGEdmh7GdEWrZ/bytIWh2YAhwbERPS4ocljUzTu5INfDgd2EXSgPSgpwER8WqVZc3MrCC91jQkDQS+14cb+34EfBQ4QdIJadnRwLmShpA9b3xSRCyUNA24nyyBjUllxwLjc5Y1M7OC9PbkvpHAtcAqkv4MfDEiHs2z0Yg4mixJdDWim7Lj6PLAp4h4Km9ZMzMrTm/NUz8FDgGWI7sE9vRCIjIzs4bVW/PU4IiYkqYvkdRdzcHMzFpIbzWNRV3m59YyEDMza3y91TSGpMtm27qbj4jnax2cmZk1lt6SxvJkd2WXD1I4Nf3uANavVVBmZtaYekwaEbFegXGYmVkTyHNzn5mZGeCkYWZmVegxaUg6Jv3+VHHhmJlZI+utI/woSbcCEyXtSpen9vnqKTOz1tNb0vglcAcwjA+umurkq6fMzFpQb1dPnQicKOnCiDiiwJjMzKxB5XkI0xhJRwA7pfJ3AT+LiK53jJuZWT+XJ2mcDmwETCDr1ziUrGnqmBrGZWZmDShP0hgNbNFZs5B0G5BriHQzM+tf8tynMYgPJ5dBwMLahGNmZo0sT03jl8A9kq5O818CJtYuJDMza1QVaxoRcSpwMrAOsB7wk7TMzMxaTJ6aBhExGZhc41jMzKzBeewpMzPLzUnDzMxyq9g8Jel24OfAzRExv/YhmZlZo8pT0zgN+A/gz5LOl/SZGsdkZmYNqmJNIyKmAlMlLQPsC9wgaQ5wKXBhRMytcYxmZtYgcl09JWkkcAjZ3eGTgWuAUcCvgF16WW9r4PSIGClpC+BW4M/p5Qsj4lpJJwK7AQuAYyJihqQNgcvJRtOdBYyJiEXdla3yeM3MbDHk6dN4DniGrF/jqIh4Ly2/B3iol/W+T5Zo3kmL2oGzI+KssjJbAiOArYG1gRuAzwBnA8dHxD2SLgL2SnF0V9bMzAqSp09jR2D/iLgSINUCiIhFEbFlL+vNBvYpm28HdpM0VdJlkoYC2wFTIqIjPdRpkKRVU9l703qTgZ17KWtmZgXJ0zy1G/BVYEtgNeDXkv4nIi7pbaWIuEHSemWLZgCXRkRJ0nHAicAbwGtlZd4CVgTaIqKjy7IVeij7SqUDKJVKlYo0tf5+fP2Zz11za8XzlydpHE7WJEREPCepHXgQ6DVpdOOmiHijcxo4D7gFGFpWZihZIlnUzbI5PZStqL29vcpQm0epVOrXx1eoiS8Wvkufu+bV3//3ekqIeZqnBgPlV0jNI+ugrtYdkrZK0zsBJWA6sIukAZLWAQZExKvAw6nzHWBXYFovZc3MrCB5aho3A3dJui7N70N21VS1jgDOkzQf+CtweETMkTQNuJ8sgY1JZccC4yUNAZ4AJkXEwh7KmplZQfLcp3GspH3JrlyaD5wbETfn2XhEPAsMT9N/BLbtpsw4YFyXZU+l/VUsa2Zmxck79tQTwHVktY7XJX2udiGZmVmjynOfxvnAHmSX0HbqILsU18zMWkjeZ4Sr86Y+MzNrXXmap54B2modiJmZNb48NY3Xgccl3Qe837kwIr5Ws6jMzKwh5Ukav0k/ZmbW4vJccntFGg5kU+AOYO2I+EutAzMzs8ZTsU9D0v7Ar4FzgJWA+yUdXOvAzMys8eTpCD8W+CzwVkT8HdgC+GFNozIzs4aUJ2ksjIi3Omci4mU+PKCgmZm1iDwd4Y9JOgoYLGlz4EhgZm3DMjOzRpSnpjEGWAt4D5hANkT5kbUMyszMGlOeq6feIevDcD+GmVmLyzP21CL+9fkZL0fEsNqEZGZmjSpPTeOfTViSBgN7A9vUMigzM2tMeYdGByAi5kfE9XiEWzOzlpSneerLZbNtZHeGz6tZRGZm1rDyXHK7Q9l0B/AqsH9twjEzs0aWp0/j0CICMTOzxpeneeov/OvVU5A1VXVExPpLPCozM2tIeZqnJgJzgfHAfOAg4DPAcTWMy8zMGlCepLFLRHy6bP4cSaWIeK5WQZmZWWPKc8ltm6Sd
O2ck7U42lIiZmbWYPDWNw4ErJf0bWd/Gk8BXahqVmZk1pDxXT5WATSWtArwfEW/n3bikrYHTI2KkpA2By8kSzyxgTEQsknQisBuwADgmImZUU7aKYzUzs8WU58l960r6LXA/sLyku9LjXyut933gUmDptOhs4PiI2J7syqu9JG0JjAC2Bg4Azu9DWTMzK0iePo2LgZ8CbwN/A64Grsyx3mxgn7L5duDeND0Z2BnYDpgSER0R8TwwSNKqVZY1M7OC5OnTWCUipkg6PSI6gPGSxlRaKSJu6FIjaUvrA7wFrAisALxWVqZzeTVlX6kUS6lUqlSkqfX34+vPfO6aWyuevzxJ4z1Jw0g3+Enajuy+jWqVPyJ2KPAG2VVYQ7tZXk3Zitrb2/sQbnMolUr9+vgKNfHFwnfpc9e8+vv/Xk8JMU/z1HeAW4GNJM0ku9nv232I4WFJI9P0rsA0YDqwi6QBktYBBkTEq1WWNTOzguSpaaxOdgf4xsBA4MmI6Msot2PJmraGAE8AkyJioaRpZJ3sA8geLVttWTMzK0iepHFGRNwGPFbtxiPiWWB4mn6K7OqnrmXGAeO6LMtd1szMipMnacyWNAF4EHivc2FE5LmCyszM+pE8SeM1snslhpct6yDfZbdmZtaP9Jg0JK0VEf/n52mYmVmn3q6e+nXnhKSxBcRiZmYNrrek0VY2fVCtAzEzs8bXW9Iof1pfW4+lzMysZeS5uQ+6f9yrmZm1mN6untpU0jNpeq2yaT8b3MysRfWWNDYuLAozM2sKPSYNPwPczMy6ytunYWZm5qRhZmb5OWmYmVluThpmZpabk4aZmeXmpGFmZrk5aZiZWW5OGmZmlpuThpmZ5eakYWZmuTlpmJlZbk4aZmaWm5OGmZnl5qRhZma59fY8jZqQ9EdgTpr9C3AxcA6wAJgSET+WNAC4ANgMmAscFhFPSxretWzR8ZuZtbJCk4akpYG2iBhZtmwm8AXgGeA2SVsAHwOWjohtUqI4C9gLuKhr2Yh4uMhjMDNrZUXXNDYDlpU0Je17HLBURMwGkHQHsDOwBvAbgIh4QNKnJa3QQ1knDTOzghSdNN4FzgQuBTYCJgNvlL3+FrA+sALwZtnyhWnZnG7KVlQqlfoecRPo78fXn/ncNbdWPH9FJ42ngKcjogN4StKbwEplrw8lSyLLpulOA8gSxtBuylbU3t6+ODE3tFKp1K+Pr1ATXyx8lz53zau//+/1lBCLvnrqa2T9E0hakyw5vCNpA0ltwC7ANGA68PlUbjjwaETMAeZ1U9bMzApSdE3jMuBySb8HOsiSyCLgl8BAsiuiHpT0B2CUpPuANuDQtP63upYtOH4zs5ZWaNKIiHnAgd28NLxLuUVkCaLr+g90LWtmZsXxzX1mZpabk4aZmeXmpGFmZrk5aZiZWW6Fjz3V3+0x9pZC9zfuwGGF7s/MWptrGmZmlpuThpmZ5ebmKbM6Of3pS+HpSwvb33X7X1jYvqz/ck3DzMxyc9IwM7PcnDTMzCw3Jw0zM8vNScPMzHJz0jAzs9ycNMzMLDcnDTMzy81Jw8zMcnPSMDOz3Jw0zMwsNycNMzPLzUnDzMxy8yi3ZmZ90KqjFLumYWZmubmm0eRa9duOmdVH0yUNSQOAC4DNgLnAYRHxdH2jMrN622PsLYXub5mtCt1dw2jG5qm9gaUjYhvgB8BZdY7HzKxlNGPS2A74DUBEPAB8ur7hmJm1jraOjo56x1AVSZcCN0TE5DT/PLB+RCzornypVGquAzQzaxDt7e1tXZc1XZ8GMAcYWjY/oKeEAd0ftJmZ9U0zNk9NBz4PIGk48Gh9wzEzax3NWNO4CRgl6T6gDTi0zvGYmbWMpuvTMDOz+mnG5ikzM6sTJw0zM8vNScPMzHJz0jAzs9ycNJqUpJXqHYNVT9LuXeb3q1cstnjSOHgtx1dPNRlJI4DzgYHA9cBzEXFZfaOySlKy2Bb4EjAxLR4I7BkRH69bYFYVSQcBC4GlgJ8C
Z0TEmfWNqlgtmSmb3MnA54C/AqcCR9Y3HMvpEeBJ4D0g0s8ssiRizeNo4LfAwcDawB71Dad4zXhzX6tbFBGvS+qIiPclvVXvgKyyiHgBuELSVRGxqN7xWJ+9l36/FRFzJbXcZ2jLHXA/8LSk/wZWlvQD4Ll6B2RVOVbSscC7ZCMadETEmnWOyfJ7BngA+I6kE4E/1TmewjlpNJ8jga8BvwfeAb5R33CsSgcAa0bEu/UOxKoXEYdKWj4i3pb0h4j4W71jKpqTRvO5NSJG1zsI67O/8EEThzUZSTsDg9KVU+dJOiEiJlZarz9x0mg+/5C0J/AUsAggIp6qb0hWhSHAo5IeBToAIuLA+oZkVfgJcCDZFYzbAtfxwdVwLcFJo/msBnynbL4D2LFOsVj1Tq93ALZY3gX+BiyIiL9Karl7Fpw0mkxE7FDvGGyx/BE4FlgTuJUW7EhtcnPIHjd9iaQxwN/rHE/hnDSajKS/kJo1kjcjYot6xWNVmwBMBkaQ3WtzWZq25rAfsEFEPC7pk8Cl9Q6oaL65r/lsAnwc+ARwCHBvfcOxKq0cEROA+RFxH/4fbDarAD+SNAXYBti8zvEUzjWNJhMRc8tmp6d7NqyJSNok/R4G9Ph8e2tIlwBnAScAU4ErgOF1jahgThpNJiWJzuapNUlXUFnT+Dbwc7La4iTgiPqGY1VaJiLuknR8RISk9+sdUNGcNJrPk2XTj5B1ylnzWC8itumcSaPcPlzHeKw670vaBRgoaTjgpGGNSVLnDX0vd3lpa2BKweFYlcpHuZX02bR4ALAX2bX+1hwOB84k69v4Hi1YU3TSaB49jYbagZNGM3gEWJkPRrmFrGnxmrpFZH3xHnBZRPxW0lHAP+odUNGcNJrHNyJigaQh9Q7Eqlc+ym1aNIDs6pvH6xeV9cE1wDlp+nXgF8DuPRfvf5w0mseVZMMXBB90hLel6fXrFZRV7WzgCWBdYEuyu4u/UteIrBrLRcStABExUdJh9Q6oaE4aTaJzfKKI+Fi9Y7HF8pmIOEbS3RGxg6Tf1Tsgq8o8SaPIhkffiha8etFJo8mkwQrHAIPJahorR8Sn6huVVWGgpHbg2dTUOLTeAVlVDiPrCD+XrGnxm/UNp3i+G7X5nAJL9k/lAAAFlElEQVSMA14gu7Ho0bpGY9W6EriA7IPnDODi+oZjVfoHcH56rvvdwGt1jqdwThrN5+WIuB8gIi4H1qpvOFaNiLggIraOiMci4piIuKzeMVlVrgGWStOdHeEtxUmjSUhaMU3OlfQ5YHC6yWiVOoZlOUmalH6/LOml9POypJfqHZtV5UMd4cCydY6ncO7TaB63AduRDcU8iKyZ6iTg5HoGZflExL5pclhELOxcLmmFOoVkfdPyHeGuaTSP+ZL+AOxLljAuJWuaOrquUVm17pK0BoCkrYD76hyPVecwsgtRZgBH0oId4a5pNI+dyZLEhWR/rNacfgzcLule4NNkXwKsSUTE08De9Y6jnpw0mkRq0nge2K3esdhieYysiXEUcAcwu77hWDUkvUx2Q20bsBLwTLqSqmW4ecqsWNOACyJiU+Al4P46x2NViIg1ImLNiFgD2Jisb6OlOGmYFWvHiLgFICLOpAXbxPuLiHiO7EmaLcXNU2bFWlHS1cBHya7xn1XneKwK6dx1jv22Btlz3luKaxpmxToXOBR4BbiM7O5+ax63AdOBe8mSx0/qG07xnDTMCpauwOmIiFeAt+odj1XlG2RjTo0ie174T+sbTvGcNMyK9bqkbwLLSToAeKPeAVlVFgFTgY9ExDX45j4zq7GvAx8DXiW7T+Pr9Q3HqjSYbKDJqZJ2AFruoWhOGmYFiog5ZA9iOoKsf2P5+kZkVTqU7N6a04FVacEHaLV1dHRULmVmS4SkC4BdgZdJT16MiM/WNyqz/HzJrVmxtgI2iIiWawu3/sHNU2bFehpYut5BmPWVaxpmxVoHeE7S02nezVPWVJw0zIr1pXoHYLY43DxlVqyF
ZM8Hvx34X7LOcLOm4aRhVqzxwFXAtsAVZEOJmDUNN0+ZFWvpiPhVmr5Z0nfrGo1ZlVzTMCvWIEn/DpB++0YpayquaZgV67+AyyStSfYQpm/UOR6zqrimYVaszYChwHyyYShuqm84ZtVxTcOsWN8H9gBeqHcgZn3hpGFWrGfS8zTMmpKThlmx3pU0GZhJ6gSPiB/VNySz/Jw0zIp1e70DMFscHhrdzMxy89VTZmaWm5OGmZnl5j4Nsx5IWg94CnicrNN6CNkNeYdGxItVbmsPYKOIOHtJx2lWJCcNs969FBGbd85I+m/gPOA/q9xO+xKNyqxOnDTMqjMV2FPScOAcsqfwvQp8MyKelnQPMC4i7kk1lXuAzwPfApD0HHAL2ei2mwBzge9GxF2SdgdOIWs2fiZt82+SngWuBXYHFgA/AsYCGwFjI+I6SasDFwNrA4uAH0bEnbV9K6wVuU/DLCdJg4H9gQeBa4CjImIz4CLg6p7Wi4jHU5mLIuLnwMnA0xHxceAQ4CeSViP70N87Ij4FTAd+VraZlyJiU+CPwA+A0cDBwA/T6+cAEyKiHdgTuFjS0CVz5GYfcNIw692akmZKmgn8ieyhSZcD/4iIPwBExPXAhpJWzLnNEWTP1CAiHo2IbYCtgBkR8WwqcwmwU9k6k9Pv54B7I2JBmv5oWr4zcFKKczIwGNigymM1q8jNU2a9+1CfBoCkT3VTrg0YSNZh3vk0vsE9bHN+l+1twr9+gWvjw/+f88qmF3SzzYHAjhHxetrmmsDfeti/WZ+5pmFWvQBWlvQZAEn7Ac+lD+xXgU1Tub3L1lnAB0lgKnBAWncT4DdkTV7DUz8IwOHA3VXEdBdwZNrmJ8hqRctWdVRmObimYValiJgraX/gZ5KWA14n6+sAOAO4QtLXgJvLVpualv8NOBEYL+kRsmRySOrwPhy4SdIQsqanr1cR1n8Bl0jqbEI7JCLeWozDNOuWhxExM7Pc3DxlZma5OWmYmVluThpmZpabk4aZmeXmpGFmZrk5aZiZWW5OGmZmlpuThpmZ5fb/ARmZRvJiy5dbAAAAAElFTkSuQmCC\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x116538a58>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "pd.crosstab(data.poutcome,data.y).plot(kind='bar')\n", | |
| "plt.title('Purchase Frequency for Poutcome')\n", | |
| "plt.xlabel('Poutcome')\n", | |
| "plt.ylabel('Frequency of Purchase')\n", | |
| "plt.savefig('pur_fre_pout_bar')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Poutcome (previous outcome) seems to be a good predictor of the outcome variable." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "heading_collapsed": true | |
| }, | |
| "source": [ | |
| "## Create dummy variables ##" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Dummy variables are variables with only two values, zero and one. They are created from categorical variables. Categorical variables are hard to analyse directly, so for each possible value of a categorical variable we create a new (dummy) variable that can only take the values one and zero." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 50, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "cat_vars=['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']\n", | |
| "for var in cat_vars:\n", | |
| " cat_list='var'+'_'+var\n", | |
| " cat_list = pd.get_dummies(data[var], prefix=var)\n", | |
| " data1=data.join(cat_list)\n", | |
| " data=data1\n", | |
| "cat_vars=['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']\n", | |
| "data_vars=data.columns.values.tolist()\n", | |
| "to_keep=[i for i in data_vars if i not in cat_vars]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Our final data columns will be:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 51, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',\n", | |
| " 'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y',\n", | |
| " 'job_admin.', 'job_blue-collar', 'job_entrepreneur',\n", | |
| " 'job_housemaid', 'job_management', 'job_retired',\n", | |
| " 'job_self-employed', 'job_services', 'job_student',\n", | |
| " 'job_technician', 'job_unemployed', 'job_unknown',\n", | |
| " 'marital_divorced', 'marital_married', 'marital_single',\n", | |
| " 'marital_unknown', 'education_Basic', 'education_high.school',\n", | |
| " 'education_illiterate', 'education_professional.course',\n", | |
| " 'education_university.degree', 'education_unknown', 'default_no',\n", | |
| " 'default_unknown', 'default_yes', 'housing_no', 'housing_unknown',\n", | |
| " 'housing_yes', 'loan_no', 'loan_unknown', 'loan_yes',\n", | |
| " 'contact_cellular', 'contact_telephone', 'month_apr', 'month_aug',\n", | |
| " 'month_dec', 'month_jul', 'month_jun', 'month_mar', 'month_may',\n", | |
| " 'month_nov', 'month_oct', 'month_sep', 'day_of_week_fri',\n", | |
| " 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',\n", | |
| " 'day_of_week_wed', 'poutcome_failure', 'poutcome_nonexistent',\n", | |
| " 'poutcome_success'], dtype=object)" | |
| ] | |
| }, | |
| "execution_count": 51, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "data_final=data[to_keep]\n", | |
| "data_final.columns.values" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 52, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "data_final_vars=data_final.columns.values.tolist()\n", | |
| "y=['y']\n", | |
| "X=[i for i in data_final_vars if i not in y]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "heading_collapsed": true | |
| }, | |
| "source": [ | |
| "## Feature Selection ##" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Recursive Feature Elimination (RFE) is based on the idea of repeatedly constructing a model and choosing either the best or worst performing feature, setting that feature aside and then repeating the process with the rest of the features. This process is applied until all features in the dataset are exhausted. The goal of RFE is to select features by recursively considering smaller and smaller sets of features.\n", | |
| "\n", | |
| "Let's do some feature selection:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 53, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/sklearn/utils/validation.py:578: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n", | |
| " y = column_or_1d(y, warn=True)\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[False False False False True False False False True False False True\n", | |
| " False False False True False True True False False False False False\n", | |
| " False False True False False False False False True False False False\n", | |
| " False False False False False True False True True False False False\n", | |
| " True True True False False False True False False False True True\n", | |
| " True]\n", | |
| "[35 33 13 42 1 17 21 23 1 30 12 1 28 41 38 1 31 1 1 14 24 40 7 8\n", | |
| " 9 43 1 2 39 3 4 5 1 20 44 36 15 37 22 19 16 1 18 1 1 25 27 26\n", | |
| " 1 1 1 32 10 11 1 34 29 6 1 1 1]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from sklearn import datasets\n", | |
| "from sklearn.feature_selection import RFE\n", | |
| "from sklearn.linear_model import LogisticRegression\n", | |
| "logreg = LogisticRegression()\n", | |
| "rfe = RFE(logreg, 18)\n", | |
| "rfe = rfe.fit(data_final[X], data_final[y] )\n", | |
| "print(rfe.support_)\n", | |
| "print(rfe.ranking_)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
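| "The RFE has helped us narrow down the features. For the rest of the notebook we use the following 18 features: “previous”, “euribor3m”, “job_blue-collar”, “job_retired”, “job_services”, “job_student”, “default_no”, “month_aug”, “month_dec”, “month_jul”, “month_nov”, “month_oct”, “month_sep”, “day_of_week_fri”, “day_of_week_wed”, “poutcome_failure”, “poutcome_nonexistent”, “poutcome_success”. Note that this list does not exactly match the `rfe.support_` mask printed above: mapped onto the column order shown earlier, that run selected “education_Basic”, “contact_cellular”, “month_apr”, “month_mar”, “month_may” and “day_of_week_mon” in place of “month_dec”, “month_jul”, “month_oct”, “month_sep”, “day_of_week_fri” and “day_of_week_wed”." | |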
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 54, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "cols=[\"previous\", \"euribor3m\", \"job_blue-collar\", \"job_retired\", \"job_services\", \"job_student\", \"default_no\", \n", | |
| " \"month_aug\", \"month_dec\", \"month_jul\", \"month_nov\", \"month_oct\", \"month_sep\", \"day_of_week_fri\", \"day_of_week_wed\", \n", | |
| " \"poutcome_failure\", \"poutcome_nonexistent\", \"poutcome_success\"] \n", | |
| "X=data_final[cols]\n", | |
| "y=data_final['y']" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "heading_collapsed": true | |
| }, | |
| "source": [ | |
| "## Implementing the model ##" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 55, | |
| "metadata": { | |
| "hidden": true, | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Optimization terminated successfully.\n", | |
| " Current function value: 0.287116\n", | |
| " Iterations 7\n", | |
| " Logit Regression Results \n", | |
| "==============================================================================\n", | |
| "Dep. Variable: y No. Observations: 41188\n", | |
| "Model: Logit Df Residuals: 41170\n", | |
| "Method: MLE Df Model: 17\n", | |
| "Date: Sat, 03 Mar 2018 Pseudo R-squ.: 0.1844\n", | |
| "Time: 09:01:56 Log-Likelihood: -11826.\n", | |
| "converged: True LL-Null: -14499.\n", | |
| " LLR p-value: 0.000\n", | |
| "========================================================================================\n", | |
| " coef std err z P>|z| [0.025 0.975]\n", | |
| "----------------------------------------------------------------------------------------\n", | |
| "previous 0.2385 0.051 4.642 0.000 0.138 0.339\n", | |
| "euribor3m -0.4981 0.012 -40.386 0.000 -0.522 -0.474\n", | |
| "job_blue-collar -0.3222 0.049 -6.549 0.000 -0.419 -0.226\n", | |
| "job_retired 0.3821 0.069 5.552 0.000 0.247 0.517\n", | |
| "job_services -0.2423 0.065 -3.701 0.000 -0.371 -0.114\n", | |
| "job_student 0.3540 0.086 4.107 0.000 0.185 0.523\n", | |
| "default_no 0.3312 0.056 5.943 0.000 0.222 0.440\n", | |
| "month_aug 0.4272 0.055 7.770 0.000 0.319 0.535\n", | |
| "month_dec 0.8061 0.163 4.948 0.000 0.487 1.125\n", | |
| "month_jul 0.7319 0.056 13.094 0.000 0.622 0.841\n", | |
| "month_nov 0.2706 0.064 4.249 0.000 0.146 0.395\n", | |
| "month_oct 0.8043 0.087 9.258 0.000 0.634 0.975\n", | |
| "month_sep 0.5906 0.096 6.160 0.000 0.403 0.778\n", | |
| "day_of_week_fri -0.0044 0.046 -0.097 0.923 -0.094 0.085\n", | |
| "day_of_week_wed 0.1226 0.044 2.771 0.006 0.036 0.209\n", | |
| "poutcome_failure -1.8438 0.100 -18.412 0.000 -2.040 -1.647\n", | |
| "poutcome_nonexistent -1.1344 0.070 -16.253 0.000 -1.271 -0.998\n", | |
| "poutcome_success 0.0912 0.114 0.803 0.422 -0.131 0.314\n", | |
| "========================================================================================\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import statsmodels.api as sm\n", | |
| "\n", | |
| "# Workaround, because of error inside of the official released package\n", | |
| "from scipy import stats\n", | |
| "stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)\n", | |
| "# End of workaround\n", | |
| "\n", | |
| "logit_model=sm.Logit(y,X)\n", | |
| "result=logit_model.fit()\n", | |
| "print(result.summary())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "The p-value for each term tests the null hypothesis that the coefficient is equal to zero (no effect). A low p-value (< 0.05) indicates that you can reject the null hypothesis. In other words, a predictor that has a low p-value is likely to be a meaningful addition to your model because changes in the predictor's value are related to changes in the response variable.\n", | |
| "\n", | |
| "Conversely, a larger (insignificant) p-value suggests that changes in the predictor are not associated with changes in the response.\n", | |
| "\n", | |
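| "The p-values for most of the variables are smaller than 0.05; therefore, most of them are significant to the model. The exceptions are `day_of_week_fri` (p = 0.923) and `poutcome_success` (p = 0.422), which are not significant at the 0.05 level." | |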
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "heading_collapsed": true | |
| }, | |
| "source": [ | |
| "## Logistic Regression Model Fitting ##" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 56, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", | |
| " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", | |
| " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", | |
| " verbose=0, warm_start=False)" | |
| ] | |
| }, | |
| "execution_count": 56, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "from sklearn.linear_model import LogisticRegression\n", | |
| "from sklearn import metrics\n", | |
| "\n", | |
| "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n", | |
| "logreg = LogisticRegression()\n", | |
| "logreg.fit(X_train, y_train)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "We used `LogisticRegression()` without any parameters, so the following default values are used:\n", | |
| "\n", | |
| "*LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1, penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", | |
| "verbose=0, warm_start=False)*" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "**Predicting the test set results and calculating the accuracy**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 57, | |
| "metadata": { | |
| "hidden": true, | |
| "scrolled": false | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Accuracy of logistic regression classifier on test set: 0.90\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "y_pred = logreg.predict(X_test)\n", | |
| "print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Accuracy of logistic regression classifier on test set: 0.90" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "heading_collapsed": true | |
| }, | |
| "source": [ | |
| "## Cross Validation ##" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "Cross validation attempts to avoid overfitting while still producing a prediction for each observation in the dataset. We are using 10-fold cross-validation to train and evaluate our Logistic Regression model." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 58, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "10-fold cross validation average accuracy: 0.897\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from sklearn import model_selection\n", | |
| "from sklearn.model_selection import cross_val_score\n", | |
| "\n", | |
| "kfold = model_selection.KFold(n_splits=10, random_state=7)\n", | |
| "modelCV = LogisticRegression()\n", | |
| "scoring = 'accuracy'\n", | |
| "results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)\n", | |
| "print(\"10-fold cross validation average accuracy: %.3f\" % (results.mean()))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "10-fold cross validation average accuracy: 0.897\n", | |
| "\n", | |
| "The average accuracy remains very close to the Logistic Regression model accuracy; hence, we can conclude that our model generalizes well." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "heading_collapsed": true | |
| }, | |
| "source": [ | |
| "## Confusion Matrix ##" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "In the field of machine learning and specifically the problem of statistical classification, a confusion matrix, also known as an error matrix, is a specific table layout that allows visualization of the performance of an algorithm, typically a supervised learning one. Each row of the matrix represents the instances in a predicted class while each column represents the instances in an actual class (or vice versa). The name stems from the fact that it makes it easy to see if the system is confusing two classes (i.e. commonly mislabelling one as another).\n", | |
| "\n", | |
| "It is a special kind of contingency table, with two dimensions (\"actual\" and \"predicted\"), and identical sets of \"classes\" in both dimensions (each combination of dimension and class is a variable in the contingency table).\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 59, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[[10872 109]\n", | |
| " [ 1122 254]]\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from sklearn.metrics import confusion_matrix\n", | |
| "\n", | |
| "# Bind the result to its own name: assigning it to confusion_matrix would replace the imported\n", | |
| "# function with an array, so any later call to confusion_matrix(...) would fail.\n", | |
| "# Rows are the actual classes, columns the predicted classes.\n", | |
| "conf_matrix = confusion_matrix(y_test, y_pred)\n", | |
| "print(conf_matrix)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "The result is telling us that we have 10872+254 correct predictions and 1122+109 incorrect predictions." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "heading_collapsed": true | |
| }, | |
| "source": [ | |
| "## Compute precision, recall, F-measure and support ##" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "To quote from Scikit Learn:\n", | |
| "\n", | |
| "The precision is the ratio **tp / (tp + fp)** where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier to not label a sample as positive if it is negative.\n", | |
| "\n", | |
| "The recall is the ratio **tp / (tp + fn)** where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.\n", | |
| "\n", | |
| "The F-beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F-beta score reaches its best value at 1 and worst score at 0.\n", | |
| "The F-beta score weights the recall more than the precision by a factor of beta. **beta = 1.0** means recall and precision are equally important.\n", | |
| "\n", | |
| "The support is the number of occurrences of each class in y_test." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 60, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " precision recall f1-score support\n", | |
| "\n", | |
| " 0 0.91 0.99 0.95 10981\n", | |
| " 1 0.70 0.18 0.29 1376\n", | |
| "\n", | |
| "avg / total 0.88 0.90 0.87 12357\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from sklearn.metrics import classification_report\n", | |
| "\n", | |
| "# Per-class precision, recall, F1 and support for the test-set predictions.\n", | |
| "report_text = classification_report(y_test, y_pred)\n", | |
| "print(report_text)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
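| "**Interpretation:** Of the entire test set, 88% of the promoted term deposits were term deposits that the customers liked. Of the entire test set, 90% of the customers’ preferred term deposits were promoted. Note that these are the support-weighted averages over both classes; for the positive class (1) alone, precision is 0.70 and recall is only 0.18." | |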
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "heading_collapsed": true | |
| }, | |
| "source": [ | |
| "## ROC Curve ##" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "source": [ | |
| "The **receiver operating characteristic (ROC)** curve is another common tool used with binary classifiers. \n", | |
| "\n", | |
| "The dashed red line represents the ROC curve of a purely random classifier; a good classifier stays as far away from that line as possible (toward the top-left corner)." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 61, | |
| "metadata": { | |
| "hidden": true | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "image/png": "\n", | |
| "text/plain": [ | |
| "<matplotlib.figure.Figure at 0x116a401d0>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "from sklearn.metrics import roc_auc_score\n", | |
| "from sklearn.metrics import roc_curve\n", | |
| "\n", | |
| "# Score the positive class once and reuse it for both the AUC and the curve, so the area shown\n", | |
| "# in the legend belongs to the curve that is plotted. Hard 0/1 labels from predict() would\n", | |
| "# give the area of a single-threshold curve instead.\n", | |
| "y_score = logreg.predict_proba(X_test)[:,1]\n", | |
| "logit_roc_auc = roc_auc_score(y_test, y_score)\n", | |
| "fpr, tpr, thresholds = roc_curve(y_test, y_score)\n", | |
| "\n", | |
| "plt.figure()\n", | |
| "plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)\n", | |
| "# Diagonal reference line: the ROC curve of a purely random classifier.\n", | |
| "plt.plot([0, 1], [0, 1],'r--')\n", | |
| "plt.xlim([0.0, 1.0])\n", | |
| "plt.ylim([0.0, 1.05])\n", | |
| "plt.xlabel('False Positive Rate')\n", | |
| "plt.ylabel('True Positive Rate')\n", | |
| "plt.title('Receiver operating characteristic')\n", | |
| "plt.legend(loc=\"lower right\")\n", | |
| "# Save before show(): the figure is written to Log_ROC in matplotlib's default image format.\n", | |
| "plt.savefig('Log_ROC')\n", | |
| "\n", | |
| "plt.show()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.6.3" | |
| }, | |
| "toc": { | |
| "nav_menu": {}, | |
| "number_sections": true, | |
| "sideBar": true, | |
| "skip_h1_title": false, | |
| "title_cell": "Table of Contents", | |
| "title_sidebar": "Contents", | |
| "toc_cell": false, | |
| "toc_position": {}, | |
| "toc_section_display": true, | |
| "toc_window_display": false | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment