Skip to content

Instantly share code, notes, and snippets.

@kcarnold
Created November 15, 2019 14:30
Show Gist options
  • Save kcarnold/dd4d1b1787cdb1d66ec1cbaa4927f1fc to your computer and use it in GitHub Desktop.
Save kcarnold/dd4d1b1787cdb1d66ec1cbaa4927f1fc to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import sklearn.datasets\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import confusion_matrix\n",
"from sklearn.linear_model import LinearRegression, LogisticRegression\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Dataset source: <https://archive.ics.uci.edu/ml/datasets/Statlog+(German+Credit+Data)>"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# Pin the OpenML dataset version so a fresh run always fetches the same data\n",
"# instead of whichever version is currently marked 'active'.\n",
"credit_data = sklearn.datasets.fetch_openml('credit-g', version=1)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"**Author**: Dr. Hans Hofmann \n",
"**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994 \n",
"**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)\n",
"\n",
"**German Credit data** \n",
"This dataset classifies people described by a set of attributes as good or bad credit risks.\n",
"\n",
"This dataset comes with a cost matrix: \n",
"``` \n",
" Good Bad (predicted) \n",
"Good 0 1 (actual) \n",
"Bad 5 0 \n",
"```\n",
"\n",
"It is worse to class a customer as good when they are bad (5), than it is to class a customer as bad when they are good (1). \n",
"\n",
"### Attribute description \n",
"\n",
"1. Status of existing checking account, in Deutsche Mark. \n",
"2. Duration in months \n",
"3. Credit history (credits taken, paid back duly, delays, critical accounts) \n",
"4. Purpose of the credit (car, television,...) \n",
"5. Credit amount \n",
"6. Status of savings account/bonds, in Deutsche Mark. \n",
"7. Present employment, in number of years. \n",
"8. Installment rate in percentage of disposable income \n",
"9. Personal status (married, single,...) and sex \n",
"10. Other debtors / guarantors \n",
"11. Present residence since X years \n",
"12. Property (e.g. real estate) \n",
"13. Age in years \n",
"14. Other installment plans (banks, stores) \n",
"15. Housing (rent, own,...) \n",
"16. Number of existing credits at this bank \n",
"17. Job \n",
"18. Number of people being liable to provide maintenance for \n",
"19. Telephone (yes,no) \n",
"20. Foreign worker (yes,no)\n",
"\n",
"Downloaded from openml.org.\n"
]
}
],
"source": [
"# Show the description text bundled with the fetched dataset.\n",
"print(credit_data['DESCR'])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>checking_status</th>\n",
" <th>duration</th>\n",
" <th>credit_history</th>\n",
" <th>purpose</th>\n",
" <th>credit_amount</th>\n",
" <th>savings_status</th>\n",
" <th>employment</th>\n",
" <th>installment_commitment</th>\n",
" <th>personal_status</th>\n",
" <th>other_parties</th>\n",
" <th>residence_since</th>\n",
" <th>property_magnitude</th>\n",
" <th>age</th>\n",
" <th>other_payment_plans</th>\n",
" <th>housing</th>\n",
" <th>existing_credits</th>\n",
" <th>job</th>\n",
" <th>num_dependents</th>\n",
" <th>own_telephone</th>\n",
" <th>foreign_worker</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0.0</td>\n",
" <td>6.0</td>\n",
" <td>4.0</td>\n",
" <td>3.0</td>\n",
" <td>1169.0</td>\n",
" <td>4.0</td>\n",
" <td>4.0</td>\n",
" <td>4.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>67.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1.0</td>\n",
" <td>48.0</td>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" <td>5951.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>22.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" <td>12.0</td>\n",
" <td>4.0</td>\n",
" <td>6.0</td>\n",
" <td>2096.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>49.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>42.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>7882.0</td>\n",
" <td>0.0</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>4.0</td>\n",
" <td>1.0</td>\n",
" <td>45.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.0</td>\n",
" <td>24.0</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>4870.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>3.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>4.0</td>\n",
" <td>3.0</td>\n",
" <td>53.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" checking_status duration credit_history purpose credit_amount \\\n",
"0 0.0 6.0 4.0 3.0 1169.0 \n",
"1 1.0 48.0 2.0 3.0 5951.0 \n",
"2 3.0 12.0 4.0 6.0 2096.0 \n",
"3 0.0 42.0 2.0 2.0 7882.0 \n",
"4 0.0 24.0 3.0 0.0 4870.0 \n",
"\n",
" savings_status employment installment_commitment personal_status \\\n",
"0 4.0 4.0 4.0 2.0 \n",
"1 0.0 2.0 2.0 1.0 \n",
"2 0.0 3.0 2.0 2.0 \n",
"3 0.0 3.0 2.0 2.0 \n",
"4 0.0 2.0 3.0 2.0 \n",
"\n",
" other_parties residence_since property_magnitude age \\\n",
"0 0.0 4.0 0.0 67.0 \n",
"1 0.0 2.0 0.0 22.0 \n",
"2 0.0 3.0 0.0 49.0 \n",
"3 2.0 4.0 1.0 45.0 \n",
"4 0.0 4.0 3.0 53.0 \n",
"\n",
" other_payment_plans housing existing_credits job num_dependents \\\n",
"0 2.0 1.0 2.0 2.0 1.0 \n",
"1 2.0 1.0 1.0 2.0 1.0 \n",
"2 2.0 1.0 1.0 1.0 2.0 \n",
"3 2.0 2.0 1.0 2.0 2.0 \n",
"4 2.0 2.0 2.0 2.0 2.0 \n",
"\n",
" own_telephone foreign_worker \n",
"0 1.0 0.0 \n",
"1 0.0 0.0 \n",
"2 0.0 0.0 \n",
"3 0.0 0.0 \n",
"4 0.0 0.0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build a DataFrame from the fetched data, labelling columns with the dataset's feature names.\n",
"credit_df = pd.DataFrame(\n",
"    data=credit_data.data,\n",
"    columns=credit_data.feature_names,\n",
")\n",
"credit_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Attach the target labels to the feature frame as a GOOD column.\n",
"credit_df = credit_df.assign(GOOD=credit_data.target)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th>col_0</th>\n",
" <th>bad</th>\n",
" <th>good</th>\n",
" <th>All</th>\n",
" </tr>\n",
" <tr>\n",
" <th>checking_status</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0.0</th>\n",
" <td>135</td>\n",
" <td>139</td>\n",
" <td>274</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1.0</th>\n",
" <td>105</td>\n",
" <td>164</td>\n",
" <td>269</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2.0</th>\n",
" <td>14</td>\n",
" <td>49</td>\n",
" <td>63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3.0</th>\n",
" <td>46</td>\n",
" <td>348</td>\n",
" <td>394</td>\n",
" </tr>\n",
" <tr>\n",
" <th>All</th>\n",
" <td>300</td>\n",
" <td>700</td>\n",
" <td>1000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"col_0 bad good All\n",
"checking_status \n",
"0.0 135 139 274\n",
"1.0 105 164 269\n",
"2.0 14 49 63\n",
"3.0 46 348 394\n",
"All 300 700 1000"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Cross-tabulate checking_status against the target labels, including row/column totals.\n",
"pd.crosstab(\n",
"    index=credit_df['checking_status'],\n",
"    columns=credit_data.target,\n",
"    margins=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>checking_status_1.0</th>\n",
" <th>checking_status_2.0</th>\n",
" <th>checking_status_3.0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" checking_status_1.0 checking_status_2.0 checking_status_3.0\n",
"0 0 0 0\n",
"1 1 0 0\n",
"2 0 0 1\n",
"3 0 0 0\n",
"4 0 0 0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One-hot encode checking_status; drop_first=True omits the first level's indicator column.\n",
"X = pd.get_dummies(\n",
"    credit_df[['checking_status']],\n",
"    columns=['checking_status'],\n",
"    drop_first=True,\n",
")\n",
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 good\n",
"1 bad\n",
"2 good\n",
"3 good\n",
"4 bad\n",
"Name: GOOD, dtype: object"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first few GOOD labels.\n",
"credit_df.loc[:, 'GOOD'].head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Encode the label as an integer: 1 where GOOD is 'good', otherwise 0.\n",
"y = (credit_df['GOOD'] == 'good').astype(int)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment