Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save janduplessis883/a714b41bec7ae71ea3f67bc01ef4b9d2 to your computer and use it in GitHub Desktop.
Save janduplessis883/a714b41bec7ae71ea3f67bc01ef4b9d2 to your computer and use it in GitHub Desktop.
Framingham Cardiovascular Risk Logistic Regression
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b59bcc18",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "b19e76df2ef8436ea86e77b4be907443",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"HBox(children=(Button(description='⚡️Connect', style=ButtonStyle(button_color='#e3e3e3')), Button(description=…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/javascript": [
"\n",
" var code = \"an.search_code()\";\n",
" var cell = Jupyter.notebook.insert_cell_above('code');\n",
" cell.set_text(code);\n",
" cell.metadata.id = '9174b691-b218-4346-be0f-8e30b9306ceb';\n",
" "
],
"text/plain": [
"<IPython.core.display.Javascript object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from autonote import AutoNote\n",
"an = AutoNote()"
]
},
{
"cell_type": "markdown",
"id": "e597cfdf",
"metadata": {
"id": "295f83b3-7607-4608-a888-32b68a90ea1e"
},
"source": [
"<div class='alert' style='background-color: #1c1a1e; color: #f5f4f0; padding:26px 26px; border-radius:15px; font-size:40px;'><B>Framingham Cardiovascular Risk Prediction</B> </div><span style='color: #1c1a1e; padding:26px 26px; font-size:11px;'> Powered by <B>AutoNote</B></span><div style='margin:4px 26px; color:#1c1a1e; font-size:17px;'>\n",
"<ol>\n",
"<li><B>Problem statement</B>: A clear description of the problem the project aims to solve.</li><BR>\n",
"<li><B>Data source</B>: Information on where the data used in the project is obtained from.</li><BR>\n",
"<li><B>Libraries used</B>: A list of the Python libraries used in the project, with their versions and a brief explanation of each library's role.</li><BR>\n",
"<li><B>Exploratory Data Analysis (EDA)</B>: A summary of the initial findings from exploring the data.</li><BR>\n",
"<li><B>Preprocessing</B>: Steps taken to clean and prepare the data for model building.</li><BR>\n",
"<li><B>Model building</B>: An overview of the model used and the reasoning behind its selection.\n",
"<br> Accuracy = $\\frac{\\text{correct predictions}}{\\text{total predictions}}$, Precision = $\\frac{\\text{true positives}}{\\text{true positives + false positives}}$, Recall = $\\frac{\\text{true positives}}{\\text{true positives + false negatives}}$, <br>F1 = $2 \\times \\frac{\\text{precision} \\times \\text{recall}}{\\text{precision + recall}}$</li><BR><BR>\n",
"<li><B>Model evaluation</B>: Evaluation metrics used to assess the performance of the model and results of the evaluation.</li><BR>\n",
"<li><B>Conclusion</B>: A summary of the findings and recommendations for further work.</li>\n",
"</ol>\n",
"</div>"
]
},
{
"cell_type": "markdown",
"id": "8da15493",
"metadata": {
"id": "805cf86c-2474-4b5c-a03b-ad7e5c6d10ba"
},
"source": [
"# Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "db2eb6ba",
"metadata": {
"id": "faaf891e-2cc1-4608-801f-f805e3c2529c"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "abf07322",
"metadata": {
"id": "9fcb296f-6dd8-4ae3-b3f1-d977b5104521"
},
"outputs": [],
"source": [
"data = pd.read_csv('../jan-datasets/framingham.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "96c7a985",
"metadata": {
"id": "0d5a54b7-8950-4f51-b9d5-74b8518ccf9a"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>male</th>\n",
" <th>age</th>\n",
" <th>education</th>\n",
" <th>currentSmoker</th>\n",
" <th>cigsPerDay</th>\n",
" <th>BPMeds</th>\n",
" <th>prevalentStroke</th>\n",
" <th>prevalentHyp</th>\n",
" <th>diabetes</th>\n",
" <th>totChol</th>\n",
" <th>sysBP</th>\n",
" <th>diaBP</th>\n",
" <th>BMI</th>\n",
" <th>heartRate</th>\n",
" <th>glucose</th>\n",
" <th>TenYearCHD</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>39</td>\n",
" <td>4.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>195.0</td>\n",
" <td>106.0</td>\n",
" <td>70.0</td>\n",
" <td>26.97</td>\n",
" <td>80.0</td>\n",
" <td>77.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>46</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>250.0</td>\n",
" <td>121.0</td>\n",
" <td>81.0</td>\n",
" <td>28.73</td>\n",
" <td>95.0</td>\n",
" <td>76.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>48</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>20.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>245.0</td>\n",
" <td>127.5</td>\n",
" <td>80.0</td>\n",
" <td>25.34</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>61</td>\n",
" <td>3.0</td>\n",
" <td>1</td>\n",
" <td>30.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>225.0</td>\n",
" <td>150.0</td>\n",
" <td>95.0</td>\n",
" <td>28.58</td>\n",
" <td>65.0</td>\n",
" <td>103.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>46</td>\n",
" <td>3.0</td>\n",
" <td>1</td>\n",
" <td>23.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>285.0</td>\n",
" <td>130.0</td>\n",
" <td>84.0</td>\n",
" <td>23.10</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>0</td>\n",
" <td>43</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>228.0</td>\n",
" <td>180.0</td>\n",
" <td>110.0</td>\n",
" <td>30.30</td>\n",
" <td>77.0</td>\n",
" <td>99.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0</td>\n",
" <td>63</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>205.0</td>\n",
" <td>138.0</td>\n",
" <td>71.0</td>\n",
" <td>33.11</td>\n",
" <td>60.0</td>\n",
" <td>85.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>0</td>\n",
" <td>45</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>20.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>313.0</td>\n",
" <td>100.0</td>\n",
" <td>71.0</td>\n",
" <td>21.68</td>\n",
" <td>79.0</td>\n",
" <td>78.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>1</td>\n",
" <td>52</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>260.0</td>\n",
" <td>141.5</td>\n",
" <td>89.0</td>\n",
" <td>26.36</td>\n",
" <td>76.0</td>\n",
" <td>79.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>1</td>\n",
" <td>43</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>30.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>225.0</td>\n",
" <td>162.0</td>\n",
" <td>107.0</td>\n",
" <td>23.61</td>\n",
" <td>93.0</td>\n",
" <td>88.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>0</td>\n",
" <td>50</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>254.0</td>\n",
" <td>133.0</td>\n",
" <td>76.0</td>\n",
" <td>22.91</td>\n",
" <td>75.0</td>\n",
" <td>76.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>0</td>\n",
" <td>43</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>247.0</td>\n",
" <td>131.0</td>\n",
" <td>88.0</td>\n",
" <td>27.64</td>\n",
" <td>72.0</td>\n",
" <td>61.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>1</td>\n",
" <td>46</td>\n",
" <td>1.0</td>\n",
" <td>1</td>\n",
" <td>15.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>294.0</td>\n",
" <td>142.0</td>\n",
" <td>94.0</td>\n",
" <td>26.31</td>\n",
" <td>98.0</td>\n",
" <td>64.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>0</td>\n",
" <td>41</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>332.0</td>\n",
" <td>124.0</td>\n",
" <td>88.0</td>\n",
" <td>31.31</td>\n",
" <td>65.0</td>\n",
" <td>84.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>0</td>\n",
" <td>39</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>9.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>226.0</td>\n",
" <td>114.0</td>\n",
" <td>64.0</td>\n",
" <td>22.35</td>\n",
" <td>85.0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>0</td>\n",
" <td>38</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>20.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>221.0</td>\n",
" <td>140.0</td>\n",
" <td>90.0</td>\n",
" <td>21.35</td>\n",
" <td>95.0</td>\n",
" <td>70.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>1</td>\n",
" <td>48</td>\n",
" <td>3.0</td>\n",
" <td>1</td>\n",
" <td>10.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>232.0</td>\n",
" <td>138.0</td>\n",
" <td>90.0</td>\n",
" <td>22.37</td>\n",
" <td>64.0</td>\n",
" <td>72.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>0</td>\n",
" <td>46</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>20.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>291.0</td>\n",
" <td>112.0</td>\n",
" <td>78.0</td>\n",
" <td>23.38</td>\n",
" <td>80.0</td>\n",
" <td>89.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>0</td>\n",
" <td>38</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>5.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>195.0</td>\n",
" <td>122.0</td>\n",
" <td>84.5</td>\n",
" <td>23.24</td>\n",
" <td>75.0</td>\n",
" <td>78.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>1</td>\n",
" <td>41</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>195.0</td>\n",
" <td>139.0</td>\n",
" <td>88.0</td>\n",
" <td>26.88</td>\n",
" <td>85.0</td>\n",
" <td>65.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>0</td>\n",
" <td>42</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>30.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>190.0</td>\n",
" <td>108.0</td>\n",
" <td>70.5</td>\n",
" <td>21.59</td>\n",
" <td>72.0</td>\n",
" <td>85.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>0</td>\n",
" <td>43</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>185.0</td>\n",
" <td>123.5</td>\n",
" <td>77.5</td>\n",
" <td>29.89</td>\n",
" <td>70.0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>0</td>\n",
" <td>52</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>234.0</td>\n",
" <td>148.0</td>\n",
" <td>78.0</td>\n",
" <td>34.17</td>\n",
" <td>70.0</td>\n",
" <td>113.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>0</td>\n",
" <td>52</td>\n",
" <td>3.0</td>\n",
" <td>1</td>\n",
" <td>20.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>215.0</td>\n",
" <td>132.0</td>\n",
" <td>82.0</td>\n",
" <td>25.11</td>\n",
" <td>71.0</td>\n",
" <td>75.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>1</td>\n",
" <td>44</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>30.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>270.0</td>\n",
" <td>137.5</td>\n",
" <td>90.0</td>\n",
" <td>21.96</td>\n",
" <td>75.0</td>\n",
" <td>83.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>1</td>\n",
" <td>47</td>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" <td>20.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>294.0</td>\n",
" <td>102.0</td>\n",
" <td>68.0</td>\n",
" <td>24.18</td>\n",
" <td>62.0</td>\n",
" <td>66.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>0</td>\n",
" <td>60</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>260.0</td>\n",
" <td>110.0</td>\n",
" <td>72.5</td>\n",
" <td>26.59</td>\n",
" <td>65.0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>1</td>\n",
" <td>35</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" <td>20.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>225.0</td>\n",
" <td>132.0</td>\n",
" <td>91.0</td>\n",
" <td>26.09</td>\n",
" <td>73.0</td>\n",
" <td>83.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>0</td>\n",
" <td>61</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>272.0</td>\n",
" <td>182.0</td>\n",
" <td>121.0</td>\n",
" <td>32.80</td>\n",
" <td>85.0</td>\n",
" <td>65.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>0</td>\n",
" <td>60</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>247.0</td>\n",
" <td>130.0</td>\n",
" <td>88.0</td>\n",
" <td>30.36</td>\n",
" <td>72.0</td>\n",
" <td>74.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" male age education currentSmoker cigsPerDay BPMeds prevalentStroke \n",
"0 1 39 4.0 0 0.0 0.0 0 \\\n",
"1 0 46 2.0 0 0.0 0.0 0 \n",
"2 1 48 1.0 1 20.0 0.0 0 \n",
"3 0 61 3.0 1 30.0 0.0 0 \n",
"4 0 46 3.0 1 23.0 0.0 0 \n",
"5 0 43 2.0 0 0.0 0.0 0 \n",
"6 0 63 1.0 0 0.0 0.0 0 \n",
"7 0 45 2.0 1 20.0 0.0 0 \n",
"8 1 52 1.0 0 0.0 0.0 0 \n",
"9 1 43 1.0 1 30.0 0.0 0 \n",
"10 0 50 1.0 0 0.0 0.0 0 \n",
"11 0 43 2.0 0 0.0 0.0 0 \n",
"12 1 46 1.0 1 15.0 0.0 0 \n",
"13 0 41 3.0 0 0.0 1.0 0 \n",
"14 0 39 2.0 1 9.0 0.0 0 \n",
"15 0 38 2.0 1 20.0 0.0 0 \n",
"16 1 48 3.0 1 10.0 0.0 0 \n",
"17 0 46 2.0 1 20.0 0.0 0 \n",
"18 0 38 2.0 1 5.0 0.0 0 \n",
"19 1 41 2.0 0 0.0 0.0 0 \n",
"20 0 42 2.0 1 30.0 0.0 0 \n",
"21 0 43 1.0 0 0.0 0.0 0 \n",
"22 0 52 1.0 0 0.0 0.0 0 \n",
"23 0 52 3.0 1 20.0 0.0 0 \n",
"24 1 44 2.0 1 30.0 0.0 0 \n",
"25 1 47 4.0 1 20.0 0.0 0 \n",
"26 0 60 1.0 0 0.0 0.0 0 \n",
"27 1 35 2.0 1 20.0 0.0 0 \n",
"28 0 61 3.0 0 0.0 0.0 0 \n",
"29 0 60 1.0 0 0.0 0.0 0 \n",
"\n",
" prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose \n",
"0 0 0 195.0 106.0 70.0 26.97 80.0 77.0 \\\n",
"1 0 0 250.0 121.0 81.0 28.73 95.0 76.0 \n",
"2 0 0 245.0 127.5 80.0 25.34 75.0 70.0 \n",
"3 1 0 225.0 150.0 95.0 28.58 65.0 103.0 \n",
"4 0 0 285.0 130.0 84.0 23.10 85.0 85.0 \n",
"5 1 0 228.0 180.0 110.0 30.30 77.0 99.0 \n",
"6 0 0 205.0 138.0 71.0 33.11 60.0 85.0 \n",
"7 0 0 313.0 100.0 71.0 21.68 79.0 78.0 \n",
"8 1 0 260.0 141.5 89.0 26.36 76.0 79.0 \n",
"9 1 0 225.0 162.0 107.0 23.61 93.0 88.0 \n",
"10 0 0 254.0 133.0 76.0 22.91 75.0 76.0 \n",
"11 0 0 247.0 131.0 88.0 27.64 72.0 61.0 \n",
"12 1 0 294.0 142.0 94.0 26.31 98.0 64.0 \n",
"13 1 0 332.0 124.0 88.0 31.31 65.0 84.0 \n",
"14 0 0 226.0 114.0 64.0 22.35 85.0 NaN \n",
"15 1 0 221.0 140.0 90.0 21.35 95.0 70.0 \n",
"16 1 0 232.0 138.0 90.0 22.37 64.0 72.0 \n",
"17 0 0 291.0 112.0 78.0 23.38 80.0 89.0 \n",
"18 0 0 195.0 122.0 84.5 23.24 75.0 78.0 \n",
"19 0 0 195.0 139.0 88.0 26.88 85.0 65.0 \n",
"20 0 0 190.0 108.0 70.5 21.59 72.0 85.0 \n",
"21 0 0 185.0 123.5 77.5 29.89 70.0 NaN \n",
"22 0 0 234.0 148.0 78.0 34.17 70.0 113.0 \n",
"23 0 0 215.0 132.0 82.0 25.11 71.0 75.0 \n",
"24 1 0 270.0 137.5 90.0 21.96 75.0 83.0 \n",
"25 0 0 294.0 102.0 68.0 24.18 62.0 66.0 \n",
"26 0 0 260.0 110.0 72.5 26.59 65.0 NaN \n",
"27 1 0 225.0 132.0 91.0 26.09 73.0 83.0 \n",
"28 1 0 272.0 182.0 121.0 32.80 85.0 65.0 \n",
"29 0 0 247.0 130.0 88.0 30.36 72.0 74.0 \n",
"\n",
" TenYearCHD \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 1 \n",
"4 0 \n",
"5 0 \n",
"6 1 \n",
"7 0 \n",
"8 0 \n",
"9 0 \n",
"10 0 \n",
"11 0 \n",
"12 0 \n",
"13 0 \n",
"14 0 \n",
"15 1 \n",
"16 0 \n",
"17 1 \n",
"18 0 \n",
"19 0 \n",
"20 0 \n",
"21 0 \n",
"22 0 \n",
"23 0 \n",
"24 0 \n",
"25 1 \n",
"26 0 \n",
"27 0 \n",
"28 1 \n",
"29 0 "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head(30)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1767e15b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4238, 16)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.shape"
]
},
{
"cell_type": "markdown",
"id": "63a57088",
"metadata": {
"id": "bc3c763f-6bff-4244-9a0d-13aa2296024b"
},
"source": [
"# Explore and Clean the Data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "998a3fab",
"metadata": {
"id": "8f4d8817-5e74-4602-bb39-e2551d98ab11"
},
"outputs": [
{
"data": {
"text/plain": [
"male 0\n",
"age 0\n",
"education 105\n",
"currentSmoker 0\n",
"cigsPerDay 29\n",
"BPMeds 53\n",
"prevalentStroke 0\n",
"prevalentHyp 0\n",
"diabetes 0\n",
"totChol 50\n",
"sysBP 0\n",
"diaBP 0\n",
"BMI 19\n",
"heartRate 1\n",
"glucose 388\n",
"TenYearCHD 0\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check for missing values\n",
"data.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "69d37309",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[<Axes: title={'center': 'glucose'}>,\n",
" <Axes: title={'center': 'education'}>],\n",
" [<Axes: title={'center': 'BPMeds'}>,\n",
" <Axes: title={'center': 'totChol'}>]], dtype=object)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 4 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"data[['glucose', 'education', 'BPMeds', 'totChol']].hist()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "cab7edf7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',\n",
" 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',\n",
" 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],\n",
" dtype='object')"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.columns"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "45601e56",
"metadata": {},
"outputs": [],
"source": [
"data['BPMeds'] = data['BPMeds'].fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "f4aed2a7",
"metadata": {},
"outputs": [],
"source": [
"missing_values = ['education','cigsPerDay', 'totChol','BMI', 'heartRate', 'glucose']"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "6a8fa7a0",
"metadata": {
"id": "f2eff417-d5bd-4515-b7d0-f62797042b58"
},
"outputs": [],
"source": [
"from sklearn.impute import SimpleImputer\n",
"imputer = SimpleImputer(strategy='mean')\n",
"imputed_data = imputer.fit_transform(data)\n",
"imputed_df = pd.DataFrame(imputed_data, columns=data.columns)\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "819c4057",
"metadata": {
"id": "b14373b0-8452-413f-84fe-985e53ad4ae0"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>male</th>\n",
" <th>age</th>\n",
" <th>education</th>\n",
" <th>currentSmoker</th>\n",
" <th>cigsPerDay</th>\n",
" <th>BPMeds</th>\n",
" <th>prevalentStroke</th>\n",
" <th>prevalentHyp</th>\n",
" <th>diabetes</th>\n",
" <th>totChol</th>\n",
" <th>sysBP</th>\n",
" <th>diaBP</th>\n",
" <th>BMI</th>\n",
" <th>heartRate</th>\n",
" <th>glucose</th>\n",
" <th>TenYearCHD</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>39.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>195.0</td>\n",
" <td>106.0</td>\n",
" <td>70.0</td>\n",
" <td>26.97</td>\n",
" <td>80.0</td>\n",
" <td>77.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>46.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>250.0</td>\n",
" <td>121.0</td>\n",
" <td>81.0</td>\n",
" <td>28.73</td>\n",
" <td>95.0</td>\n",
" <td>76.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>48.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>20.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>245.0</td>\n",
" <td>127.5</td>\n",
" <td>80.0</td>\n",
" <td>25.34</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0.0</td>\n",
" <td>61.0</td>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>30.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>225.0</td>\n",
" <td>150.0</td>\n",
" <td>95.0</td>\n",
" <td>28.58</td>\n",
" <td>65.0</td>\n",
" <td>103.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0.0</td>\n",
" <td>46.0</td>\n",
" <td>3.0</td>\n",
" <td>1.0</td>\n",
" <td>23.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>285.0</td>\n",
" <td>130.0</td>\n",
" <td>84.0</td>\n",
" <td>23.10</td>\n",
" <td>85.0</td>\n",
" <td>85.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" male age education currentSmoker cigsPerDay BPMeds prevalentStroke \n",
"0 1.0 39.0 4.0 0.0 0.0 0.0 0.0 \\\n",
"1 0.0 46.0 2.0 0.0 0.0 0.0 0.0 \n",
"2 1.0 48.0 1.0 1.0 20.0 0.0 0.0 \n",
"3 0.0 61.0 3.0 1.0 30.0 0.0 0.0 \n",
"4 0.0 46.0 3.0 1.0 23.0 0.0 0.0 \n",
"\n",
" prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose \n",
"0 0.0 0.0 195.0 106.0 70.0 26.97 80.0 77.0 \\\n",
"1 0.0 0.0 250.0 121.0 81.0 28.73 95.0 76.0 \n",
"2 0.0 0.0 245.0 127.5 80.0 25.34 75.0 70.0 \n",
"3 1.0 0.0 225.0 150.0 95.0 28.58 65.0 103.0 \n",
"4 0.0 0.0 285.0 130.0 84.0 23.10 85.0 85.0 \n",
"\n",
" TenYearCHD \n",
"0 0.0 \n",
"1 0.0 \n",
"2 0.0 \n",
"3 1.0 \n",
"4 0.0 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"imputed_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "a572ba39",
"metadata": {},
"outputs": [],
"source": [
"data = imputed_df "
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "1cfbfe8e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"male 0\n",
"age 0\n",
"education 0\n",
"currentSmoker 0\n",
"cigsPerDay 0\n",
"BPMeds 0\n",
"prevalentStroke 0\n",
"prevalentHyp 0\n",
"diabetes 0\n",
"totChol 0\n",
"sysBP 0\n",
"diaBP 0\n",
"BMI 0\n",
"heartRate 0\n",
"glucose 0\n",
"TenYearCHD 0\n",
"dtype: int64"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "6151ffac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: ylabel='Count'>"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.histplot(data)"
]
},
{
"cell_type": "markdown",
"id": "dc56864d",
"metadata": {
"id": "c1fe4a31-b2ce-4339-bada-99084d1183e4"
},
"source": [
"# Feature Engineering"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "1fa8eede",
"metadata": {
"id": "5dde5da5-ad72-4bb7-9490-3d3251e2e7ed"
},
"outputs": [],
"source": [
"# Flag heavy smokers: 1 if the patient smokes 20 or more cigarettes per day, else 0\n",
"data['heavy_smoker'] = (data['cigsPerDay']>= 20).astype(int)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "176b069d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>male</th>\n",
" <th>age</th>\n",
" <th>education</th>\n",
" <th>currentSmoker</th>\n",
" <th>cigsPerDay</th>\n",
" <th>BPMeds</th>\n",
" <th>prevalentStroke</th>\n",
" <th>prevalentHyp</th>\n",
" <th>diabetes</th>\n",
" <th>totChol</th>\n",
" <th>sysBP</th>\n",
" <th>diaBP</th>\n",
" <th>BMI</th>\n",
" <th>heartRate</th>\n",
" <th>glucose</th>\n",
" <th>TenYearCHD</th>\n",
" <th>heavy_smoker</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>39.0</td>\n",
" <td>4.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>195.0</td>\n",
" <td>106.0</td>\n",
" <td>70.0</td>\n",
" <td>26.97</td>\n",
" <td>80.0</td>\n",
" <td>77.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.0</td>\n",
" <td>46.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>250.0</td>\n",
" <td>121.0</td>\n",
" <td>81.0</td>\n",
" <td>28.73</td>\n",
" <td>95.0</td>\n",
" <td>76.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1.0</td>\n",
" <td>48.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>20.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>245.0</td>\n",
" <td>127.5</td>\n",
" <td>80.0</td>\n",
" <td>25.34</td>\n",
" <td>75.0</td>\n",
" <td>70.0</td>\n",
" <td>0.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" male age education currentSmoker cigsPerDay BPMeds prevalentStroke \n",
"0 1.0 39.0 4.0 0.0 0.0 0.0 0.0 \\\n",
"1 0.0 46.0 2.0 0.0 0.0 0.0 0.0 \n",
"2 1.0 48.0 1.0 1.0 20.0 0.0 0.0 \n",
"\n",
" prevalentHyp diabetes totChol sysBP diaBP BMI heartRate glucose \n",
"0 0.0 0.0 195.0 106.0 70.0 26.97 80.0 77.0 \\\n",
"1 0.0 0.0 250.0 121.0 81.0 28.73 95.0 76.0 \n",
"2 0.0 0.0 245.0 127.5 80.0 25.34 75.0 70.0 \n",
"\n",
" TenYearCHD heavy_smoker \n",
"0 0.0 0 \n",
"1 0.0 0 \n",
"2 0.0 1 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head(3)"
]
},
{
"cell_type": "markdown",
"id": "86a55754",
"metadata": {
"id": "eea560c6-3d20-4cbe-b3b2-226be23314bb"
},
"source": [
"# Plot a Learning Curve"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "321d58af",
"metadata": {
"id": "938f8a95-3ee9-4938-b278-c1f8d23139b1"
},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.model_selection import learning_curve\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "6e4dffe1",
"metadata": {
"id": "e16a9cdc-b577-4049-a2dd-dea1082aabef"
},
"outputs": [],
"source": [
"def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(0.1, 1.0, 5)):\n",
" plt.figure()\n",
" plt.title(title)\n",
" if ylim is not None:\n",
" plt.ylim(*ylim)\n",
" plt.xlabel(\"Training examples\")\n",
" plt.ylabel(\"Score\")\n",
" train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)\n",
" train_scores_mean = np.mean(train_scores, axis=1)\n",
" train_scores_std = np.std(train_scores, axis=1)\n",
" test_scores_mean = np.mean(test_scores, axis=1)\n",
" test_scores_std = np.std(test_scores, axis=1)\n",
" plt.grid()\n",
"\n",
" plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color=\"r\")\n",
" plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color=\"g\")\n",
" plt.plot(train_sizes, train_scores_mean, 'o-', color=\"r\", label=\"Training score\")\n",
" plt.plot(train_sizes, test_scores_mean, 'o-', color=\"g\", label=\"Cross-validation score\")\n",
"\n",
" plt.legend(loc=\"best\")\n",
" return plt"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "47161f78",
"metadata": {
"id": "c567d405-f6a7-477b-b7c4-67c187059b4d"
},
"outputs": [],
"source": [
"# Define the target variable and feature matrix\n",
"X = data.drop('TenYearCHD', axis=1)\n",
"y = data['TenYearCHD']\n",
"\n",
"# Split the dataset into training and testing sets\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "1448869c",
"metadata": {
"id": "b728e88f-e009-4bcf-968d-a135e4878111"
},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"logreg = LogisticRegression(solver='liblinear', random_state=42)\n",
"title = \"Learning Curves (Logistic Regression)\"\n",
"cv = 5 # Number of cross-validation folds\n",
"plot_learning_curve(logreg, title, X_train, y_train, cv=cv, n_jobs=1)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "d96eae71",
"metadata": {},
"source": [
"## Scale the data and repeat the Learning Curve"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "fd725a41",
"metadata": {
"id": "9c371f32-2298-4e78-b47c-ad81fad42474"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Original DataFrame:\n",
" male age education currentSmoker cigsPerDay BPMeds \n",
"0 1.0 39.0 4.0 0.0 0.0 0.0 \\\n",
"1 0.0 46.0 2.0 0.0 0.0 0.0 \n",
"2 1.0 48.0 1.0 1.0 20.0 0.0 \n",
"3 0.0 61.0 3.0 1.0 30.0 0.0 \n",
"4 0.0 46.0 3.0 1.0 23.0 0.0 \n",
"... ... ... ... ... ... ... \n",
"4233 1.0 50.0 1.0 1.0 1.0 0.0 \n",
"4234 1.0 51.0 3.0 1.0 43.0 0.0 \n",
"4235 0.0 48.0 2.0 1.0 20.0 0.0 \n",
"4236 0.0 44.0 1.0 1.0 15.0 0.0 \n",
"4237 0.0 52.0 2.0 0.0 0.0 0.0 \n",
"\n",
" prevalentStroke prevalentHyp diabetes totChol sysBP diaBP BMI \n",
"0 0.0 0.0 0.0 195.0 106.0 70.0 26.97 \\\n",
"1 0.0 0.0 0.0 250.0 121.0 81.0 28.73 \n",
"2 0.0 0.0 0.0 245.0 127.5 80.0 25.34 \n",
"3 0.0 1.0 0.0 225.0 150.0 95.0 28.58 \n",
"4 0.0 0.0 0.0 285.0 130.0 84.0 23.10 \n",
"... ... ... ... ... ... ... ... \n",
"4233 0.0 1.0 0.0 313.0 179.0 92.0 25.97 \n",
"4234 0.0 0.0 0.0 207.0 126.5 80.0 19.71 \n",
"4235 0.0 0.0 0.0 248.0 131.0 72.0 22.00 \n",
"4236 0.0 0.0 0.0 210.0 126.5 87.0 19.16 \n",
"4237 0.0 0.0 0.0 269.0 133.5 83.0 21.47 \n",
"\n",
" heartRate glucose TenYearCHD heavy_smoker \n",
"0 80.0 77.000000 0.0 0 \n",
"1 95.0 76.000000 0.0 0 \n",
"2 75.0 70.000000 0.0 1 \n",
"3 65.0 103.000000 1.0 1 \n",
"4 85.0 85.000000 0.0 1 \n",
"... ... ... ... ... \n",
"4233 66.0 86.000000 1.0 0 \n",
"4234 65.0 68.000000 0.0 1 \n",
"4235 84.0 86.000000 0.0 1 \n",
"4236 86.0 81.966753 0.0 0 \n",
"4237 80.0 107.000000 0.0 0 \n",
"\n",
"[4238 rows x 17 columns]\n",
"\n",
"Scaled DataFrame:\n",
" male age education currentSmoker cigsPerDay BPMeds \n",
"0 1.0 0.184211 1.000000 0.0 0.000000 0.0 \\\n",
"1 0.0 0.368421 0.333333 0.0 0.000000 0.0 \n",
"2 1.0 0.421053 0.000000 1.0 0.285714 0.0 \n",
"3 0.0 0.763158 0.666667 1.0 0.428571 0.0 \n",
"4 0.0 0.368421 0.666667 1.0 0.328571 0.0 \n",
"... ... ... ... ... ... ... \n",
"4233 1.0 0.473684 0.000000 1.0 0.014286 0.0 \n",
"4234 1.0 0.500000 0.666667 1.0 0.614286 0.0 \n",
"4235 0.0 0.421053 0.333333 1.0 0.285714 0.0 \n",
"4236 0.0 0.315789 0.000000 1.0 0.214286 0.0 \n",
"4237 0.0 0.526316 0.333333 0.0 0.000000 0.0 \n",
"\n",
" prevalentStroke prevalentHyp diabetes totChol sysBP diaBP \n",
"0 0.0 0.0 0.0 0.149406 0.106383 0.232804 \\\n",
"1 0.0 0.0 0.0 0.242784 0.177305 0.349206 \n",
"2 0.0 0.0 0.0 0.234295 0.208038 0.338624 \n",
"3 0.0 1.0 0.0 0.200340 0.314421 0.497354 \n",
"4 0.0 0.0 0.0 0.302207 0.219858 0.380952 \n",
"... ... ... ... ... ... ... \n",
"4233 0.0 1.0 0.0 0.349745 0.451537 0.465608 \n",
"4234 0.0 0.0 0.0 0.169779 0.203310 0.338624 \n",
"4235 0.0 0.0 0.0 0.239389 0.224586 0.253968 \n",
"4236 0.0 0.0 0.0 0.174873 0.203310 0.412698 \n",
"4237 0.0 0.0 0.0 0.275042 0.236407 0.370370 \n",
"\n",
" BMI heartRate glucose TenYearCHD heavy_smoker \n",
"0 0.277024 0.363636 0.104520 0.0 0.0 \n",
"1 0.319680 0.515152 0.101695 0.0 0.0 \n",
"2 0.237518 0.313131 0.084746 0.0 1.0 \n",
"3 0.316045 0.212121 0.177966 1.0 1.0 \n",
"4 0.183228 0.414141 0.127119 0.0 1.0 \n",
"... ... ... ... ... ... \n",
"4233 0.252787 0.222222 0.129944 1.0 0.0 \n",
"4234 0.101066 0.212121 0.079096 0.0 1.0 \n",
"4235 0.156568 0.404040 0.129944 0.0 1.0 \n",
"4236 0.087736 0.424242 0.118550 0.0 0.0 \n",
"4237 0.143723 0.363636 0.189266 0.0 0.0 \n",
"\n",
"[4238 rows x 17 columns]\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# Get the column headers\n",
"column_headers = data.columns\n",
"\n",
"# Initialize the MinMaxScaler\n",
"scaler = MinMaxScaler()\n",
"\n",
"# Fit the scaler to the data and transform the data\n",
"scaled_values = scaler.fit_transform(data)\n",
"\n",
"# Convert the transformed data back to a DataFrame, preserving the column headers\n",
"scaled_data = pd.DataFrame(scaled_values, columns=column_headers)\n",
"\n",
"print(\"Original DataFrame:\")\n",
"print(data)\n",
"print(\"\\nScaled DataFrame:\")\n",
"print(scaled_data)\n"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "80673518",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Axes: ylabel='Count'>"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.histplot(scaled_data)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "5342a37b",
"metadata": {},
"outputs": [],
"source": [
"data = scaled_data"
]
},
{
"cell_type": "markdown",
"id": "97f0a373",
"metadata": {
"id": "64f83ef0-6502-449c-925a-b2885b23df6d"
},
"source": [
"# Prepare data for modeling "
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "b03ece40",
"metadata": {
"id": "aa465aa8-9b73-456f-81f0-9458e525647a"
},
"outputs": [],
"source": [
"# Define the target variable and feature matrix\n",
"X = data.drop('TenYearCHD', axis=1)\n",
"y = data['TenYearCHD']"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2282362f",
"metadata": {
"id": "aa465aa8-9b73-456f-81f0-9458e525647a"
},
"outputs": [],
"source": [
"# Split the dataset into training and testing sets\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)"
]
},
{
"cell_type": "markdown",
"id": "5a4936e3",
"metadata": {
"id": "a1d2e92a-dfa7-414b-95e4-24aa8f1a622e"
},
"source": [
"# Train the Model"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "671888c3",
"metadata": {
"id": "3b67313d-b1d3-4879-a766-11ade4010958"
},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: \"▸\";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: \"▾\";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: \"\";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: \"\";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>LogisticRegression(random_state=42, solver=&#x27;liblinear&#x27;)</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label sk-toggleable__label-arrow\">LogisticRegression</label><div class=\"sk-toggleable__content\"><pre>LogisticRegression(random_state=42, solver=&#x27;liblinear&#x27;)</pre></div></div></div></div></div>"
],
"text/plain": [
"LogisticRegression(random_state=42, solver='liblinear')"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"logreg = LogisticRegression(solver='liblinear', random_state=42)\n",
"logreg.fit(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"id": "2d10d5b6",
"metadata": {
"id": "bd839a48-c1df-432f-893b-e7cffe02b9e1"
},
"source": [
"# Make Predictions"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "0722b753",
"metadata": {
"id": "a9ae23de-6584-4878-8925-257368066ae3"
},
"outputs": [],
"source": [
"y_pred = logreg.predict(X_test)"
]
},
{
"cell_type": "markdown",
"id": "ad52a9e8",
"metadata": {
"id": "19b82267-d17a-4e60-9ba1-b1ffc618c08e"
},
"source": [
"# Evaluate the Model"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "003874e7",
"metadata": {
"id": "6a94fb2b-b003-4cbd-acdd-fa17d45cf978"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" precision recall f1-score support\n",
"\n",
" 0.0 0.86 0.99 0.92 724\n",
" 1.0 0.60 0.07 0.13 124\n",
"\n",
" accuracy 0.86 848\n",
" macro avg 0.73 0.53 0.53 848\n",
"weighted avg 0.82 0.86 0.81 848\n",
"\n",
"Accuracy: 0.8573113207547169\n"
]
},
{
"data": {
"image/png": "",
"text/plain": [
"<Figure size 640x480 with 2 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Classification report\n",
"print(classification_report(y_test, y_pred))\n",
"\n",
"# Confusion matrix\n",
"conf_matrix = confusion_matrix(y_test, y_pred)\n",
"sns.heatmap(conf_matrix, annot=True, cmap=\"YlGnBu\", fmt=\"d\")\n",
"\n",
"# Accuracy score\n",
"accuracy = accuracy_score(y_test, y_pred)\n",
"print('Accuracy:', accuracy)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"toc": {
"base_numbering": 1,
"nav_menu": {},
"number_sections": true,
"sideBar": true,
"skip_h1_title": false,
"title_cell": "Table of Contents",
"title_sidebar": "Contents",
"toc_cell": false,
"toc_position": {},
"toc_section_display": true,
"toc_window_display": false
},
"varInspector": {
"cols": {
"lenName": 16,
"lenType": 16,
"lenVar": 40
},
"kernels_config": {
"python": {
"delete_cmd_postfix": "",
"delete_cmd_prefix": "del ",
"library": "var_list.py",
"varRefreshCmd": "print(var_dic_list())"
},
"r": {
"delete_cmd_postfix": ") ",
"delete_cmd_prefix": "rm(",
"library": "var_list.r",
"varRefreshCmd": "cat(var_dic_list()) "
}
},
"types_to_exclude": [
"module",
"function",
"builtin_function_or_method",
"instance",
"_Feature"
],
"window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment