{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from __future__ import division\n",
"import pandas\n",
"import csv\n",
"import numpy\n",
"from sklearn import linear_model, cross_validation, preprocessing, ensemble"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Here I load each file separately. This ensures we know precisely what data we are working with. Alternatively, especially if we had\n",
"#many files to read from, we might read all csv files in a directory (a sketch follows in the next cell). Here, I loaded the files locally; we could also read them directly from the web\n",
"data_md = pandas.read_csv(\"Desktop/Population_Health_Measures__Age-Adjusted_Mortality_Rates.csv\")\n",
"data_cdc = pandas.read_csv(\"Desktop/mortality_underlying_and_multiple_2057_35606388919559.csv\", skiprows = 6, encoding = \"ISO-8859-1\")"
]
},
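{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#A minimal sketch of the bulk-loading alternative mentioned above: read every csv in a directory into a dict of\n",
"#DataFrames keyed by filename. The directory name Desktop simply mirrors the paths used above and is an assumption;\n",
"#per-file options (skiprows, encoding) would still need to be handled file by file.\n",
"import glob\n",
"import os\n",
"frames = {}\n",
"for path in glob.glob(os.path.join(\"Desktop\", \"*.csv\")):\n",
"    frames[os.path.basename(path)] = pandas.read_csv(path)\n",
"frames.keys()"
]
},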
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#drop the first row, which contains only a cause-of-death value\n",
"data_cdc.drop(data_cdc.index[0], inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#exploratory data analysis\n",
"data_md"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print \"Unique Deaths:\", len(numpy.unique(data_md['Cause of Death'].values)) #54\n",
"print \"Unique Races:\", len(numpy.unique(data_md['Race'].values))\n",
"print \"Unique Genders:\", len(numpy.unique(data_md['Gender'].values))\n",
"print numpy.unique(data_md['Cause of Death'].values)\n",
"print numpy.unique(data_md['Race'].values)\n",
"print numpy.unique(data_md['Gender'].values)\n",
"print numpy.unique(data_md['3-Year Period'].values) #['2006-2008' '2007-2009' '2008-2010' '2009-2011' '2010-2012']\n",
"#the data consists of:\n",
"    #cause of death = 54 unique values\n",
"    #race = 5 unique values; 'American Indian/Alaska Native', 'Asian/Pacific Islander', 'Black', 'Total', 'White'\n",
"    #gender = the unique values printed above"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Some simple aggregation by race\n",
"data_md.groupby('Race').mean()\n",
"#These results indicate we do not have much information on American Indian/Alaska Native in this dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#what are the most prevalent causes of death?\n",
"data_md.groupby('Cause of Death').mean().sort_values('Age-Adjusted Rate per 100,000 Population', ascending = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#here we have cause of death, sorted by Age-Adjusted Rate per 100,000 Population, grouped by race\n",
"death_by_race = data_md.groupby(['Race', 'Cause of Death']).mean().sort_values('Age-Adjusted Rate per 100,000 Population', ascending = False) \\\n",
"    .sortlevel(sort_remaining = False)\n",
"death_by_race"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#here we can see the top-5 diseases by race\n",
"for name in [u'American Indian/Alaska Native', u'Asian/Pacific Islander', u'Black', u'Total', u'White']:\n",
"    print death_by_race.loc[[name]]['Age-Adjusted Rate per 100,000 Population'].nlargest(5)\n",
"#this shows that we do not have cause of death for American Indian/Alaska Native segmented by specific disease, confirming our suspicion above\n",
"\n",
"#cardiovascular diseases and malignant neoplasms are the primary killers across racial classifications\n",
"\n",
"#influenza and pneumonia appears uniquely in the top 5 for Asian/Pacific Islander; diabetes mellitus is unique to the top 5 for Black"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_cdc\n",
"#I didn't have a chance to work on the cdc data. Looking at the top killers by age group is the first analysis I would run (a rough sketch follows in the next cell)"
]
},
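{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#A rough sketch of the age-group analysis described above, mirroring the top-5-by-race loop used earlier. The\n",
"#column names 'Ten-Year Age Groups', 'Cause of death', and 'Deaths' are assumptions about this CDC export (and\n",
"#Deaths is assumed numeric); they should be checked against data_cdc.columns before running.\n",
"deaths_by_age = data_cdc.groupby(['Ten-Year Age Groups', 'Cause of death'])['Deaths'].sum()\n",
"for age_group in deaths_by_age.index.levels[0]:\n",
"    print deaths_by_age.loc[[age_group]].nlargest(5)"
]
},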
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pylab\n",
"data_cdc.hist()\n",
"pylab.show()\n",
"#the data consists of causes of death and age"
]
},
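{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#The cells below use data_titanic, but the notebook as saved never loads it. A minimal sketch of the missing load\n",
"#step, assuming the standard Kaggle Titanic training file saved locally; the path Desktop/train.csv is a placeholder\n",
"#and should be adjusted to wherever the file actually lives.\n",
"data_titanic = pandas.read_csv(\"Desktop/train.csv\")"
]
},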
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_titanic"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_titanic.count()\n",
"#we have no missing values for Survived"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#For this problem we have significant domain knowledge. Based on what we know of the event, we might expect that age, sex, and\n",
"# passenger class (Pclass) would be strong predictors of survival. Let's start by looking at the priors\n",
"survived = data_titanic['Survived'].sum()\n",
"total_passengers = len(data_titanic['Survived'])\n",
"print \"Number of survivors\", survived\n",
"print \"Number of passengers\", total_passengers\n",
"print \"Fraction survived\", survived / total_passengers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Now, let's group by the above features and check rates of survival\n",
"data_titanic.groupby(['Sex'])['Survived'].mean()\n",
"#gender is a major predictor of survival"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#when we additionally include the class of the passenger we get further separation in survival rates\n",
"data_titanic.groupby(['Sex', 'Pclass'])['Survived'].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#from the above, we could predict survival for each passenger using these group rates based on their sex and Pclass data\n",
"# (a baseline sketch follows in the next cell). But we can do better with an actual machine learning approach"
]
},
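{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#A minimal sketch of that rule-based baseline: predict survival whenever the observed survival rate of a passenger's\n",
"#(Sex, Pclass) group exceeds 0.5, and score it on the same data. This just quantifies how far the group rates alone\n",
"#get us before fitting a model.\n",
"group_rates = data_titanic.groupby(['Sex', 'Pclass'])['Survived'].mean()\n",
"baseline_pred = data_titanic.apply(lambda row: int(group_rates[(row['Sex'], row['Pclass'])] > 0.5), axis = 1)\n",
"print \"Baseline accuracy:\", (baseline_pred == data_titanic['Survived']).mean()"
]
},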
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Now, let us try a machine learning model. We could start with logistic regression for this binary outcome\n",
"target = data_titanic['Survived']\n",
"target\n",
"data_titanic.drop('Survived', axis = 1, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#some of our features are categorical.\n",
"data_titanic.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#let's identify the categorical features so we can label encode them (an aside on one-hot encoding follows in the next cell)\n",
"non_numeric = list()\n",
"dtypes = data_titanic.dtypes\n",
"for index, elem in enumerate(dtypes):\n",
"    if elem == 'object':\n",
"        non_numeric.append(dtypes.index[index])\n",
"non_numeric"
]
},
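{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#An aside: for a linear model, one-hot encoding is often preferable to label encoding, since label encoding imposes\n",
"#an arbitrary integer order on the categories. A minimal sketch, kept in a new variable (data_titanic_onehot is a\n",
"#name introduced here) so the cells below are unaffected; note it would also explode high-cardinality columns such\n",
"#as Name and Ticket.\n",
"data_titanic_onehot = pandas.get_dummies(data_titanic, columns = non_numeric)\n",
"data_titanic_onehot.shape"
]
},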
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"le = preprocessing.LabelEncoder()\n",
"for elem in non_numeric:\n",
"    data_titanic[elem] = le.fit_transform(data_titanic[elem]) # generates a FutureWarning"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#using cross validation, let's check our model. Here, we fill missing values with the median for that variable\n",
"model = linear_model.LogisticRegression()\n",
"scores = cross_validation.cross_val_score(model, data_titanic.fillna(data_titanic.median()), target, cv = 5, scoring = 'accuracy')\n",
"print \"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#the above model generated 79.35% accurate predictions. Let's try a random forest. Here, we fill missing values with -999, a common\n",
"# choice for decision tree based models\n",
"model = ensemble.RandomForestClassifier(n_estimators = 100)\n",
"scores = cross_validation.cross_val_score(model, data_titanic.fillna(-999), target, cv = 5, scoring = 'accuracy')\n",
"print \"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#the random forest did a little better than logistic regression. Let's try one more model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = ensemble.ExtraTreesClassifier(n_estimators = 100)\n",
"scores = cross_validation.cross_val_score(model, data_titanic.fillna(-999), target, cv = 5, scoring = 'accuracy')\n",
"print \"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#a little worse than the random forest on this dataset"
]
},
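{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#As a quick sanity check on the earlier intuition about sex and class, a sketch that fits the random forest once on\n",
"#the full (filled) data and lists its feature importances. This is illustrative only; the cross-validated scores\n",
"#above remain the headline numbers.\n",
"model = ensemble.RandomForestClassifier(n_estimators = 100)\n",
"model.fit(data_titanic.fillna(-999), target)\n",
"pandas.Series(model.feature_importances_, index = data_titanic.columns).sort_values(ascending = False)"
]
}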
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}