Skip to content

Instantly share code, notes, and snippets.

@VeylanSolmira
Created May 22, 2016 11:04
Show Gist options
  • Save VeylanSolmira/48e67648cfbf73fd399c5aefe75fd039 to your computer and use it in GitHub Desktop.
Save VeylanSolmira/48e67648cfbf73fd399c5aefe75fd039 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas\n",
"import csv\n",
"import numpy\n",
"from __future__ import division\n",
"from sklearn import linear_model, cross_validation, preprocessing, ensemble"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Here I load each file separately. This ensures we know precisely what data we are working with. Alternatively, especially if we had\n",
"#many files to read from, we might read all csv files in a directory. Here, I loaded locally. Could load directly web\n",
"data_md = pandas.read_csv(\"Desktop/Population_Health_Measures__Age-Adjusted_Mortality_Rates.csv\")\n",
"data_cdc = pandas.read_csv(\"Desktop/mortality_underlying_and_multiple_2057_35606388919559.csv\", skiprows = 6, encoding = \"ISO-8859-1\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#delete row with only cause value\n",
"data_cdc.drop(data_cdc.index[0], inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#exploratory data analysis\n",
"data_md"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print \"Unique Deaths:\", len(numpy.unique(data_md['Cause of Death'].values)) #54\n",
"print \"Unique Races:\", len(numpy.unique(data_md['Race'].values))\n",
"print \"Unique Genders:\", len(numpy.unique(data_md['Gender'].values))\n",
"print numpy.unique(data_md['Cause of Death'].values)\n",
"print numpy.unique(data_md['Race'].values)\n",
"print numpy.unique(data_md['Race'].values)\n",
"print numpy.unique(data_md['3-Year Period'].values) #['2006-2008' '2007-2009' '2008-2010' '2009-2011' '2010-2012']\n",
"#data consists:\n",
" #of causes of death = 54 unique values\n",
" #race = 5 unique values; 'American Indian/Alaska Native', 'Asian/Pacific Islander', 'Black', 'Total', 'White'\n",
" #gender"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Some simple aggregation by race\n",
"data_md.groupby('Race').mean()\n",
"#These results indicate we do not have much information on American Indian/Alaska Native in this dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#what are the most prevalent causes of death\n",
"data_md.groupby('Cause of Death').mean().sort_values('Age-Adjusted Rate per 100,000 Population', ascending = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#here we have cause of death, sorted by Age-Adjusted Rate per 100,000 Population, grouped by race\n",
"death_by_race = data_md.groupby(['Race', 'Cause of Death']).mean().sort_values('Age-Adjusted Rate per 100,000 Population', ascending = False) \\\n",
" .sortlevel(sort_remaining = False)\n",
"death_by_race"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#here we can see the top-5 diseases by race\n",
"for name in [u'American Indian/Alaska Native', u'Asian/Pacific Islander', u'Black', u'Total', u'White']:\n",
" print death_by_race.loc[[name]]['Age-Adjusted Rate per 100,000 Population'].nlargest(5)\n",
"#this confirms that we do not have cause of death for American Indian/Alaska Native segmented by specific disease, confirming our above suspicion\n",
"\n",
"#cardiovascular diseases and malignant neoplasms are primary killers across racial classification\n",
"\n",
"#influenza and pneumonia is uniquely in the top 5 of Asian/Pacific Islander. Diabetes Mellitus unique in the top-5 of Black"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_cdc\n",
"#I didn't have a chance to work on the cdc data. Looking at top killers by age group would be the first thing I'd look at"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pylab\n",
"data_cdc.hist()\n",
"pylab.show()\n",
"#data consists of causes of death and age"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_titanic"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_titanic.count()\n",
"#we have no missing values for survived"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#For this problem we have significant domain knowledge. Based off culture, we might think that age, sex, and class of the passengers, Pclass\n",
"# would be strong predictors of survival. Let's start by looking at the priors\n",
"survived = data_titanic['Survived'].sum()\n",
"total_passengers = len(data_titanic['Survived'])\n",
"print \"Number of survivors\", survived\n",
"print \"Number of passengers\", total_passengers\n",
"print \"Percent survived\", survived / total_passengers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Now, let's group by the above features and check rates of survival\n",
"data_titanic.groupby(['Sex'])['Survived'].mean()\n",
"#gender is a major predictor of survival"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#when we additionally include the class of the passenger we get further\n",
"data_titanic.groupby(['Sex', 'Pclass'])['Survived'].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#from the above, we could predict on each passenger using the above odds based off their sex and Pclass data. But, we can do better with\n",
" an actual machine learning approach"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Now, let us try a machine learning model. We could try a multiclass logistic regression\n",
"target = data_titanic['Survived']\n",
"target\n",
"data_titanic.drop('Survived', axis = 1, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#some of our features are categorical.\n",
"data_titanic.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#let's label encode these categorical features\n",
"non_numeric = list()\n",
"dtypes = data_titanic.dtypes\n",
"for index, elem in enumerate(dtypes):\n",
" if elem == 'object':\n",
" non_numeric.append(dtypes.index[index])\n",
"non_numeric"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"le = preprocessing.LabelEncoder()\n",
"for elem in non_numeric:\n",
" data_titanic[elem] = le.fit_transform(data_titanic[elem]) # generates a FutureWarning"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#using cross validation, let's check our model. Here, we fill missing values with the median for that variable\n",
"model = linear_model.LogisticRegression()\n",
"scores = cross_validation.cross_val_score(model, data_titanic.fillna(data_titanic.median()), target, cv = 5, scoring = 'accuracy')\n",
"print \"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#the above model generated 79.35% accurate predictions. Let's try a random forest. Here, we fill missing values with -999, common\n",
"# for decision tree based models\n",
"model = ensemble.RandomForestClassifier(n_estimators = 100)\n",
"scores = cross_validation.cross_val_score(model, data_titanic.fillna(-999), target, cv = 5, scoring = 'accuracy')\n",
"print \"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#the random forest did a little better than logistic regression. Let's try one more"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = ensemble.ExtraTreesClassifier(n_estimators = 100)\n",
"scores = cross_validation.cross_val_score(model, data_titanic.fillna(-999), target, cv = 5, scoring = 'accuracy')\n",
"print \"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#a little worse than a random forest, for this dataset"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment