Skip to content

Instantly share code, notes, and snippets.

@VeylanSolmira
Created May 22, 2016 11:04
Show Gist options
  • Save VeylanSolmira/48e67648cfbf73fd399c5aefe75fd039 to your computer and use it in GitHub Desktop.
Save VeylanSolmira/48e67648cfbf73fd399c5aefe75fd039 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas\n",
"import csv\n",
"import numpy\n",
"from __future__ import division\n",
"from sklearn import linear_model, cross_validation, preprocessing, ensemble"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Here I load each file separately. This ensures we know precisely what data we are working with. Alternatively, especially if we had\n",
"#many files to read from, we might read all csv files in a directory. Here, I loaded locally. Could load directly web\n",
"data_md = pandas.read_csv(\"Desktop/Population_Health_Measures__Age-Adjusted_Mortality_Rates.csv\")\n",
"data_cdc = pandas.read_csv(\"Desktop/mortality_underlying_and_multiple_2057_35606388919559.csv\", skiprows = 6, encoding = \"ISO-8859-1\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#delete row with only cause value\n",
"data_cdc.drop(data_cdc.index[0], inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#exploratory data analysis\n",
"data_md"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"print \"Unique Deaths:\", len(numpy.unique(data_md['Cause of Death'].values)) #54\n",
"print \"Unique Races:\", len(numpy.unique(data_md['Race'].values))\n",
"print \"Unique Genders:\", len(numpy.unique(data_md['Gender'].values))\n",
"print numpy.unique(data_md['Cause of Death'].values)\n",
"print numpy.unique(data_md['Race'].values)\n",
"print numpy.unique(data_md['Race'].values)\n",
"print numpy.unique(data_md['3-Year Period'].values) #['2006-2008' '2007-2009' '2008-2010' '2009-2011' '2010-2012']\n",
"#data consists:\n",
" #of causes of death = 54 unique values\n",
" #race = 5 unique values; 'American Indian/Alaska Native', 'Asian/Pacific Islander', 'Black', 'Total', 'White'\n",
" #gender"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Some simple aggregation by race\n",
"data_md.groupby('Race').mean()\n",
"#These results indicate we do not have much information on American Indian/Alaska Native in this dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#what are the most prevalent causes of death\n",
"data_md.groupby('Cause of Death').mean().sort_values('Age-Adjusted Rate per 100,000 Population', ascending = False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#here we have cause of death, sorted by Age-Adjusted Rate per 100,000 Population, grouped by race\n",
"death_by_race = data_md.groupby(['Race', 'Cause of Death']).mean().sort_values('Age-Adjusted Rate per 100,000 Population', ascending = False) \\\n",
" .sortlevel(sort_remaining = False)\n",
"death_by_race"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#here we can see the top-5 diseases by race\n",
"for name in [u'American Indian/Alaska Native', u'Asian/Pacific Islander', u'Black', u'Total', u'White']:\n",
" print death_by_race.loc[[name]]['Age-Adjusted Rate per 100,000 Population'].nlargest(5)\n",
"#this confirms that we do not have cause of death for American Indian/Alaska Native segmented by specific disease, confirming our above suspicion\n",
"\n",
"#cardiovascular diseases and malignant neoplasms are primary killers across racial classification\n",
"\n",
"#influenza and pneumonia is uniquely in the top 5 of Asian/Pacific Islander. Diabetes Mellitus unique in the top-5 of Black"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_cdc\n",
"#I didn't have a chance to work on the cdc data. Looking at top killers by age group would be the first thing I'd look at"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pylab\n",
"data_cdc.hist()\n",
"pylab.show()\n",
"#data consists of causes of death and age"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_titanic"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"data_titanic.count()\n",
"#we have no missing values for survived"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#For this problem we have significant domain knowledge. Based off culture, we might think that age, sex, and class of the passengers, Pclass\n",
"# would be strong predictors of survival. Let's start by looking at the priors\n",
"survived = data_titanic['Survived'].sum()\n",
"total_passengers = len(data_titanic['Survived'])\n",
"print \"Number of survivors\", survived\n",
"print \"Number of passengers\", total_passengers\n",
"print \"Percent survived\", survived / total_passengers"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Now, let's group by the above features and check rates of survival\n",
"data_titanic.groupby(['Sex'])['Survived'].mean()\n",
"#gender is a major predictor of survival"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#when we additionally include the class of the passenger we get further\n",
"data_titanic.groupby(['Sex', 'Pclass'])['Survived'].mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#from the above, we could predict on each passenger using the above odds based off their sex and Pclass data. But, we can do better with\n",
" an actual machine learning approach"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#Now, let us try a machine learning model. We could try a multiclass logistic regression\n",
"target = data_titanic['Survived']\n",
"target\n",
"data_titanic.drop('Survived', axis = 1, inplace = True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#some of our features are categorical.\n",
"data_titanic.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#let's label encode these categorical features\n",
"non_numeric = list()\n",
"dtypes = data_titanic.dtypes\n",
"for index, elem in enumerate(dtypes):\n",
" if elem == 'object':\n",
" non_numeric.append(dtypes.index[index])\n",
"non_numeric"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"le = preprocessing.LabelEncoder()\n",
"for elem in non_numeric:\n",
" data_titanic[elem] = le.fit_transform(data_titanic[elem]) # generates a FutureWarning"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#using cross validation, let's check our model. Here, we fill missing values with the median for that variable\n",
"model = linear_model.LogisticRegression()\n",
"scores = cross_validation.cross_val_score(model, data_titanic.fillna(data_titanic.median()), target, cv = 5, scoring = 'accuracy')\n",
"print \"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#the above model generated 79.35% accurate predictions. Let's try a random forest. Here, we fill missing values with -999, common\n",
"# for decision tree based models\n",
"model = ensemble.RandomForestClassifier(n_estimators = 100)\n",
"scores = cross_validation.cross_val_score(model, data_titanic.fillna(-999), target, cv = 5, scoring = 'accuracy')\n",
"print \"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#the random forest did a little better than logistic regression. Let's try one more"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model = ensemble.ExtraTreesClassifier(n_estimators = 100)\n",
"scores = cross_validation.cross_val_score(model, data_titanic.fillna(-999), target, cv = 5, scoring = 'accuracy')\n",
"print \"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#a little worse than a random forest, for this dataset"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment