Last active
October 28, 2016 06:56
-
-
Save koljamaier/b2301cdcf436f27527676df5c6c64c6a to your computer and use it in GitHub Desktop.
Naive Bayes in the Kaggle Titanic Competition
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import division | |
import numpy as np | |
from matplotlib import pyplot as plt | |
from scipy.stats import norm | |
import pandas as pd | |
import csv as csv | |
import seaborn as sns | |
from numpy import matrix, mat | |
import re | |
# Overview figure with three panels; the panels are filled in further below.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))

# Load the training data.
df = pd.read_csv('train.csv', header=0)

# Fill in the missing Embarked values with "S".
df["Embarked"] = df["Embarked"].fillna("S")

# Integer encodings of the categorical columns used for grouping / indexing.
df["NumEmbarked"] = df["Embarked"].map({"S": 0, "C": 1, "Q": 2}).astype(int)
df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Flag rows whose Age is missing.  Test for NaN directly: the former
# fillna(1) == 1 trick also flagged passengers whose real age is 1.
df["AgeNaN"] = df["Age"].isnull()
# Age lookup tables, one 3x3 array per gender (0 = female, 1 = male), indexed
# as [Pclass - 1, NumEmbarked].  Despite the historical name, each cell holds
# the MEAN known age of the matching training passengers.
median_agedds = [np.zeros((3, 3)), np.zeros((3, 3))]
# Fallback for cells without a single known age, so no NaN is imputed.
overall_mean_age = df["Age"].mean()
for i in range(2):
    for j in range(3):
        for k in range(3):
            known_ages = df[
                (df["Gender"] == i)
                & (df["Pclass"] == j + 1)
                & (df["NumEmbarked"] == k)
            ]["Age"].dropna()
            median_agedds[i][j, k] = (
                known_ages.mean() if len(known_ages) else overall_mean_age
            )

# AgeFill starts as a copy of Age; only missing entries are replaced below.
df["AgeFill"] = df["Age"]
age_missing = df["Age"].isnull()
for i in range(2):
    for j in range(3):
        for k in range(3):
            df.loc[
                age_missing
                & (df["Gender"] == i)
                & (df["Pclass"] == j + 1)
                & (df["NumEmbarked"] == k),
                "AgeFill",
            ] = median_agedds[i][j, k]

# Assume that passengers with Parch==2 are most likely children (lower age).
# Restricted to missing ages so that known ages are never overwritten.
df.loc[age_missing & (df["Parch"] == 2), "AgeFill"] = (
    df[df["Parch"] == 2]["Age"].dropna().mean()
)

# Assume that passengers with Parch==1 & SibSp==1 are most likely married
# parents (higher age).  Again applied to missing ages only.
df.loc[age_missing & (df["Parch"] == 1) & (df["SibSp"] == 1), "AgeFill"] = (
    df[(df["Parch"] == 1) & (df["SibSp"] == 1)]["Age"].dropna().mean()
)
# Strip everything up to ", " and everything from the first "." onwards,
# which leaves only the title part of the passenger name.
df["Title"] = [
    re.sub('(.*, )|(\\..*.)', "", full_name) for full_name in df["Name"]
]

# Fold spelling variants into their common form.
title_aliases = {"Mme": "Mrs", "Ms": "Miss", "Mlle": "Miss"}
df["Title"] = df["Title"].map(lambda title: title_aliases.get(title, title))

# Collapse all infrequent titles into a single bucket.
rare_title = ['Dona', 'Lady', 'the Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer']
df["Title"] = df["Title"].where(~df["Title"].isin(rare_title), "Rare Title")

# Plot the title counts of the survivors while the titles are still strings.
fig, axel = plt.subplots(1, 1, figsize=(15, 5))
axel.set_title("Appearance of titles of survivors")
survivor_rows = df[df["Survived"] == 1]
sns.countplot(x="Title", data=survivor_rows)

# Integer codes for the titles; they are used as array indices later on.
title_codes = {"Mrs": 0, "Miss": 1, "Mr": 2, "Master":3, "Rare Title":4}
df["Title"] = df["Title"].map(title_codes).astype(int)
# The commented-out block below can be used to investigate the age of
# passengers with Parch==2 (in most cases that means it's a child, because
# both mother and father are on board).
#bins = [0, 25, 100]
#group_names = ['Young', "Old"]
#df["AgeCat"] = pd.cut(df.Age.dropna(), bins, labels=group_names)
#possibleChildrens = df[(df["Parch"]==2)][["Age","SibSp", "AgeCat"]]
#sns.countplot(x="SibSp", hue="AgeCat", data=bsps, ax=axis3)

# Age distribution of the female passengers who embarked in "S".
from_port_s = df["Embarked"] == "S"
is_female = df["Gender"] == 0
embSfemale = df[from_port_s & is_female]
fig1, axis1 = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))
sns.countplot(x="Age", data=embSfemale, ax=axis1)
def _log_likelihoods(frame, column, levels, survived):
    """Return log P(column == level | Survived == survived) for each level.

    Parameters:
        frame - DataFrame holding the "Survived" column and `column`
        column - name of the integer-coded categorical column
        levels - iterable of the category codes, in output order
        survived - class label to condition on (1 or 0)

    Returns:
        Float array with one log relative frequency per entry of `levels`.
        A level that never occurs in the class yields -inf (no smoothing).
    """
    in_class = frame.loc[frame["Survived"] == survived, column]
    counts = np.array([(in_class == level).sum() for level in levels], dtype=float)
    return np.log(counts / counts.sum())


# Prior P(Survived = 1), estimated from the training data.  Divide by the
# actual number of rows instead of the hard-coded training-set size 891.
p1 = (df["Survived"] == 1).sum() / len(df)

# Log-likelihoods of the categorical variable Pclass (classes 1..3 are stored
# at indices 0..2).
# P("Pclass" | Survived = 1)
p1Pclass = _log_likelihoods(df, "Pclass", range(1, 4), 1)
# P("Pclass" | Survived = 0)
p0Pclass = _log_likelihoods(df, "Pclass", range(1, 4), 0)

# Log-likelihoods of the integer-coded Title (codes 0..4).
# P("Title" | Survived = 1)
p1Title = _log_likelihoods(df, "Title", range(5), 1)
# P("Title" | Survived = 0)
p0Title = _log_likelihoods(df, "Title", range(5), 0)
# Gaussian models of age, fitted per outcome on the imputed AgeFill column.
survived_rows = df[df["Survived"] == 1]
died_rows = df[df["Survived"] == 0]

# P("Age" | Survived = 1)
p1Age = norm(loc=survived_rows["AgeFill"].mean(), scale=survived_rows["AgeFill"].std())
# P("Age" | Survived = 0)
p0Age = norm(loc=died_rows["AgeFill"].mean(), scale=died_rows["AgeFill"].std())
# Same as p1Age, but fitted on the original (non-imputed) Age column.
pAge1 = norm(loc=survived_rows["Age"].mean(), scale=survived_rows["Age"].std())

# Draw the three densities between their 1% and 99% quantiles.
ax1.set_title("Chance to survive conditioned on Age")
for age_model in (p1Age, p0Age, pAge1):
    x1 = np.linspace(age_model.ppf(0.01), age_model.ppf(0.99), 100)
    ax1.plot(x1, age_model.pdf(x1))

# Histograms of the age column before and after imputation, side by side.
fig1, (axis1, axis2) = plt.subplots(1, 2, figsize=(15, 5))
for panel, age_column, caption in (
    (axis1, "Age", "Original Age"),
    (axis2, "AgeFill", "Filled in Age"),
):
    panel.set_title(caption)
    df[age_column].hist(bins=70, ax=panel)
# Gaussian models of the paid fare, fitted separately per outcome.
fare_of_survivors = df.loc[df["Survived"] == 1, "Fare"]
fare_of_victims = df.loc[df["Survived"] == 0, "Fare"]

# P("Fare" | Survived = 1)
p1Fare = norm(loc=fare_of_survivors.mean(), scale=fare_of_survivors.std())
# P("Fare" | Survived = 0)
p0Fare = norm(loc=fare_of_victims.mean(), scale=fare_of_victims.std())

# Plot both densities between their 1% and 99% quantiles; the density for
# Survived = 0 is drawn in red.
fig2, ax1 = plt.subplots(1, 1, figsize=(15, 5))
ax1.set_title("Chance to survive conditioned on paid Fare")
for fare_model, line_options in ((p1Fare, {}), (p0Fare, {"color": "r"})):
    x1 = np.linspace(fare_model.ppf(0.01), fare_model.ppf(0.99), 100)
    ax1.plot(x1, fare_model.pdf(x1), **line_options)
# Log-likelihoods of Gender (index 0 = female, 1 = male) per outcome.

# P("Gender" | Survived = 1)
gender_of_survivors = df.loc[df["Survived"] == 1, "Gender"]
totalSurvived = gender_of_survivors.count()
p1Gender = np.log(
    np.array([(gender_of_survivors == code).sum() / totalSurvived for code in (0, 1)])
)

# P("Gender" | Survived = 0)
gender_of_victims = df.loc[df["Survived"] == 0, "Gender"]
totalDead = gender_of_victims.count()
p0Gender = np.log(
    np.array([(gender_of_victims == code).sum() / totalDead for code in (0, 1)])
)
sns.set_style("whitegrid")

# Number of survivors per gender code (0 = female, 1 = male).
sns.countplot(x="Gender", data=df[df["Survived"] == 1], ax=ax2)

# Age counts of the female survivors.  Each comparison needs its own
# parentheses: `&` binds tighter than `==`, so the unparenthesised form
# evaluated ((Survived == 1) & Gender) == 0 and selected the wrong rows.
ax3.set_title("Age of female survivors")
female_survivors = df[(df["Survived"] == 1) & (df["Gender"] == 0)]
sns.countplot(x="Age", data=female_survivors, ax=ax3)
# Classify Test Data
def classify(vec2Classify):
    """Classify a new passenger with the fitted naive Bayes model.

    Parameters:
        vec2Classify - [Pclass, Age, Gender, Fare, Title], where Pclass is
            1..3, Gender is 0 (female) or 1 (male) and Title is the integer
            title code 0..4.  The categorical entries may be given as floats
            (as they are in rows taken from DataFrame.values).

    Returns:
        1 if the log-posterior for "survived" is strictly larger than the one
        for "died", otherwise 0.
    """
    # The categorical features are array indices; cast them to int because
    # NumPy rejects float indices.
    pclass_idx = int(vec2Classify[0]) - 1
    gender_idx = int(vec2Classify[2])
    title_idx = int(vec2Classify[4])
    age = vec2Classify[1]
    fare = vec2Classify[3]

    # Per-feature log-likelihoods under each class.
    p1Vec = np.array([
        p1Pclass[pclass_idx],
        p1Age.logpdf(age),
        p1Gender[gender_idx],
        p1Fare.logpdf(fare),
        p1Title[title_idx],
    ])
    p0Vec = np.array([
        p0Pclass[pclass_idx],
        p0Age.logpdf(age),
        p0Gender[gender_idx],
        p0Fare.logpdf(fare),
        p0Title[title_idx],
    ])

    # Summing logs equals multiplying the feature likelihoods; add the log
    # prior of the respective class.
    p1c = p1Vec.sum() + np.log(p1)
    p0c = p0Vec.sum() + np.log(1.0 - p1)
    return 1 if p1c > p0c else 0
# Load the test data and apply the same preprocessing as for the training set.
df_test = pd.read_csv('test.csv', header=0)

# Fill in the missing Embarked values with "S".
df_test["Embarked"] = df_test["Embarked"].fillna("S")
# Fill missing fares with the median test fare.  Assign the result back:
# fillna(inplace=True) on a column selection is chained assignment and is not
# guaranteed to modify df_test.
df_test["Fare"] = df_test["Fare"].fillna(df_test["Fare"].median())

# Fill missing "Age" values with estimated values.
df_test['AgeFill'] = df_test['Age']
df_test["NumEmbarked"] = df_test["Embarked"].map({"S": 0, "C": 1, "Q": 2}).astype(int)
df_test['Gender'] = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)

test_age_missing = df_test["Age"].isnull()
for i in range(2):
    for j in range(3):
        for k in range(3):
            # Missing ages take the value stored for the matching
            # (gender, class, port) cell of the training-set table.
            df_test.loc[
                test_age_missing
                & (df_test["Gender"] == i)
                & (df_test["Pclass"] == j + 1)
                & (df_test["NumEmbarked"] == k),
                'AgeFill',
            ] = median_agedds[i][j, k]

# Family-based corrections, using mean ages from the training data.  They are
# applied to missing ages only, so known ages are never overwritten.
df_test.loc[test_age_missing & (df_test["Parch"] == 2), "AgeFill"] = (
    df[df["Parch"] == 2]["Age"].dropna().mean()
)
df_test.loc[
    test_age_missing & (df_test["Parch"] == 1) & (df_test["SibSp"] == 1),
    "AgeFill",
] = df[(df["Parch"] == 1) & (df["SibSp"] == 1)]["Age"].dropna().mean()

# Extract the title from the name, merge spelling variants, bucket the rare
# titles and encode the result as integers.
df_test["Title"] = df_test["Name"].map(lambda name: re.sub('(.*, )|(\\..*.)', "", name))
df_test.loc[df_test["Title"] == "Mme", "Title"] = "Mrs"
df_test.loc[df_test["Title"].isin(["Ms", "Mlle"]), "Title"] = "Miss"
rare_title = ['Dona', 'Lady', 'the Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer']
df_test.loc[df_test["Title"].isin(rare_title), "Title"] = "Rare Title"
df_test["Title"] = df_test["Title"].map({"Mrs": 0, "Miss": 1, "Mr": 2, "Master": 3, "Rare Title": 4}).astype(int)
# Keep only the passenger id and the model inputs.  Take a copy so that adding
# the "Survived" column below does not write into a slice of the full frame.
df_test = df_test[["PassengerId", "Pclass", "AgeFill", "Gender", "Fare", "Title"]].copy()

# Classify every passenger.  Iterating column-wise typed tuples (instead of
# the float-valued df_test.values) keeps Pclass / Gender / Title as integers,
# which classify() uses as array indices.
feature_columns = ["Pclass", "AgeFill", "Gender", "Fare", "Title"]
df_test["Survived"] = [
    classify(list(features))
    for features in df_test[feature_columns].itertuples(index=False, name=None)
]

# Write the submission file with the header "PassengerId,Survived".  to_csv
# replaces the manual csv.writer loop, whose "wb" file mode fails on Python 3
# and whose DataFrame.set_value calls no longer exist in pandas >= 1.0; it
# also removes the hard-coded PassengerId offset of 892.
df_test[["PassengerId", "Survived"]].to_csv("naivebayesmodel.csv", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment