Last active
August 29, 2015 14:05
-
-
Save roycoding/95445c1312e5fbaff8d0 to your computer and use it in GitHub Desktop.
Kaggle - Titanic: match the Gender, Class, Fare benchmark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Python code for the Kaggle Titanic competition
# https://www.kaggle.com/c/titanic-gettingStarted
# This code implements the gender, class, fare benchmark.
# This is part of the "Match 5 Kaggle Benchmarks in 5 Days" challenge.
# https://www.kaggle.com/forums/t/9993/match-5-kaggle-benchmarks-in-5-days
import pandas as pd | |
import numpy as np | |
# Read the training and test sets from the working directory
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))
# Keep only the columns used by the gender, class, fare model
gcftrain = train[
    [u'PassengerId', u'Survived', u'Pclass', u'Sex', u'Fare']
]
gcftest = test[
    [u'PassengerId', u'Pclass', u'Sex', u'Fare']
]
# The survival table is built from three binned features:
#   gender: 2 bins (M, F)
#   class:  3 bins (1, 2, 3), check via train.Pclass.unique()
#   fare:   4 bins [0-10], (10-20], (20-30], (30-inf),
#           check via train.Fare.describe()
# Number of survivors per (gender, class, fare) bin
survival_table = np.zeros((2, 3, 4))
# Number of passengers per bin, same shape as the survivor counts
passenger_table = np.zeros_like(survival_table)
# Sort fares into bins
def farebin(fare, edges=(10.0, 20.0, 30.0)):
    """Return the index of the fare bin that ``fare`` falls into.

    Bins are right-closed. With the default edges they are
    [0-10], (10-20], (20-30] and (30-inf), numbered 0 to 3.

    Args:
        fare: Ticket fare as a number.
        edges: Ascending upper bounds of every bin except the last,
            open-ended one. The default reproduces the four bins above.

    Returns:
        The index of the first edge that ``fare`` does not exceed, or
        ``len(edges)`` when it exceeds all of them. A NaN fare compares
        false against every edge and therefore lands in the last bin.
    """
    for index, upper in enumerate(edges):
        if fare <= upper:
            return index
    return len(edges)
# Add the fare-bin index and binarize gender (female = 0, male = 1).
# DataFrame.assign returns a new frame instead of writing into the column
# selections taken from train/test, which is what pandas complained about
# (SettingWithCopyWarning). 'Sex' keeps its column position and 'farebin'
# is appended as the last column.
gcftrain = gcftrain.assign(
    farebin=gcftrain.Fare.apply(farebin),
    Sex=gcftrain.Sex.apply(lambda x: 0 if x == 'female' else 1),
)
gcftest = gcftest.assign(
    farebin=gcftest.Fare.apply(farebin),
    Sex=gcftest.Sex.apply(lambda x: 0 if x == 'female' else 1),
)
# Populate survival and passenger tables.
# Every passenger is counted in the cell [gender][class - 1][fare bin].
# The index columns are cast to int explicitly: NumPy rejects float
# indices, and rows taken from gcftrain.values are all-float because of the
# Fare column.
bin_index = (
    gcftrain['Sex'].astype(int).values,
    gcftrain['Pclass'].astype(int).values - 1,
    gcftrain['farebin'].astype(int).values,
)
# np.add.at accumulates correctly when the same cell occurs many times.
np.add.at(survival_table, bin_index, gcftrain['Survived'].values)
np.add.at(passenger_table, bin_index, 1.0)
# Calculate survival rates. Cells without any passengers keep a rate of
# zero instead of evaluating 0/0 (NaN plus a RuntimeWarning).
rate_table = np.divide(
    survival_table,
    passenger_table,
    out=np.zeros_like(survival_table),
    where=passenger_table > 0,
)
# Predict survival of the test set: a passenger is predicted to survive
# when the survival rate of their [gender][class - 1][fare bin] cell is at
# least 50%.
# Write predictions to CSV
with open('gcf.csv', 'w') as f:
    f.write('PassengerId,Survived\n')
    rows = zip(
        gcftest['PassengerId'],
        gcftest['Sex'],
        gcftest['Pclass'],
        gcftest['farebin'],
    )
    for passenger_id, sex, pclass, fare_bin in rows:
        # Explicit int casts: NumPy rejects float indices.
        rate = rate_table[int(sex), int(pclass) - 1, int(fare_bin)]
        s = 1 if rate >= 0.5 else 0
        f.write('%i,%i\n' % (passenger_id, s))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment