Skip to content

Instantly share code, notes, and snippets.

@roycoding
Last active August 29, 2015 14:05
Show Gist options
  • Save roycoding/95445c1312e5fbaff8d0 to your computer and use it in GitHub Desktop.
Save roycoding/95445c1312e5fbaff8d0 to your computer and use it in GitHub Desktop.
Kaggle - Titanic: match the Gender, Class, Fare benchmark
# Python code for the Kaggle Titanic competition
# https://www.kaggle.com/c/titanic-gettingStarted
# This code implements the gender, class, fare benchmark.
# This is part of the Match 5 Kaggle Benchmarks in 5 Days challenge.
# https://www.kaggle.com/forums/t/9993/match-5-kaggle-benchmarks-in-5-days
import pandas as pd
import numpy as np
# Training and test data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Data for gender, class, fare model.
# Explicit copies are taken because columns are added to / overwritten on
# these frames further down; doing that on a plain column selection of
# `train` / `test` makes pandas emit SettingWithCopyWarning.
gcftrain = train[[u'PassengerId', u'Survived', u'Pclass', u'Sex', u'Fare']].copy()
gcftest = test[[u'PassengerId', u'Pclass', u'Sex', u'Fare']].copy()
# Gender, class, and fare will be binned to build survival table.
# 2 gender bins (M,F)
# 3 class bins (1,2,3), check via train.Pclass.unique()
# 4 fare bins [0-10],(10-20],(20-30],(30-inf), check via train.Fare.describe()
# Two independent (gender, class, fare bin) accumulators, both starting at
# zero: the first counts survivors, the second counts all passengers.
survival_table, passenger_table = (np.zeros((2, 3, 4)) for _ in range(2))
# Sort fares into bins
def farebin(fare, edges=(10.0, 20.0, 30.0)):
    """Return the index of the fare bin that *fare* falls into.

    With the default edges the bins are (-inf, 10], (10, 20], (20, 30]
    and (30, inf), numbered 0 to 3.

    Args:
        fare: The fare as a number.
        edges: Ascending, inclusive upper bounds of every bin except the
            last, open-ended one.

    Returns:
        An int in ``range(len(edges) + 1)``. A NaN fare compares false
        against every edge and therefore lands in the last bin.
    """
    for index, upper in enumerate(edges):
        if fare <= upper:
            return index
    return len(edges)
# Add the fare-bin column and binarize gender (female = 0, anything else = 1)
# on both the training and the test frame.
# pandas may emit SettingWithCopyWarning here when the frames are slices of
# other frames; the assignments still take effect on these frames.
for frame in (gcftrain, gcftest):
    frame['farebin'] = frame['Fare'].apply(farebin)
    frame['Sex'] = frame['Sex'].apply(lambda label: 0 if label == 'female' else 1)
# Populate survival and passenger tables.
# Columns are looked up by name and cast to int: iterating over
# `gcftrain.values` yields float rows (the float Fare column upcasts the
# whole array), and NumPy raises IndexError on float indices.
sex_idx = gcftrain['Sex'].values.astype(int)
class_idx = gcftrain['Pclass'].values.astype(int) - 1  # classes 1..3 -> 0..2
fare_idx = gcftrain['farebin'].values.astype(int)
cell = (sex_idx, class_idx, fare_idx)
# np.add.at accumulates every occurrence of a repeated index, unlike
# `table[cell] += ...`, which would count each distinct cell only once.
np.add.at(survival_table, cell, gcftrain['Survived'].values.astype(float))
np.add.at(passenger_table, cell, 1.0)
# Calculate survival rates and change NaN's to zero. Bins without any
# passengers give 0/0; the resulting warning is expected and silenced.
with np.errstate(divide='ignore', invalid='ignore'):
    rate_table = np.nan_to_num(survival_table / passenger_table)
# Predict survival of the test set: a passenger is predicted to survive
# when the survival rate of their (gender, class, fare bin) cell is >= 50%.
# Write predictions to CSV
with open('gcf.csv', 'w') as f:
    f.write('PassengerId,Survived\n')
    # Columns are read by name and cast to int before indexing:
    # `gcftest.values` would yield float rows, and NumPy raises IndexError
    # on float indices.
    test_rows = zip(gcftest['PassengerId'], gcftest['Sex'],
                    gcftest['Pclass'], gcftest['farebin'])
    for passenger_id, sex, pclass, fare_bin in test_rows:
        rate = rate_table[int(sex), int(pclass) - 1, int(fare_bin)]
        s = 1 if rate >= 0.5 else 0
        f.write('%i,%i\n' % (passenger_id, s))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment