Last active
March 11, 2017 11:42
-
-
Save EikeDehling/90404aed7de3746162595be161109ef3 to your computer and use it in GitHub Desktop.
Some experiments for the Kaggle Titanic survival machine-learning competition (https://www.kaggle.com/c/titanic)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas | |
from sklearn import linear_model, svm, tree, naive_bayes | |
from sklearn.model_selection import cross_val_score | |
import numpy as np | |
# Load the training set; 'train.csv' is resolved relative to the current
# working directory.
data = pandas.read_csv('train.csv')
def preprocess(data):
    """Return a cleaned, one-hot encoded copy of a passenger table.

    The input frame is not modified; all work happens on a copy.

    Steps:
      * missing ``Fare`` and ``Age`` values are replaced by the mean of that
        column in the frame being processed;
      * a title is parsed from ``Name`` (the text after the first comma, up
        to the next period) and folded into one of six groups via the
        ``titles`` mapping; titles absent from the mapping become missing;
      * ``Sex``, ``Title`` and ``Pclass`` are one-hot encoded.

    Args:
        data: pandas.DataFrame containing at least the columns ``Name``,
            ``Sex``, ``Pclass``, ``Age`` and ``Fare``.

    Returns:
        A new DataFrame in which ``Sex``, ``Title`` and ``Pclass`` are
        replaced by indicator columns. A ``Title_<group>`` column exists for
        every title group, even when no row belongs to it.

    Raises:
        IndexError: if a name contains no comma.
    """
    # Work on a copy so the caller's frame keeps its original values/columns.
    data = data.copy()

    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
    data['Age'] = data['Age'].fillna(data['Age'].mean())

    titles = {
        "Capt": "Officer",
        "Col": "Officer",
        "Major": "Officer",
        "Jonkheer": "Royalty",
        "Don": "Royalty",
        "Sir": "Royalty",
        "Dr": "Officer",
        "Rev": "Officer",
        "the Countess": "Royalty",
        "Dona": "Royalty",
        "Mme": "Mrs",
        "Mlle": "Miss",
        "Ms": "Mrs",
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Lady": "Royalty",
    }
    # Fix the category list up front: get_dummies emits one column per
    # category, so every Title_<group> column exists even when a group is
    # absent from this particular frame. Sorted order matches the column
    # order that inferring the categories from the data would produce.
    title_groups = sorted(set(titles.values()))

    raw_titles = data['Name'].map(
        lambda name: name.split(',')[1].split('.')[0].strip())
    data['Title'] = pandas.Categorical(raw_titles.map(titles),
                                       categories=title_groups)

    # Do the one-hot encoding
    return pandas.get_dummies(data, columns=['Sex', 'Title', 'Pclass'])
# Replace the raw training frame with its cleaned, one-hot encoded form.
data = preprocess(data)

# Disabled exploratory summaries. They sit inside a bare string literal so
# that they are not executed.
"""
print(data['Sex_male'].astype('category').describe())
print()
print(data['Sex_female'].astype('category').describe())
print()
print(data['Survived'].describe())
print()
print(data['Age'].describe())
print()
"""
def getX(data):
    """Return the model's feature matrix as a 2-D float array.

    Args:
        data: pandas.DataFrame that contains every feature column listed
            below (the one-hot ``Pclass_*``, ``Sex_*`` and ``Title_*``
            indicators plus ``Age``, ``Parch``, ``SibSp`` and ``Fare``).
            Any other columns are ignored.

    Returns:
        numpy.ndarray of shape ``(len(data), 15)`` with float dtype; columns
        follow the order of ``feature_columns``.

    Raises:
        KeyError: if any feature column is missing from ``data``.
    """
    feature_columns = [
        'Pclass_1', 'Pclass_2', 'Pclass_3',
        'Sex_male', 'Sex_female',
        'Age', 'Parch', 'SibSp', 'Fare',
        'Title_Officer', 'Title_Royalty', 'Title_Master',
        'Title_Mr', 'Title_Mrs', 'Title_Miss',
    ]
    # DataFrame.as_matrix() was removed in pandas 1.0; selecting the columns
    # and taking .values works on old and new releases alike. The float cast
    # keeps the result numeric when the indicator columns are bool rather
    # than integer.
    return data[feature_columns].astype(float).values
# Candidate models, each paired with the label printed next to its score.
classifiers = [
    ('Logistic regression', linear_model.LogisticRegression()),
    ('SVM classifier', svm.SVC(kernel='linear')),
    ('Decision tree classifier', tree.DecisionTreeClassifier()),
    ('Naive Bayes classifier', naive_bayes.GaussianNB()),
]

# The feature matrix and target are the same for every candidate, so build
# them once instead of on each loop iteration.
features = getX(data)
target = data['Survived']

# Report the mean 5-fold cross-validated accuracy of each candidate.
for name, candidate in classifiers:
    scores = cross_val_score(candidate, features, target, cv=5, scoring='accuracy')
    print(name, np.mean(scores))

# Disabled submission step: fit on the training data, predict on test.csv and
# write submission.csv. It sits inside a bare string literal so that it is
# not executed.
"""
model = linear_model.LogisticRegression()
model.fit(getX(data), data['Survived'])
data = pandas.read_csv('test.csv')
data = preprocess(data)
data['Survived'] = model.predict(getX(data))
data.to_csv('submission.csv', index=False, columns=['PassengerId', 'Survived'])
"""
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment