Some machine learning algorithms for the Titanic dataset: SVM, Random Forest, XGBoost. Kaggle competition: https://www.kaggle.com/c/titanic
# Be sure you have the following libraries installed (if not, google how to install them):
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed; model_selection is its replacement
# Load the data
# dataset available at Kaggle: https://www.kaggle.com/c/titanic
train_df = pd.read_csv('~/data/kaggle/titanic/train.csv', header=0)
test_df = pd.read_csv('~/data/kaggle/titanic/test.csv', header=0)
# We'll impute missing values using the median for numeric columns and the most
# common value for string columns.
# This is based on some nice code by 'sveitser' at http://stackoverflow.com/a/25562948
from sklearn.base import TransformerMixin

class DataFrameImputer(TransformerMixin):
    def fit(self, X, y=None):
        self.fill = pd.Series([X[c].value_counts().index[0]
                               if X[c].dtype == np.dtype('O') else X[c].median()
                               for c in X],
                              index=X.columns)
        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)
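# Quick sanity check of the imputer on a hypothetical toy frame (not part of the
# Titanic pipeline; uncomment to try): numeric NaNs get the column median,
# object NaNs get the most frequent value.
# _demo = pd.DataFrame({'num': [1.0, np.nan, 3.0], 'cat': ['a', None, 'a']})
# print(DataFrameImputer().fit_transform(_demo))
# # num -> [1.0, 2.0, 3.0], cat -> ['a', 'a', 'a']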
# feature_columns_to_use = ['Pclass','Sex','Age','Fare','Parch']
feature_columns_to_use = ['Pclass','Sex','Age','Fare','Parch','SibSp','Ticket','Cabin','Embarked']
nonnumeric_columns = ['Sex','Ticket','Cabin','Embarked']
# Join the features from train and test together before imputing missing values,
# in case their distributions are slightly different
# (DataFrame.append was removed from pandas; pd.concat does the same job)
big_X = pd.concat([train_df[feature_columns_to_use], test_df[feature_columns_to_use]],
                  ignore_index=True)
big_X_imputed = DataFrameImputer().fit_transform(big_X)
# XGBoost doesn't (yet) handle categorical features automatically, so we need to change
# them to columns of integer values.
# See http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing for more
# details and options
le = LabelEncoder()
for feature in nonnumeric_columns:
    big_X_imputed[feature] = le.fit_transform(big_X_imputed[feature])
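# Aside: re-fitting one LabelEncoder per column works here because we only need
# the transformed values. An equivalent pandas-only sketch (an alternative, not
# what this gist uses) would be:
# for feature in nonnumeric_columns:
#     big_X_imputed[feature] = big_X_imputed[feature].astype('category').cat.codes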
# Prepare the inputs for the model
# (.as_matrix() was removed from pandas; .values is the modern equivalent)
train_X = big_X_imputed.iloc[0:train_df.shape[0]].values
test_X = big_X_imputed.iloc[train_df.shape[0]:].values
train_y = train_df['Survived']
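# Optional sanity check: the two slices should add back up to the original frames.
# assert train_X.shape[0] == train_df.shape[0]
# assert test_X.shape[0] == test_df.shape[0]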
### XGBoost baseline
gbm_baseline_clf = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)

### XGBoost
gbm_clf = xgb.XGBClassifier(max_depth=5, n_estimators=320, learning_rate=0.05)
### SVM
from sklearn import svm
svm_clf = svm.SVC(gamma=0.005, C=200)
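# Note: SVMs are sensitive to feature scale, and the features here are unscaled.
# A possible alternative (not what this gist does) is to wrap the SVC in a
# scaling pipeline:
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# svm_scaled_clf = make_pipeline(StandardScaler(), svm.SVC(gamma=0.005, C=200))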
### Random forest
from sklearn.ensemble import RandomForestClassifier
#rf = RandomForestClassifier()  # default parameters
rf = RandomForestClassifier(n_estimators=500)
# This function trains the classifiers N times. Each time it splits the training
# set into train/test parts, fits each classifier on the train part, and records
# MSE and accuracy on the held-out part.
mse = [[], [], [], []]       # one list per classifier: baseline XGBoost, tuned XGBoost, SVM, random forest
accuracy = [[], [], [], []]

def do_folds(folds=2):
    for f in range(0, folds):
        ### Split
        X_train, X_test, y_train, y_test = train_test_split(train_X, train_y,
                                                            test_size=0.10)
        gbm_baseline = gbm_baseline_clf.fit(X_train, y_train)
        gbm_baseline_predictions = gbm_baseline.predict(X_test)
        gbm = gbm_clf.fit(X_train, y_train)
        gbm_predictions = gbm.predict(X_test)
        svm_clf.fit(X_train, y_train)
        svm_clf_predictions = svm_clf.predict(X_test)
        rf.fit(X_train, y_train)
        rf_clf_predict = rf.predict(X_test)
        mse[0].append(mean_squared_error(y_test, gbm_baseline_predictions))
        mse[1].append(mean_squared_error(y_test, gbm_predictions))
        mse[2].append(mean_squared_error(y_test, svm_clf_predictions))
        mse[3].append(mean_squared_error(y_test, rf_clf_predict))
        accuracy[0].append(accuracy_score(y_test, gbm_baseline_predictions))
        accuracy[1].append(accuracy_score(y_test, gbm_predictions))
        accuracy[2].append(accuracy_score(y_test, svm_clf_predictions))
        accuracy[3].append(accuracy_score(y_test, rf_clf_predict))
# We train our models 50 times and record MSE and accuracy each time
do_folds(50)
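# A more standard sketch for the same measurement (an alternative to the manual
# loop above, assuming scikit-learn's model_selection module is available):
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(gbm_clf, train_X, train_y, cv=10, scoring='accuracy')
# print(scores.mean(), scores.std())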
# Show accuracy for each classifier AND plot the distributions
print(np.mean(accuracy, axis=1))
print(np.std(accuracy, axis=1))
# seaborn's distplot has been removed; histplot + rugplot give the same picture
sns.histplot(accuracy[0], bins=20, color='r')
sns.rugplot(accuracy[0], color='r')
plt.show()
sns.histplot(accuracy[1], bins=20, color='r')
sns.rugplot(accuracy[1], color='r')
plt.show()
sns.histplot(accuracy[2], bins=20, color='b')
sns.rugplot(accuracy[2], color='b')
plt.show()
sns.histplot(accuracy[3], bins=20, color='g')
sns.rugplot(accuracy[3], color='g')
plt.show()
# Show mean MSE for each classifier AND plot the distributions
print(np.mean(mse, axis=1))
sns.histplot(mse[0], bins=20, color='r')
sns.rugplot(mse[0], color='r')
plt.show()
sns.histplot(mse[1], bins=20, color='r')
sns.rugplot(mse[1], color='r')
plt.show()
sns.histplot(mse[2], bins=20, color='b')
sns.rugplot(mse[2], color='b')
plt.show()
sns.histplot(mse[3], bins=20, color='g')
sns.rugplot(mse[3], color='g')
plt.show()
# Store the XGBoost prediction result
final_clf = gbm_baseline_clf.fit(train_X, train_y)
predictions = final_clf.predict(test_X)
# Kaggle needs the submission to have a certain format;
# see https://www.kaggle.com/c/titanic-gettingStarted/download/gendermodel.csv
# for an example of what it's supposed to look like.
submission = pd.DataFrame({'PassengerId': test_df['PassengerId'],
                           'Survived': predictions})
submission.to_csv("submission.csv", index=False)
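# The resulting CSV should look like this (values illustrative):
# PassengerId,Survived
# 892,0
# 893,1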