# -*- coding: utf-8 -*-
# Reference: https://qiita.com/upura/items/3c10ff6fed4e7c3d70f0
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_orig = test
data = pd.concat([train, test], sort=True)
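# train and test are concatenated above so that the same encodings and
# imputations are applied to both at once; the test rows carry NaN in 'Survived'.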
# preprocessing
# Encode Sex as 0/1
data['Sex'].replace(['male', 'female'], [0, 1], inplace=True)
# Fill missing Embarked with the most common port ('S'), then encode
data['Embarked'].fillna('S', inplace=True)
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 2, 'Q': 1}).astype(int)
# Fill missing Fare with the mean fare
data['Fare'].fillna(np.mean(data['Fare']), inplace=True)
age_avg = data['Age'].mean()
age_std = data['Age'].std()
data['Age'].fillna(np.random.randint(age_avg - age_std, age_avg + age_std), inplace=True)
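# Note: the fillna call above draws a single random integer from
# [age_avg - age_std, age_avg + age_std) and uses that same value for every
# missing Age. A per-row variant (a sketch, not what this script does) would be:
# missing = data['Age'].isnull()
# data.loc[missing, 'Age'] = np.random.randint(
#     age_avg - age_std, age_avg + age_std, size=missing.sum())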
# Feature engineering
data['Family_Size'] = data['Parch'] + data['SibSp'] + 1
# data['Family_Size_Feature'] = (data['Family_Size'] >= 5) | (data['Family_Size'] <= 1)
# data['PclassIs3'] = data['Pclass'] == 3
# data['Age2030'] = (data['Age'] >= 20) | (data['Age'] <= 30)
# data['Age60'] = data['Age'] >= 60
# remove unnecessary columns
delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']  # , 'Pclass'
data.drop(delete_columns, axis=1, inplace=True)
train = data[:len(train)]
test = data[len(train):]
y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)
print(train.head())
# ML
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from xgboost import XGBClassifier
# clf = GaussianNB()
# Choose the hyperparameter by cross-validation
c_candidates = list(range(1, 10))
best_score = 0.0
best_param = None
for c in c_candidates:
    # clf = LogisticRegression(penalty='l1', solver="liblinear", random_state=0, C=c,
    #                          max_iter=5000)
    clf = XGBClassifier(random_state=42, max_depth=c)
    # Build the pipeline (scaler -> classifier)
    pipeline = Pipeline([
        # ('scaler', PowerTransformer()),
        ('reg', clf)
    ])
    # Cross validation
    stratified_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X_train, y_train, cv=stratified_k_fold)
    mean = np.mean(scores)
    print('Cross-validation scores: {}'.format(scores))
    print('max_depth={}, Mean score: {}'.format(c, mean))
    if best_score < mean:
        best_score = mean
        best_param = c
print('Best max_depth={}, best CV score={}'.format(best_param, best_score))
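# The loop above is a small manual grid search over max_depth. An equivalent
# sketch using sklearn's GridSearchCV (an alternative, not part of the
# original script) would be:
# from sklearn.model_selection import GridSearchCV
# search = GridSearchCV(pipeline, {'reg__max_depth': c_candidates},
#                       cv=stratified_k_fold)
# search.fit(X_train, y_train)
# print(search.best_params_, search.best_score_)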
# Rebuild the pipeline with the best hyperparameter and fit on all training data
# clf = LogisticRegression(penalty='l1', solver="liblinear", random_state=0,
#                          C=best_param, max_iter=5000)
clf = XGBClassifier(random_state=42, max_depth=best_param)
pipeline = Pipeline([
    ('reg', clf)
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# submit
sub = pd.DataFrame(test_orig['PassengerId'])
sub['Survived'] = list(map(int, y_pred))
sub.to_csv("submission.csv", index=False)
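# The resulting submission.csv has two columns, PassengerId and Survived,
# which is the format expected by the Kaggle Titanic competition.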