Skip to content

Instantly share code, notes, and snippets.

@koba-e964
Created April 16, 2019 01:12
Show Gist options
  • Save koba-e964/fe183af0c7e46b810dad0d3fcdce8e9b to your computer and use it in GitHub Desktop.
Save koba-e964/fe183af0c7e46b810dad0d3fcdce8e9b to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# Reference: https://qiita.com/upura/items/3c10ff6fed4e7c3d70f0
import pandas as pd
import numpy as np
import csv as csv
from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline
# Load the raw data and stack train + test so preprocessing is applied uniformly
# to both splits.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
test_orig = test  # untouched copy, kept for PassengerId in the submission file
data = pd.concat([train, test], sort=True)

# --- Preprocessing ---
# Encode sex as an integer (male=0, female=1).
data['Sex'] = data['Sex'].replace(['male', 'female'], [0, 1])
# Fill missing embarkation ports with the most common value ('S'), then encode
# as integers (S=0, Q=1, C=2 — this particular ordering is the original author's).
data['Embarked'] = data['Embarked'].fillna('S')
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 2, 'Q': 1}).astype(int)
# Fill the missing fare(s) with the overall mean fare.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
# Fill missing ages with random integers drawn from [mean - std, mean + std).
# BUGFIX: the original called np.random.randint once, so every missing row got
# the SAME value, and the draw was unseeded (non-reproducible). Here each
# missing row gets its own draw from a seeded RNG.
age_avg = data['Age'].mean()
age_std = data['Age'].std()
rng = np.random.RandomState(42)
age_null = data['Age'].isnull()
data.loc[age_null, 'Age'] = rng.randint(
    int(age_avg - age_std), int(age_avg + age_std), size=int(age_null.sum()))

# --- Feature engineering ---
# Family size = siblings/spouses + parents/children + the passenger themself.
data['Family_Size'] = data['Parch'] + data['SibSp'] + 1

# Drop columns that are not used as model features.
delete_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Ticket', 'Cabin']
data.drop(delete_columns, axis=1, inplace=True)

# Split back into train/test; 'Survived' is NaN for the test rows.
train = data[:len(train)]
test = data[len(train):]
y_train = train['Survived']
X_train = train.drop('Survived', axis=1)
X_test = test.drop('Survived', axis=1)
print(train.head())
# --- Machine learning ---
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, StratifiedKFold
from xgboost import XGBClassifier

# Choose the XGBoost max_depth hyperparameter by 5-fold stratified
# cross-validation over the candidate values 1..9.
c_candidates = list(range(1, 10))
best_score = 0.0
# BUGFIX: was initialized to {} — not a valid max_depth if no candidate ever
# beats 0.0. Fall back to the first candidate instead.
best_param = c_candidates[0]
# The splitter is deterministic (fixed seed), so build it once outside the loop.
stratified_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for c in c_candidates:
    clf = XGBClassifier(random_state=42, max_depth=c)
    # Build the pipeline (a scaler step could be inserted before the model).
    pipeline = Pipeline([
        ('reg', clf),
    ])
    scores = cross_val_score(pipeline, X_train, y_train, cv=stratified_k_fold)
    mean = np.mean(scores)
    print('Cross-validation scores: {}'.format(scores))
    print('C={}, Mean score: {}'.format(c, mean))
    if best_score < mean:
        best_score = mean
        best_param = c
print('{} {}'.format(best_param, best_score))

# Refit on the full training set with the best max_depth found above.
clf = XGBClassifier(random_state=42, max_depth=best_param)
pipeline = Pipeline([
    ('reg', clf),
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# --- Submission ---
# Pair the original PassengerId column with the integer predictions.
sub = pd.DataFrame(test_orig['PassengerId'])
sub['Survived'] = list(map(int, y_pred))
sub.to_csv("submission.csv", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment