import pandas as pd

# Load the training data and take a first look
df = pd.read_csv('/kaggle/input/titanic/train.csv')
df.head()
print('shape:', df.shape)
print('columns:', df.columns)

# Column selection, row filtering, and basic aggregation
df[['PassengerId', 'Survived']]
df[['PassengerId', 'Survived', 'Fare']].query('Fare > 10')
df[df['Fare'] > 10][['PassengerId', 'Survived', 'Fare']]
df[['PassengerId', 'Survived']].loc[880:885]
df[['Pclass', 'Fare']].groupby(['Pclass']).mean()
df[['Age', 'SibSp', 'Parch', 'Fare']].describe()
(9) Check the distinct values of categorical (qualitative) data
print(df['Pclass'].unique())
print(type(df['Pclass']))

# Compare row counts before and after dropping rows with missing values
print('df count:', df[['PassengerId']].count())
df_a = df.dropna()
print('df_a count:', df_a[['PassengerId']].count())

# Fill missing Age with the median and missing Cabin with a placeholder string
df_b = df.fillna({'Age': df['Age'].median(), 'Cabin': 'NA'})
df_b.tail()
# index=False so the index is not re-read later as an extra 'Unnamed: 0' column
df_b.to_csv('./train_fillna.csv', index=False)
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn
# Passenger counts per class, then per (class, survived) pair
pclass_df = df[['Pclass', 'PassengerId']].groupby(['Pclass']).count()
pclass_df
pclass_df = df[['Pclass', 'Survived', 'PassengerId']].groupby(['Pclass', 'Survived']).count()
pclass_df

# Pivot Survived into columns and draw a stacked bar chart
pclass_df = pclass_df.unstack()
pclass_df
pclass_df.plot.bar(stacked=True)
plt.hist(
    [df[df['Survived'] == 0]['Age'].dropna().values,
     df[df['Survived'] == 1]['Age'].dropna().values],
    histtype='barstacked', bins=8, label=['Death', 'Survive']
)  # dropna: histogram bin edges cannot be computed while Age contains NaN
plt.legend()
df[['Pclass', 'Survived']].corr()

import seaborn as sns
# numeric_only=True (pandas >= 1.5) keeps non-numeric columns such as Name and Sex
# out of the correlation matrix; older pandas skipped them automatically
sns.heatmap(df.corr(numeric_only=True), annot=True)
# One-hot encode the categorical columns
df2 = pd.get_dummies(df[['Sex', 'Embarked']])
df2

# Replace the original categorical columns with the dummies (Sex_female is dropped as redundant)
df3 = pd.concat([
    df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin', 'Sex', 'Embarked'], axis=1),
    df2.drop(['Sex_female'], axis=1)
], axis=1)
df3

plt.figure(figsize=(9, 9))
sns.heatmap(df3.corr(), annot=True)
from sklearn.preprocessing import LabelEncoder

# Label-encode the categorical columns in place
df4 = df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
for category in ['Sex', 'Embarked']:
    le = LabelEncoder()
    # astype(str) turns the two missing Embarked values into the label 'nan';
    # LabelEncoder cannot sort a mix of strings and NaN floats
    le.fit(df4[category].astype(str))
    df4[category] = le.transform(df4[category].astype(str))
df4

plt.figure(figsize=(9, 9))
sns.heatmap(df4.corr(), annot=True)
from sklearn.model_selection import train_test_split

# Hold out 20% of the data for validation
X = df4.drop('Survived', axis=1).values
y = df4['Survived'].values
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=0)

from xgboost import XGBClassifier

# Train a small gradient-boosted model and evaluate it on the hold-out set
model = XGBClassifier(n_estimators=5, eval_metric='logloss')
model.fit(X_train, y_train)
model.score(X_valid, y_valid)

pred = model.predict(X_valid)
pred
pred_proba = model.predict_proba(X_valid)
pred_proba

from sklearn.metrics import accuracy_score, log_loss
print('accuracy:', accuracy_score(y_valid, pred))
print('logloss:', log_loss(y_valid, pred_proba))
from sklearn.model_selection import KFold
import numpy as np

# 5-fold cross-validation with the same small model
X = df4.drop(['Survived'], axis=1)
y = df4['Survived']
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scores = []
for train_idx, valid_idx in kf.split(X):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    model = XGBClassifier(n_estimators=5, eval_metric='logloss')
    model.fit(X_train, y_train)
    pred_proba = model.predict_proba(X_valid)
    logloss = log_loss(y_valid, pred_proba)
    print('logloss:', logloss)
    scores.append(logloss)

print(model)
print('total logloss:', np.mean(scores))
from sklearn.model_selection import GridSearchCV

# Grid search over tree depth, number of trees, and learning rate with 5-fold CV
X = df4.drop(['Survived'], axis=1).values
y = df4['Survived'].values
model = XGBClassifier(eval_metric='logloss')
model_cv = GridSearchCV(
    model,
    {'max_depth': [4, 6, 8],
     'n_estimators': [3, 5, 10],
     'learning_rate': [0.2, 0.3, 0.4]},
    cv=5, verbose=1)
model_cv.fit(X, y)
print(model_cv.best_params_, model_cv.best_score_)
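The next cell hardcodes the tuned values; as a variation, the parameters found by the grid search can also be unpacked straight into the classifier. A minimal sketch, assuming model_cv and the earlier train/validation split are still in memory:
# Sketch only, not part of the original flow: reuse the grid-search result directly
best_model = XGBClassifier(eval_metric='logloss', **model_cv.best_params_)
best_model.fit(X_train, y_train)
best_model.score(X_valid, y_valid)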
Train the model with the automatically tuned hyperparameters
# Rerun the cross-validation loop with the tuned hyperparameters, keeping each fold's model
X = df4.drop(['Survived'], axis=1)
y = df4['Survived']
kf = KFold(n_splits=5, shuffle=True, random_state=0)
scores = []
models = []
for train_idx, valid_idx in kf.split(X):
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    model = XGBClassifier(learning_rate=0.4, max_depth=8, n_estimators=10, eval_metric='logloss')
    model.fit(X_train, y_train)
    models.append(model)
    pred_proba = model.predict_proba(X_valid)
    logloss = log_loss(y_valid, pred_proba)
    print('logloss:', logloss)
    scores.append(logloss)

print('total logloss:', np.mean(scores))
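The per-fold models collected in models are not used again below. As an aside (a sketch only, not part of the original flow), their predicted probabilities could be averaged instead of retraining a single model:
import numpy as np
# Average the five fold models' probabilities; X_valid (the last fold's validation split)
# is just an example input (four of the five models saw parts of it during training)
ensemble_proba = np.mean([m.predict_proba(X_valid)[:, 1] for m in models], axis=0)
ensemble_pred = (ensemble_proba > 0.5).astype(int)
ensemble_pred[:10]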
import joblib

# Train a final model (here on the last fold's training split) and save it to disk
model = XGBClassifier(learning_rate=0.4, max_depth=8, n_estimators=10, eval_metric='logloss')
model.fit(X_train, y_train)
joblib.dump(model, './model.joblib')
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')
test_df2 = test_df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
for category in ['Sex', 'Embarked']:
    le = LabelEncoder()
    # Note: fitting a fresh encoder on the test set only yields the same codes as training
    # because the categories happen to sort identically; reusing the training encoders is safer
    le.fit(test_df2[category])
    test_df2[category] = le.transform(test_df2[category])
test_df2

model = joblib.load('./model.joblib')
pred_proba = model.predict_proba(test_df2.values)[:, 1]
survived = (pred_proba > 0.5).astype(int)  # equivalent to model.predict at a 0.5 threshold
survived
submission_df = pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
submission_df
# Overwrite the sample submission's Survived column with our predictions
submission_df['Survived'] = survived
submission_df
submission_df.to_csv('./titanic_submit.csv', index=False)
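A quick sanity check on the saved file before submitting (a sketch; the expected shape assumes the standard 418-row Kaggle test set):
check_df = pd.read_csv('./titanic_submit.csv')
print(check_df.shape)             # expected: (418, 2)
print(check_df.columns.tolist())  # expected: ['PassengerId', 'Survived']
check_df.head()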
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Reload the NaN-filled training data and encode the categorical columns
df_b = pd.read_csv('./train_fillna.csv')
df_b = df_b.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
for category in ['Sex', 'Embarked']:
    le = LabelEncoder()
    # astype(str) again handles the two Embarked values that are still missing
    le.fit(df_b[category].astype(str))
    df_b[category] = le.transform(df_b[category].astype(str))

X = df_b.drop('Survived', axis=1).values
y = df_b['Survived'].values
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=0)

# Baseline comparison: a single decision tree ...
model = DecisionTreeClassifier(max_depth=3)
model.fit(X_train, y_train)
model.score(X_valid, y_valid)

# ... and a small random forest
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=3, n_estimators=3)
model.fit(X_train, y_train)
model.score(X_valid, y_valid)