Created
March 20, 2017 05:03
-
-
Save saboyutaka/e338733e50203d6990d92e5b24fd0212 to your computer and use it in GitHub Desktop.
Kaggle titanicで機械学習
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"_cell_guid": "18cd64c8-c9bc-7fa5-2b40-3542a8601a42" | |
}, | |
"source": [ | |
"# kaggle - Titanic: Machine Learning from Disaster" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 23, | |
"metadata": { | |
"_cell_guid": "7d7c050a-157f-c1ac-d54f-ef9cdba5606a", | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 39, | |
"metadata": { | |
"_cell_guid": "bcb77017-6036-8963-9096-996a569b900a", | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# データの読み込み\n", | |
"df_train = pd.read_csv('data/train.csv') # トレーニングデータ\n", | |
"df_test = pd.read_csv('data/test.csv') # テストデータ" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 40, | |
"metadata": { | |
"_cell_guid": "04fbca24-f4b7-6c05-49f3-df6e68abf145", | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# 学習モデルは数値しか扱えない。Sexはmale, femailと入っているので数値化する\n", | |
"\n", | |
"# SexId を追加\n", | |
"df_train['SexId'] = df_train['Sex'].map({'male': 1, 'female': 0})\n", | |
"df_test['SexId'] = df_test['Sex'].map({'male': 1, 'female': 0})" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": { | |
"_cell_guid": "2e9ae43d-a300-f794-8b3b-cfc42b0f69f7", | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# 2つのSeriasを合わせてみる\n", | |
"\n", | |
"# FamilySize = SibSp + Parch\n", | |
"df_train['FamilySize'] = df_train['SibSp'] + df_train['Parch']\n", | |
"df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 42, | |
"metadata": { | |
"_cell_guid": "cf34978b-4822-f91e-00e1-b5050c7faa53", | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# 機械学習は欠損値(NaN)を扱えない。\n", | |
"# 欠損値を扱う方法は\n", | |
"# * 欠損値のあるデータを取り除く\n", | |
"# * 欠損値をある数値で埋める\n", | |
"# * 中央値で埋める\n", | |
"# * 平均値で埋める\n", | |
"# * 0で埋める\n", | |
"\n", | |
"# Ageの欠損値を中央値で補完する\n", | |
"df_train['AgeNull'] = df_train['Age'].isnull()\n", | |
"age_median = df_train['Age'].median()\n", | |
"df_train['Age'].fillna(age_median, inplace=True)\n", | |
"df_test['Age'].fillna(age_median, inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": { | |
"_cell_guid": "2ffc3c77-9591-a35f-5d57-a1345016347a", | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"inputs = ['FamilySize', 'SexId', 'Age']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"_cell_guid": "0b0633ad-f59b-a367-361c-747d2737fdb7", | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# X_train: 学習用の説明変数\n", | |
"# y_train: 学習用の目的変数\n", | |
"\n", | |
"X_train = df_train[inputs].values.astype('float32')\n", | |
"y_train = df_train['Survived'].values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"_cell_guid": "6b1e99e9-59ee-2734-7b70-4ad6f3bfce5e", | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# X_test: 予測するデータの説明変数\n", | |
"X_test = df_test[['SexId', 'Age']].values.astype('float32')\n", | |
"\n", | |
"# y_test: ないの?y_testを予測するのが機械学習のやりたいこと!" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"_cell_guid": "8b44b560-2856-ab91-ecb8-d7b4c422ec46" | |
}, | |
"source": [ | |
"# 機械学習" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"_cell_guid": "d145c5ce-bb78-569d-ce8a-75c50d5a44b6" | |
}, | |
"source": [ | |
"### RandomForestClassifier" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# ランダムフォレスト\n", | |
"from sklearn.ensemble import RandomForestClassifier" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 47, | |
"metadata": { | |
"_cell_guid": "b1254bdd-950c-04c4-47ea-c7174e57d668", | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# モデルの構築\n", | |
"model = RandomForestClassifier(random_state=42)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 48, | |
"metadata": { | |
"_cell_guid": "73a74338-4d4c-2d11-16f3-bfa4bc4adb6e", | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n", | |
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n", | |
" min_samples_leaf=1, min_samples_split=2,\n", | |
" min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,\n", | |
" oob_score=False, random_state=42, verbose=0, warm_start=False)" | |
] | |
}, | |
"execution_count": 48, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# モデルの学習\n", | |
"model.fit(X_train, y_train)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 49, | |
"metadata": { | |
"_cell_guid": "6cfcf4e5-286c-5909-09bb-3a0c6983a23f", | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"mean accuracy (train): 0.8676\n" | |
] | |
} | |
], | |
"source": [ | |
"# トレーニングデータに対する予測精度\n", | |
"print(\"mean accuracy (train): {0:.4f}\".format(model.score(X_train, y_train)))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"_change_revision": 501, | |
"_is_fork": false, | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [default]", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment