Created
March 11, 2015 04:06
-
-
Save hagino3000/9c8c0b71b6302ca28f25 to your computer and use it in GitHub Desktop.
Kaggle Titanic Competition
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 358, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"from sklearn.cross_validation import train_test_split, cross_val_score, KFold\n", | |
"from sklearn.linear_model import LogisticRegression\n", | |
"from sklearn.linear_model import LinearRegression\n", | |
"from sklearn.svm import LinearSVC\n", | |
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n", | |
"from sklearn.tree import DecisionTreeClassifier\n", | |
"from sklearn.ensemble import RandomForestClassifier" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 342, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def plot_confusion_matrix(cm):\n", | |
" fig, ax = plt.subplots()\n", | |
" im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)\n", | |
" ax.set_title('Confusion Matrix')\n", | |
" fig.colorbar(im)\n", | |
"\n", | |
" target_names = ['not survived', 'survived']\n", | |
"\n", | |
" tick_marks = np.arange(len(target_names))\n", | |
" ax.set_xticks(tick_marks)\n", | |
" ax.set_xticklabels(target_names, rotation=45)\n", | |
" ax.set_yticks(tick_marks)\n", | |
" ax.set_yticklabels(target_names)\n", | |
" ax.set_ylabel('True label')\n", | |
" ax.set_xlabel('Predicted label')\n", | |
" fig.tight_layout()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 343, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"df_train = pd.read_csv('./train.csv')\n", | |
"df_test = pd.read_csv('./test.csv')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 344, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>Survived</th>\n", | |
" <th>Pclass</th>\n", | |
" <th>Name</th>\n", | |
" <th>Sex</th>\n", | |
" <th>Age</th>\n", | |
" <th>SibSp</th>\n", | |
" <th>Parch</th>\n", | |
" <th>Ticket</th>\n", | |
" <th>Fare</th>\n", | |
" <th>Cabin</th>\n", | |
" <th>Embarked</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 0</td>\n", | |
" <td> 3</td>\n", | |
" <td> Braund, Mr. Owen Harris</td>\n", | |
" <td> male</td>\n", | |
" <td> 22</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> A/5 21171</td>\n", | |
" <td> 7.2500</td>\n", | |
" <td> NaN</td>\n", | |
" <td> S</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n", | |
" <td> female</td>\n", | |
" <td> 38</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> PC 17599</td>\n", | |
" <td> 71.2833</td>\n", | |
" <td> C85</td>\n", | |
" <td> C</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" Survived Pclass Name \\\n", | |
"0 0 3 Braund, Mr. Owen Harris \n", | |
"1 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... \n", | |
"\n", | |
" Sex Age SibSp Parch Ticket Fare Cabin Embarked \n", | |
"0 male 22 1 0 A/5 21171 7.2500 NaN S \n", | |
"1 female 38 1 0 PC 17599 71.2833 C85 C " | |
] | |
}, | |
"execution_count": 344, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df_train.drop('PassengerId', axis=1, inplace=True)\n", | |
"df_train.head(2)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 345, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def _extract_title(name):\n", | |
" if name.find('Mr.') > 0:\n", | |
" return 'Mr'\n", | |
" elif name.find('Mrs.') > 0:\n", | |
" return 'Mrs'\n", | |
" elif name.find('Master.') > 0:\n", | |
" return 'Master'\n", | |
" elif name.find('Miss.') > 0:\n", | |
" return 'Miss'\n", | |
" else:\n", | |
" return None\n", | |
" \n", | |
"def extract_title(df):\n", | |
" df['Title'] = df.Name.apply(lambda n: _extract_title(n))\n", | |
" title_bin = pd.get_dummies(df.Title)\n", | |
" title_bin.rename(columns=lambda x: 'title' + \"_\" + str(x), inplace=True)\n", | |
" df = df.join(title_bin)\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 346, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def fill_fare(df):\n", | |
" df['Fare'].fillna(0, inplace=True)\n", | |
" df['FareFill'] = df.Fare\n", | |
" df.FareFill[(df.Fare == 0) & (df.Pclass == 1)] = 86\n", | |
" df.FareFill[(df.Fare == 0) & (df.Pclass == 2)] = 21\n", | |
" df.FareFill[(df.Fare == 0) & (df.Pclass == 3)] = 13\n", | |
" df.FareFill = df.FareFill.apply(lambda f:np.log(f))\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 347, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def fill_age(df):\n", | |
" df['AgeFill'] = df.Age\n", | |
" df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 1)] = 40\n", | |
" df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 2)] = 31\n", | |
" df.AgeFill[df.Age.isnull() & (df.Sex == 'male') & (df.Pclass == 3)] = 26\n", | |
" df.AgeFill[df.Age.isnull() & (df.Title == 'Master')] = 3.5\n", | |
" \n", | |
" df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 1)] = 41.5\n", | |
" df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 2)] = 32\n", | |
" df.AgeFill[df.Age.isnull() & (df.Title == 'Mrs') & (df.Pclass == 3)] = 31\n", | |
" df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 1)] = 30\n", | |
" df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 2)] = 24\n", | |
" df.AgeFill[df.Age.isnull() & (df.Title == 'Miss') & (df.Pclass == 3)] = 18\n", | |
" df.AgeFill[df.AgeFill.isnull() & (df.Sex == 'female')] = 30\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 348, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def extract_pclass(df):\n", | |
" pclass_new = pd.get_dummies(df.Pclass)\n", | |
" pclass_new.rename(columns=lambda x: 'pclass' + \"_\" + str(x), inplace=True)\n", | |
" df = df.join(pclass_new)\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 349, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def extract_parch(df):\n", | |
" dm = pd.get_dummies(df.Parch.apply(lambda p: min(p, 4)))\n", | |
" dm.rename(columns=lambda x: 'parch' + \"_\" + str(x), inplace=True)\n", | |
" df = df.join(dm)\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 350, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def extract_sibsp(df):\n", | |
" dm = pd.get_dummies(df.SibSp.apply(lambda s: min(s, 4)))\n", | |
" dm.rename(columns=lambda x: 'sibsp' + \"_\" + str(x), inplace=True)\n", | |
" df = df.join(dm)\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 351, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def convert_sex(df):\n", | |
" df['male'] = df.Sex.apply(lambda s: 0 if s == 'male' else 1)\n", | |
" df['female'] = df.Sex.apply(lambda s: 1 if s == 'male' else 0)\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 352, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def extract_feature(df):\n", | |
" df = extract_title(df)\n", | |
" df = fill_age(df)\n", | |
" df = extract_pclass(df)\n", | |
" df = extract_sibsp(df)\n", | |
" df = extract_parch(df)\n", | |
" df = convert_sex(df)\n", | |
" df = fill_fare(df)\n", | |
" cols = df.columns\n", | |
" drop_cols = set(cols).intersection(set(['PassengerId', 'Title', 'Name', 'SibSp', 'Ticket', 'Fare', 'Pclass', 'Survived', 'Parch', 'Sex', 'Age', 'Ticket', 'Cabin', 'Embarked', 'CCabin']))\n", | |
" return df.drop(drop_cols, axis=1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 353, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_classifier():\n", | |
" clf = LogisticRegression(C=100, penalty='l2', tol=0.01)\n", | |
" #clf = RandomForestClassifier()\n", | |
" #clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=2)\n", | |
" return clf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 354, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def calc_classifier(df, clf=None):\n", | |
" X_train = extract_feature(df)\n", | |
" y_train = df['Survived']\n", | |
" X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=0.8, random_state=42)\n", | |
" print('Num of Training Samples: {}'.format(len(X_train)))\n", | |
" print('Num of Validation Samples: {}'.format(len(X_val)))\n", | |
" \n", | |
" if clf is None:\n", | |
" clf = get_classifier()\n", | |
" clf.fit(X_train, y_train)\n", | |
" y_train_pred = clf.predict(X_train)\n", | |
" y_val_pred = clf.predict(X_val)\n", | |
" print('Accuracy on Training Set: {:.3f}'.format(accuracy_score(y_train, y_train_pred)))\n", | |
" print('Accuracy on Validation Set: {:.3f}'.format(accuracy_score(y_val, y_val_pred)))\n", | |
" cm = confusion_matrix(y_val, y_val_pred)\n", | |
" return clf" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 355, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def cross_val(X, y, K, random_state=0, clf=None, ):\n", | |
" if clf is None:\n", | |
" clf = get_classifier()\n", | |
" cv = KFold(len(y), K, shuffle=True, random_state=random_state)\n", | |
" scores = cross_val_score(clf, X, y, cv=cv)\n", | |
" print('Scores:', scores)\n", | |
" print('Mean Score: {0:.3f} (+/-{1:.3f})'.format(scores.mean(), scores.std()*2))\n", | |
" return scores" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 356, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"X_train = extract_feature(df_train)\n", | |
"y_train = df_train.Survived" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 389, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Logistic Regression\n", | |
"('Scores:', array([ 0.78212291, 0.78089888, 0.79213483, 0.8258427 , 0.83707865]))\n", | |
"Mean Score: 0.804 (+/-0.047)\n", | |
"Linear Regression\n", | |
"('Scores:', array([ 0.44844448, 0.38164783, 0.40170421, 0.36806075, 0.48549067]))\n", | |
"Mean Score: 0.417 (+/-0.087)\n", | |
"Random Forest\n", | |
"('Scores:', array([ 0.79888268, 0.79213483, 0.82022472, 0.8258427 , 0.78089888]))\n", | |
"Mean Score: 0.804 (+/-0.034)\n", | |
"SVN (L1 regression)\n", | |
"('Scores:', array([ 0.81005587, 0.82022472, 0.8258427 , 0.81460674, 0.84269663]))\n", | |
"Mean Score: 0.823 (+/-0.023)\n", | |
"SVN (L2 regression and L1 loss)\n", | |
"('Scores:', array([ 0.82122905, 0.82022472, 0.8258427 , 0.8258427 , 0.84269663]))\n", | |
"Mean Score: 0.827 (+/-0.016)\n", | |
"SVN (L2)\n", | |
"('Scores:', array([ 0.82122905, 0.75842697, 0.82022472, 0.85393258, 0.84269663]))\n", | |
"Mean Score: 0.819 (+/-0.066)\n", | |
"SVN\n", | |
"('Scores:', array([ 0.79888268, 0.71910112, 0.75280899, 0.82022472, 0.83707865]))\n", | |
"Mean Score: 0.786 (+/-0.087)\n", | |
"Decision Tree\n", | |
"('Scores:', array([ 0.7877095 , 0.80898876, 0.80898876, 0.80337079, 0.8258427 ]))\n", | |
"Mean Score: 0.807 (+/-0.024)\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"array([ 0.7877095 , 0.80898876, 0.80898876, 0.80337079, 0.8258427 ])" | |
] | |
}, | |
"execution_count": 389, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"print \"Logistic Regression\"\n", | |
"cross_val(X_train, y_train, 5, clf=LogisticRegression(C=0.1, penalty='l2', tol=0.01))\n", | |
"print \"Linear Regression\"\n", | |
"cross_val(X_train, y_train, 5, clf=LinearRegression())\n", | |
"print \"Random Forest\"\n", | |
"cross_val(X_train, y_train, 5, clf=RandomForestClassifier())\n", | |
"print \"SVN (L1 regression)\"\n", | |
"cross_val(X_train, y_train, 5, clf=LinearSVC(penalty='l1', dual=False))\n", | |
"print \"SVN (L2 regression and L1 loss)\"\n", | |
"cross_val(X_train, y_train, 5, clf=LinearSVC(penalty='l2', loss='l1'))\n", | |
"print \"SVN (L2)\"\n", | |
"cross_val(X_train, y_train, 5, clf=LinearSVC(penalty='l2'))\n", | |
"print \"SVN\"\n", | |
"cross_val(X_train, y_train, 5, clf=LinearSVC())\n", | |
"print \"Decision Tree\"\n", | |
"cross_val(X_train, y_train, 5, clf=DecisionTreeClassifier(criterion='entropy', max_depth=3, min_samples_leaf=2))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 384, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Num of Training Samples: 712\n", | |
"Num of Validation Samples: 179\n", | |
"Accuracy on Training Set: 0.833\n", | |
"Accuracy on Validation Set: 0.821\n" | |
] | |
} | |
], | |
"source": [ | |
"clf = calc_classifier(df_train, clf=LinearSVC(penalty='l1', dual=False))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 385, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>title_Master</th>\n", | |
" <th>title_Miss</th>\n", | |
" <th>title_Mr</th>\n", | |
" <th>title_Mrs</th>\n", | |
" <th>AgeFill</th>\n", | |
" <th>pclass_1</th>\n", | |
" <th>pclass_2</th>\n", | |
" <th>pclass_3</th>\n", | |
" <th>sibsp_0</th>\n", | |
" <th>sibsp_1</th>\n", | |
" <th>...</th>\n", | |
" <th>sibsp_3</th>\n", | |
" <th>sibsp_4</th>\n", | |
" <th>parch_0</th>\n", | |
" <th>parch_1</th>\n", | |
" <th>parch_2</th>\n", | |
" <th>parch_3</th>\n", | |
" <th>parch_4</th>\n", | |
" <th>male</th>\n", | |
" <th>female</th>\n", | |
" <th>FareFill</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 22</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td>...</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1.981001</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 38</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td>...</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 4.266662</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 26</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 2.070022</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 35</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td>...</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 3.972177</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 35</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 2.085672</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 21 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" title_Master title_Miss title_Mr title_Mrs AgeFill pclass_1 pclass_2 \\\n", | |
"0 0 0 1 0 22 0 0 \n", | |
"1 0 0 0 1 38 1 0 \n", | |
"2 0 1 0 0 26 0 0 \n", | |
"3 0 0 0 1 35 1 0 \n", | |
"4 0 0 1 0 35 0 0 \n", | |
"\n", | |
" pclass_3 sibsp_0 sibsp_1 ... sibsp_3 sibsp_4 parch_0 parch_1 \\\n", | |
"0 1 0 1 ... 0 0 1 0 \n", | |
"1 0 0 1 ... 0 0 1 0 \n", | |
"2 1 1 0 ... 0 0 1 0 \n", | |
"3 0 0 1 ... 0 0 1 0 \n", | |
"4 1 1 0 ... 0 0 1 0 \n", | |
"\n", | |
" parch_2 parch_3 parch_4 male female FareFill \n", | |
"0 0 0 0 0 1 1.981001 \n", | |
"1 0 0 0 1 0 4.266662 \n", | |
"2 0 0 0 1 0 2.070022 \n", | |
"3 0 0 0 1 0 3.972177 \n", | |
"4 0 0 0 0 1 2.085672 \n", | |
"\n", | |
"[5 rows x 21 columns]" | |
] | |
}, | |
"execution_count": 385, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"X_train.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 386, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"Y = extract_feature(df_test)\n", | |
"df_test['Survived'] = clf.predict(Y)\n", | |
"submit_data = df_test[['PassengerId', 'Survived']]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 387, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>title_Master</th>\n", | |
" <th>title_Miss</th>\n", | |
" <th>title_Mr</th>\n", | |
" <th>title_Mrs</th>\n", | |
" <th>AgeFill</th>\n", | |
" <th>pclass_1</th>\n", | |
" <th>pclass_2</th>\n", | |
" <th>pclass_3</th>\n", | |
" <th>sibsp_0</th>\n", | |
" <th>sibsp_1</th>\n", | |
" <th>...</th>\n", | |
" <th>sibsp_3</th>\n", | |
" <th>sibsp_4</th>\n", | |
" <th>parch_0</th>\n", | |
" <th>parch_1</th>\n", | |
" <th>parch_2</th>\n", | |
" <th>parch_3</th>\n", | |
" <th>parch_4</th>\n", | |
" <th>male</th>\n", | |
" <th>female</th>\n", | |
" <th>FareFill</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 34.5</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 2.057860</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 47.0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td>...</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1.945910</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 62.0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 2.270836</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 27.0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td>...</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 2.159003</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 22.0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td>...</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 0</td>\n", | |
" <td> 1</td>\n", | |
" <td> 0</td>\n", | |
" <td> 2.508582</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>5 rows × 21 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" title_Master title_Miss title_Mr title_Mrs AgeFill pclass_1 pclass_2 \\\n", | |
"0 0 0 1 0 34.5 0 0 \n", | |
"1 0 0 0 1 47.0 0 0 \n", | |
"2 0 0 1 0 62.0 0 1 \n", | |
"3 0 0 1 0 27.0 0 0 \n", | |
"4 0 0 0 1 22.0 0 0 \n", | |
"\n", | |
" pclass_3 sibsp_0 sibsp_1 ... sibsp_3 sibsp_4 parch_0 parch_1 \\\n", | |
"0 1 1 0 ... 0 0 1 0 \n", | |
"1 1 0 1 ... 0 0 1 0 \n", | |
"2 0 1 0 ... 0 0 1 0 \n", | |
"3 1 1 0 ... 0 0 1 0 \n", | |
"4 1 0 1 ... 0 0 0 1 \n", | |
"\n", | |
" parch_2 parch_3 parch_4 male female FareFill \n", | |
"0 0 0 0 0 1 2.057860 \n", | |
"1 0 0 0 1 0 1.945910 \n", | |
"2 0 0 0 0 1 2.270836 \n", | |
"3 0 0 0 0 1 2.159003 \n", | |
"4 0 0 0 1 0 2.508582 \n", | |
"\n", | |
"[5 rows x 21 columns]" | |
] | |
}, | |
"execution_count": 387, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"Y.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 388, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"submit_data.to_csv('./submit_20150312_3.csv', index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.9" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment