{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#import xgboost\n",
    "import pandas\n",
    "from sklearn import preprocessing, cross_validation, grid_search, ensemble, linear_model, calibration\n",
    "import numpy\n",
    "import math\n",
    "#import graphviz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with open(\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/train.csv/train.csv\") as f:\n",
    "    df_train = pandas.read_csv(f)\n",
    "    df_train_processed = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "with open(\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/test.csv/test.csv\") as f:\n",
    "    df_test = pandas.read_csv(f)\n",
    "    df_test_processed = False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def feature_engineering(dataframe):\n",
    "    #count the non-NA cells in each row and attach the count as a new feature\n",
    "    #if 'non-na_count' not in dataframe:\n",
    "    new_df = pandas.DataFrame(dataframe.count(axis = 1), columns = ['non-na_count'])\n",
    "    return pandas.concat([dataframe, new_df], axis=1)"
   ]
  },
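  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check (illustrative only, on a toy frame) of the row-wise feature `feature_engineering` builds: `count(axis = 1)` returns the number of non-NA cells in each row."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#illustrative only: DataFrame.count(axis = 1) counts non-NA cells per row\n",
    "demo = pandas.DataFrame({'a': [1.0, None], 'b': [2.0, 3.0]})\n",
    "print demo.count(axis = 1).tolist() #[2, 1]"
   ]
  },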
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def feature_selection(dataframe, threshold):\n",
    "    list_98 = ['v17', 'v46', 'v26', 'v63', 'v71']\n",
    "    list_95 = ['v11', 'v25', 'v29', 'v83', 'v41', 'v43', 'v89', 'v64', 'v92', 'v97', 'v108']\n",
    "    list_90 = ['v8', 'v10', 'v13', 'v15', 'v20', 'v32', 'v33', 'v34', 'v54', 'v67', 'v109']\n",
    "    list_85 = ['v1', 'v44', 'v55', 'v60', 'v61', 'v76', 'v77', 'v94', 'v105', 'v111', 'v119']\n",
    "    list_80 = ['v4', 'v9', 'v14', 'v35', 'v51', 'v80', 'v87', 'v101', 'v121']\n",
    "    list_75 = ['v23', 'v49', 'v65', 'v85', 'v93']\n",
    "    list_70 = ['v2', 'v7', 'v18', 'v27', 'v48', 'v59', 'v73', 'v84', 'v123']\n",
    "    list_65 = ['v36', 'v47', 'v117']\n",
    "    list_60 = ['v45', 'v70', 'v86', 'v98']\n",
    "    list_55 = ['v5', 'v31', 'v42', 'v58', 'v102']\n",
    "    list_50 = ['v19', 'v37', 'v38', 'v57', 'v82', 'v95', 'v96', 'v99', 'v103', 'v104', 'v130', 'v106']\n",
    "    corr_list = [list_98, list_95, list_90, list_85, list_80, list_75, list_70, list_65, list_60, list_55, list_50]\n",
    "    for elem in corr_list[:threshold]:\n",
    "        dataframe.drop(elem, axis = 1, inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def preprocess(threshold):\n",
    "    #consider dropping: ['v112', 'v125', 'v74', 'v1', 'v110', 'v47']\n",
    "    global df_train_processed\n",
    "    global df_test_processed\n",
    "    if not df_train_processed:\n",
    "        df_train_processed = True\n",
    "        #df_train = feature_engineering(df_train) (needs global df_train if enabled)\n",
    "        #feature_selection(df_train, threshold)\n",
    "        label = df_train['target'].values\n",
    "        df_train.drop(['target', 'ID'], axis = 1, inplace = True)\n",
    "        df_train.drop('v22', axis = 1, inplace = True) #'v22' has over 16k distinct values\n",
    "\n",
    "        numeric_columns = df_train.describe().columns.tolist()\n",
    "\n",
    "        dtypes = df_train.dtypes\n",
    "        non_numeric = list()\n",
    "        for index, elem in enumerate(dtypes):\n",
    "            if elem != 'float64':\n",
    "                non_numeric.append(dtypes.index[index])\n",
    "\n",
    "    if not df_test_processed:\n",
    "        df_test_processed = True\n",
    "        #df_test = feature_engineering(df_test) (needs global df_test if enabled)\n",
    "        #feature_selection(df_test, threshold)\n",
    "        ids = df_test['ID'].values\n",
    "        df_test.drop('ID', axis = 1, inplace = True)\n",
    "        df_test.drop('v22', axis = 1, inplace = True)\n",
    "\n",
    "    #remove_correlations()\n",
    "    return non_numeric, label, ids\n",
    "\n",
    "non_numeric, label, ids = preprocess(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def label_encode(dataframe):\n",
    "    le = preprocessing.LabelEncoder()\n",
    "    for elem in non_numeric:\n",
    "        if elem in dataframe.columns:\n",
    "            #note: the encoder is refit per column and per dataframe, so the\n",
    "            #same category can get different codes in train and test\n",
    "            dataframe[elem] = le.fit_transform(dataframe[elem])\n",
    "            #print len(le.classes_)\n",
    "\n",
    "label_encode(df_train)\n",
    "label_encode(df_test)"
   ]
  },
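  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Because `label_encode` refits the encoder on each dataframe separately, the same category can receive different integer codes in train and test. A minimal sketch of a consistent alternative (`label_encode_consistent` is a hypothetical helper; it assumes NAs in the categorical columns are filled beforehand):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#sketch (assumes NAs in the categorical columns are filled beforehand):\n",
    "#fit one encoder per column on the union of train and test values so a\n",
    "#given category maps to the same integer in both frames\n",
    "def label_encode_consistent(train_df, test_df):\n",
    "    for elem in non_numeric:\n",
    "        if elem in train_df.columns and elem in test_df.columns:\n",
    "            le = preprocessing.LabelEncoder()\n",
    "            le.fit(pandas.concat([train_df[elem], test_df[elem]]))\n",
    "            train_df[elem] = le.transform(train_df[elem])\n",
    "            test_df[elem] = le.transform(test_df[elem])\n",
    "\n",
    "#label_encode_consistent(df_train, df_test)"
   ]
  },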
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def remove_correlations(lower):\n",
    "    #identify (not drop) pairs of columns whose correlation exceeds `lower`;\n",
    "    #only the upper triangle (row < column) is kept to avoid duplicate pairs\n",
    "    corr = df_train.corr()\n",
    "    drop = list()\n",
    "    for row in xrange(corr.shape[0]):\n",
    "        for column in xrange(corr.shape[1]):\n",
    "            if lower < corr.ix[row, column] and row < column:\n",
    "                drop.append((row, column))\n",
    "    return drop\n",
    "\n",
    "drop_tuples = remove_correlations(0.98)"
   ]
  },
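  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`remove_correlations` only returns positional `(row, column)` index pairs. A sketch that maps each pair back to a column name and drops one member (`drop_from_tuples` is a hypothetical helper, not part of the original notebook):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#sketch: drop the second member of each highly correlated pair;\n",
    "#drop_from_tuples is a hypothetical helper, not part of the original notebook\n",
    "def drop_from_tuples(dataframe, tuples):\n",
    "    columns = dataframe.columns\n",
    "    to_drop = set(columns[column] for row, column in tuples)\n",
    "    dataframe.drop(list(to_drop), axis = 1, inplace = True)\n",
    "\n",
    "#drop_from_tuples(df_train, drop_tuples)"
   ]
  },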
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def ids_label_save():\n",
    "    numpy.savetxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/label\", label)\n",
    "    numpy.savetxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/ids\", ids)\n",
    "\n",
    "ids_label_save()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def ids_label_load():\n",
    "    #return the loaded arrays; assigning to locals alone would discard them\n",
    "    label = numpy.loadtxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/label\")\n",
    "    ids = numpy.loadtxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/ids\")\n",
    "    return label, ids\n",
    "\n",
    "label, ids = ids_label_load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def onehot(dataframe):\n",
    "    enc = preprocessing.OneHotEncoder()\n",
    "    onehot_df_start = pandas.DataFrame(enc.fit_transform(dataframe[[non_numeric[0]]]).todense())\n",
    "    onehot_df_start.columns = map(lambda x: non_numeric[0] + '_' + str(x), onehot_df_start.columns.tolist())\n",
    "    dataframe.drop(non_numeric[0], axis = 1, inplace = True) #drop the first encoded column as well\n",
    "    for elem in non_numeric[1:]:\n",
    "        if elem in dataframe.columns:\n",
    "            onehot_df = pandas.DataFrame(enc.fit_transform(dataframe[[elem]]).todense())\n",
    "            onehot_df.columns = map(lambda x: elem + '_' + str(x), onehot_df.columns.tolist())\n",
    "            dataframe.drop(elem, axis = 1, inplace = True)\n",
    "            #print len(onehot_df_start.columns)\n",
    "            onehot_df_start = onehot_df_start.join(onehot_df)\n",
    "            #print len(onehot_df_start.columns)\n",
    "    dataframe = dataframe.join(onehot_df_start)\n",
    "    return dataframe\n",
    "\n",
    "#I got worse results with one-hot encoding vs. label encoding\n",
    "#df_train = onehot(df_train)\n",
    "#df_test = onehot(df_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def save_dataframe(dataframe, path):\n",
    "    dataframe.to_csv(path)\n",
    "\n",
    "def load_dataframe(path):\n",
    "    return pandas.read_csv(path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "save_dataframe(df_train, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_train_label_encoded\")\n",
    "save_dataframe(df_test, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_test_label_encoded\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "df_train = load_dataframe(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_train_onehot.csv\")\n",
    "df_test = load_dataframe(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_test_onehot.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def train_test_columns():\n",
    "    #drop any training columns that the test set lacks\n",
    "    train_columns = set(df_train.columns.tolist())\n",
    "    test_columns = set(df_test.columns.tolist())\n",
    "    df_train.drop(list(train_columns - test_columns), axis = 1, inplace = True)\n",
    "\n",
    "train_test_columns()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#this may not currently work\n",
    "def boost_load():\n",
    "    bst = xgboost.Booster()\n",
    "    bst.load_model(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/xgboost_model_1000rounds\")\n",
    "    return bst"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def store_logit(output, path, column):\n",
    "    def logit(x):\n",
    "        return (math.log(x) - math.log(1 - x))\n",
    "\n",
    "    #create a new column of logit(prediction) values from an earlier model\n",
    "    logit_list = list()\n",
    "    for elem in output.tolist():\n",
    "        try:\n",
    "            logit_list.append(logit(elem[1]))\n",
    "        except ValueError:\n",
    "            logit_list.append(10) #p of exactly 0 or 1 has no finite logit; cap it\n",
    "    pandas.DataFrame(logit_list, columns=[column]).to_csv(path)"
   ]
  },
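  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A worked example of the transform `store_logit` applies: logit(p) = log(p / (1 - p)), so logit(0.75) = log(0.75) - log(0.25) = log(3) ≈ 1.0986. Probabilities of exactly 0 or 1 have no finite logit, which is why the function catches `ValueError` and caps the value."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#worked example of the logit transform used above\n",
    "p = 0.75\n",
    "print math.log(p) - math.log(1 - p) #log(3) ~= 1.0986"
   ]
  },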
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def attach_logit(dataframe, paths):\n",
    "    #[[1]] selects the prediction column (position 1); position 0 is the saved index\n",
    "    values = pandas.read_csv(paths[0])[[1]]\n",
    "    for path in paths[1:]:\n",
    "        new_df = pandas.read_csv(path)[[1]]\n",
    "        values = pandas.concat([values, new_df], axis=1)\n",
    "    return pandas.concat([dataframe, values], axis=1)\n",
    "\n",
    "train_logit_list = ['logit_column_extra_trees_train', 'logit_column_logistic_train']\n",
    "df_train = attach_logit(df_train, train_logit_list)\n",
    "test_logit_list = ['logit_column_extra_trees_test', 'logit_column_logistic_test']\n",
    "df_test = attach_logit(df_test, test_logit_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def xgboost_train(na, path):\n",
    "    #named xgboost_train so the function does not shadow the xgboost module\n",
    "    dtrain = xgboost.DMatrix(df_train.fillna(na), label = label, missing = na)\n",
    "    dtest = xgboost.DMatrix(df_test.fillna(na), missing = na) #the test set has no labels\n",
    "\n",
    "    param = {'max_depth':10,\n",
    "             'eta':0.1,\n",
    "             'objective':'binary:logistic',\n",
    "             #'base_score': 0.76,\n",
    "             'eval_metric': 'logloss',\n",
    "             'subsample': 0.75,\n",
    "             'colsample_bytree': 0.7}\n",
    "\n",
    "    num_round = 20 #45-60 min for 1000\n",
    "\n",
    "    bst = xgboost.train(param, dtrain, num_round)\n",
    "    bst.save_model(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/\" + path)\n",
    "    output = bst.predict(dtest)\n",
    "    return output\n",
    "\n",
    "xgboost_train(-999, \"xgboost_model_1000rounds\")"
   ]
  },
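  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The original cell carried a commented-out watchlist that evaluated on the unlabeled test matrix, which cannot work. A sketch of a watched variant instead (`xgboost_train_watched` is a hypothetical helper; it assumes a held-out split of the training data and the `early_stopping_rounds` argument of `xgboost.train`):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#sketch: monitor logloss on a held-out split of the training data;\n",
    "#early_stopping_rounds stops training once the eval metric stalls\n",
    "def xgboost_train_watched(na, param, num_round):\n",
    "    xtrain, xvalid, ytrain, yvalid = cross_validation.train_test_split(\n",
    "        df_train.fillna(na), label, test_size=0.2, random_state=0)\n",
    "    dtrain = xgboost.DMatrix(numpy.asarray(xtrain), label = ytrain, missing = na)\n",
    "    dvalid = xgboost.DMatrix(numpy.asarray(xvalid), label = yvalid, missing = na)\n",
    "    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]\n",
    "    return xgboost.train(param, dtrain, num_round, watchlist,\n",
    "                         early_stopping_rounds = 50)"
   ]
  },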
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def xgboost_plot():\n",
    "    #xgboost.plot_importance(bst)\n",
    "    #xgboost.plot_tree(bst, num_trees=2)\n",
    "    #xgboost.to_graphviz(bst, num_trees=2)\n",
    "    pass #a body is required while the plotting calls stay commented out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def xgboost_cv(param, dtrain):\n",
    "    #param and dtrain as built in xgboost_train above\n",
    "    return xgboost.cv(param, dtrain, num_boost_round = 10, nfold = 5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def sklearn_ml_model(algorithm, criterion = 'entropy'):\n",
    "    params = {}\n",
    "    if algorithm == 'logistic':\n",
    "        model = linear_model.LogisticRegression()\n",
    "    elif algorithm == 'random_forest':\n",
    "        params = {'n_estimators':25,\n",
    "                  'n_jobs':3,\n",
    "                  'max_features':50,\n",
    "                  'criterion':criterion,\n",
    "                  'min_samples_split':4,\n",
    "                  'max_depth':50,\n",
    "                  'min_samples_leaf':4}\n",
    "        model = ensemble.RandomForestClassifier(**params)\n",
    "    elif algorithm == 'extra_trees':\n",
    "        #n_estimators=850, max_features=60, criterion='entropy', min_samples_split=4,\n",
    "        #max_depth=40, min_samples_leaf=2, n_jobs=-1\n",
    "        params = {'n_estimators':25,\n",
    "                  'n_jobs':3,\n",
    "                  'max_features':45,\n",
    "                  'criterion':criterion,\n",
    "                  'min_samples_split':4,\n",
    "                  'max_depth':50,\n",
    "                  'min_samples_leaf':4}\n",
    "        model = ensemble.ExtraTreesClassifier(**params)\n",
    "    elif algorithm == 'gradient_boosting':\n",
    "        model = ensemble.GradientBoostingClassifier()\n",
    "    elif algorithm == 'naive_bayes':\n",
    "        from sklearn.naive_bayes import BernoulliNB\n",
    "        model = BernoulliNB()\n",
    "    elif algorithm == 'svm':\n",
    "        from sklearn import svm\n",
    "        model = svm.NuSVC(nu = 0.1)\n",
    "    return model, params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def na_find(train_dataframe, test_dataframe, na):\n",
    "    if na == 'median':\n",
    "        train_na = train_dataframe.median()\n",
    "        test_na = test_dataframe.median()\n",
    "    else:\n",
    "        train_na = test_na = na\n",
    "    return train_na, test_na"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def sklearn_ml(algorithm, na, logit_path):\n",
    "    train_na, test_na = na_find(df_train, df_test, na)\n",
    "\n",
    "    model, params = sklearn_ml_model(algorithm)\n",
    "\n",
    "    model.fit(df_train.fillna(train_na), label)\n",
    "    output_train = model.predict_proba(df_train.fillna(train_na))\n",
    "    store_logit(output_train, logit_path + '_train', algorithm)\n",
    "    output_test = model.predict_proba(df_test.fillna(test_na))\n",
    "    store_logit(output_test, logit_path + '_test', algorithm)\n",
    "    results_save(output_test, 'predictions_' + algorithm + str(params.items()) + '.csv')\n",
    "    return output_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#2-3 minutes\n",
    "#sklearn_ml('logistic', 'median', \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_logistic_11Mar\")\n",
    "\n",
    "#sklearn_ml('random_forest', -999, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_rf\")\n",
    "\n",
    "#~100 minutes\n",
    "sklearn_ml('extra_trees', -999, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_extra_trees_11Mar\")\n",
    "\n",
    "#sklearn_ml('naive_bayes', 'median', \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_nb\")\n",
    "#sklearn_ml('gradient_boosting', -999, )\n",
    "#sklearn_ml('svm', 'median')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def cross_validation_sklearn(algorithm, na):\n",
    "    #manual alternative:\n",
    "    #Xtrain, Xtest, ytrain, ytest = cross_validation.train_test_split(df_train, df_test, test_size=0.20, random_state=0)\n",
    "    model, params = sklearn_ml_model(algorithm)\n",
    "    print model\n",
    "    scores = cross_validation.cross_val_score(model, df_train.fillna(na), label, cv = 5, scoring = 'log_loss')\n",
    "    #scoring='log_loss' yields negated values (greater is better), so flip the sign for display\n",
    "    print \"Log loss: %0.4f (+/- %0.4f)\" % (-scores.mean(), scores.std() * 2)\n",
    "    return scores\n",
    "\n",
    "#scores = cross_validation_sklearn('logistic', df_train.median())\n",
    "#scores = cross_validation_sklearn('naive_bayes', df_train.median())\n",
    "scores = cross_validation_sklearn('extra_trees', -999)\n",
    "#scores = cross_validation_sklearn('svm', df_train.median())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def results_save(output, path):\n",
    "    #with open('/home/sunshine/xgboost_labelencode_100rounds_base_score0.76', 'w') as f:\n",
    "    with open('C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/' + path, 'w') as f:\n",
    "        f.write('ID,PredictedProb\\n')\n",
    "        for index in xrange(len(output)):\n",
    "            f.write('{ID},{value}\\n'.format(ID = int(ids[index]), value = output[index][1]))\n",
    "\n",
    "#output_clf: predict_proba output from a classifier fit elsewhere (e.g. a calibrated model)\n",
    "results_save(output_clf, 'predictions_extratrees_250features_calibrated_with_validation_set')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def report(grid_scores, n_top=3):\n",
    "    #http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html#example-model-selection-randomized-search-py\n",
    "    import operator\n",
    "    top_scores = sorted(grid_scores, key=operator.itemgetter(1), reverse=True)[:n_top]\n",
    "    for i, score in enumerate(top_scores):\n",
    "        print \"Model with rank: {0}\".format(i + 1)\n",
    "        print \"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n",
    "            score.mean_validation_score,\n",
    "            numpy.std(score.cv_validation_scores))\n",
    "        print \"Parameters: {0}\".format(score.parameters)\n",
    "        print \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#random grid search\n",
    "def grid_search_sklearn(algorithm, random, na):\n",
    "    from scipy.stats import randint as sp_randint\n",
    "    if algorithm == 'logistic':\n",
    "        model = linear_model.LogisticRegression()\n",
    "        params = {\n",
    "            'penalty': ['l1', 'l2'],\n",
    "            'C': [1, 0.1, 0.01],\n",
    "            'fit_intercept': [True, False],\n",
    "            'n_jobs': [3]}\n",
    "    if algorithm == 'extra_trees':\n",
    "        model = ensemble.ExtraTreesClassifier()\n",
    "        params = {'n_estimators': sp_randint(25, 200),\n",
    "                  \"max_depth\": sp_randint(10, 50),\n",
    "                  \"max_features\": sp_randint(10, 110),\n",
    "                  \"min_samples_split\": sp_randint(3, 6),\n",
    "                  \"min_samples_leaf\": sp_randint(3, 6),\n",
    "                  \"bootstrap\": [True, False],\n",
    "                  \"criterion\": [\"gini\", \"entropy\"],\n",
    "                  'n_jobs': [3]}\n",
    "    n_iter_search = 6\n",
    "\n",
    "    if random == 'random':\n",
    "        search = grid_search.RandomizedSearchCV(model, param_distributions=params, n_iter=n_iter_search)\n",
    "    else:\n",
    "        #GridSearchCV takes param_grid and no n_iter; it also needs lists of\n",
    "        #values rather than scipy distributions\n",
    "        search = grid_search.GridSearchCV(model, param_grid=params)\n",
    "    search.fit(df_train.fillna(na), label)\n",
    "    report(search.grid_scores_)\n",
    "\n",
    "#grid_search_sklearn('logistic', 'random', df_train.median()) #~10 minutes\n",
    "grid_search_sklearn('extra_trees', 'random', -999) #~10 minutes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def calibrated_classifier(dataframe, index, algorithm, na, model = None):\n",
    "    from sklearn.metrics import log_loss\n",
    "\n",
    "    #https://github.com/christophebourguignat/notebooks/blob/master/Calibration.ipynb\n",
    "    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(dataframe, index, test_size=0.20, random_state=0)\n",
    "    #xtrain_valid, xtest, ytrain_valid, ytest = cross_validation.train_test_split(dataframe, label, test_size=0.20, random_state=0)\n",
    "    #xtrain, xvalid, ytrain, yvalid = cross_validation.train_test_split(xtrain_valid, ytrain_valid, test_size=0.25, random_state=0)\n",
    "    if model is None:\n",
    "        model, params = sklearn_ml_model(algorithm)\n",
    "    train_na, test_na = na_find(xtrain, xtest, na)\n",
    "    #model.fit(xtrain.fillna(na), ytrain)\n",
    "    #ypreds = model.predict_proba(xtest.fillna(na))\n",
    "    #in our case, 'isotonic' works better than the default 'sigmoid'\n",
    "    calibrated_clf = calibration.CalibratedClassifierCV(model, method='isotonic', cv=5)\n",
    "    calibrated_clf.fit(xtrain.fillna(train_na), ytrain)\n",
    "    #ypreds = calibrated_clf.predict_proba(xtest.fillna(test_na))\n",
    "    #print \"%.4f\" % log_loss(ytest, ypreds, eps=1e-15, normalize=True)\n",
    "    return calibrated_clf\n",
    "\n",
    "#~5 minutes for nfeatures == 25\n",
    "#calibrated_clf = calibrated_classifier(df_train, label, \"extra_trees\", -999)\n",
    "#calibrated_clf = calibrated_classifier(df_train, label, \"logistic\", 'median')"
   ]
  },
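  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The comment above says isotonic calibration beat the default sigmoid here. A sketch that quantifies the effect by comparing held-out log loss with and without calibration (`calibration_check` is a hypothetical helper; because it reuses `random_state=0`, its held-out rows match the split made inside `calibrated_classifier`, so the calibrated model never trained on them):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#sketch: held-out log loss, raw model vs. calibrated model\n",
    "from sklearn.metrics import log_loss\n",
    "\n",
    "def calibration_check(algorithm, na):\n",
    "    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(df_train, label, test_size=0.20, random_state=0)\n",
    "    train_na, test_na = na_find(xtrain, xtest, na)\n",
    "    model, params = sklearn_ml_model(algorithm)\n",
    "    model.fit(xtrain.fillna(train_na), ytrain)\n",
    "    raw_preds = model.predict_proba(xtest.fillna(test_na))\n",
    "    cal_clf = calibrated_classifier(df_train, label, algorithm, na)\n",
    "    cal_preds = cal_clf.predict_proba(xtest.fillna(test_na))\n",
    "    print \"raw: %.4f calibrated: %.4f\" % (log_loss(ytest, raw_preds), log_loss(ytest, cal_preds))\n",
    "\n",
    "#calibration_check('extra_trees', -999)"
   ]
  },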
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def bagging_classifier(algorithm, na):\n",
    "    from sklearn.metrics import log_loss\n",
    "    xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(df_train, label, test_size=0.20, random_state=0)\n",
    "\n",
    "    model, params = sklearn_ml_model(algorithm)\n",
    "\n",
    "    clfbag = ensemble.BaggingClassifier(model, n_estimators=5)\n",
    "    clfbag.fit(xtrain.fillna(na), ytrain)\n",
    "    ypreds = clfbag.predict_proba(xtest.fillna(na))\n",
    "    print \"%.4f\" % log_loss(ytest, ypreds, eps=1e-15, normalize=True)\n",
    "    return clfbag\n",
    "\n",
    "clf_bag = bagging_classifier(\"extra_trees\", -999)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def average_predictions(paths):\n",
    "    #read each file as a single named column, skipping its header row\n",
    "    values = pandas.read_csv(paths[0], names = ['0'], skiprows = 1)\n",
    "    for index, path in enumerate(paths[1:]):\n",
    "        new_df = pandas.read_csv(path, names = [str(index + 1)], skiprows = 1)\n",
    "        values = pandas.concat([values, new_df], axis=1)\n",
    "    values.mean(axis = 1).to_csv(\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/average.csv\")\n",
    "\n",
    "files = [\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/extratrees\",\n",
    "         \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/preds_blend.csv\",\n",
    "         \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/xgboost_labelencode_100rounds\",\n",
    "         \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/predictions_extratrees_250features_calibrated\"]\n",
    "average_predictions(files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def stacking(algorithm, na):\n",
    "    x_a, x_b, y_a, y_b = cross_validation.train_test_split(df_train, label, test_size=0.50, random_state=0)\n",
    "    model, params = sklearn_ml_model(algorithm)\n",
    "\n",
    "    #split the training set in half, train on each half, predict out of fold\n",
    "    x_a_na, x_b_na = na_find(x_a, x_b, na)\n",
    "    model.fit(x_a.fillna(x_a_na), y_a)\n",
    "    x_b_preds = model.predict_proba(x_b.fillna(x_b_na))\n",
    "    model.fit(x_b.fillna(x_b_na), y_b)\n",
    "    x_a_preds = model.predict_proba(x_a.fillna(x_a_na))\n",
    "\n",
    "    #make a new column of out-of-fold predictions for the training set\n",
    "    x_a_df = pandas.DataFrame(x_a_preds[:,1], columns = [algorithm])\n",
    "    x_b_df = pandas.DataFrame(x_b_preds[:,1], columns = [algorithm])\n",
    "    new_df = pandas.concat([x_a_df, x_b_df], axis=0)\n",
    "\n",
    "    train_na, test_na = na_find(df_train, df_test, na)\n",
    "    model.fit(df_train.fillna(train_na), label)\n",
    "    test_preds = model.predict_proba(df_test.fillna(test_na))\n",
    "\n",
    "    #make a new column of predictions for the test set\n",
    "    test_df = pandas.DataFrame(test_preds[:,1], columns = [algorithm])\n",
    "\n",
    "    def combiner(algorithm):\n",
    "        model, params = sklearn_ml_model(algorithm)\n",
    "        #labels must follow the concat order of new_df: y_a rows, then y_b rows\n",
    "        model.fit(new_df, numpy.concatenate([y_a, y_b]))\n",
    "        final_preds = model.predict_proba(test_df)\n",
    "        return final_preds\n",
    "\n",
    "    return combiner('logistic') #try logistic regression as the combiner\n",
    "\n",
    "\n",
    "output = stacking('extra_trees', -999)"
   ]
  },
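  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The core of `stacking` is out-of-fold prediction: each half of the training data is scored by a model that never saw it, giving one leakage-free meta-feature per training row. An illustrative run on synthetic data so the shapes are easy to follow:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#illustrative only: two-fold out-of-fold predictions on synthetic data\n",
    "rng = numpy.random.RandomState(0)\n",
    "X = pandas.DataFrame(rng.rand(100, 5))\n",
    "y = (rng.rand(100) > 0.5).astype(int)\n",
    "\n",
    "x_a, x_b, y_a, y_b = cross_validation.train_test_split(X, y, test_size=0.5, random_state=0)\n",
    "clf = linear_model.LogisticRegression()\n",
    "clf.fit(x_a, y_a)\n",
    "oof_b = clf.predict_proba(x_b)[:, 1] #predictions for rows the model never saw\n",
    "clf.fit(x_b, y_b)\n",
    "oof_a = clf.predict_proba(x_a)[:, 1]\n",
    "meta = numpy.concatenate([oof_a, oof_b]) #one meta-feature per training row\n",
    "print meta.shape #(100,)"
   ]
  },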
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from __future__ import division\n",
    "\n",
    "def blending(na):\n",
    "    numpy.random.seed(0) #seed for shuffling the train set\n",
    "\n",
    "    global label, df_train, df_test\n",
    "    #print len(df_train), len(label)\n",
    "    n_folds = 10\n",
    "    verbose = True\n",
    "    shuffle = False\n",
    "\n",
    "    #X, y, X_submission = load_data.load()\n",
    "\n",
    "    if shuffle:\n",
    "        idx = numpy.random.permutation(label.size)\n",
    "        df_train = df_train.iloc[idx]\n",
    "        label = label[idx]\n",
    "\n",
    "    skf = list(cross_validation.StratifiedKFold(label, n_folds))\n",
    "\n",
    "    clfs = [ensemble.RandomForestClassifier(n_estimators=100, n_jobs=3, criterion='gini')]#,\n",
    "           #ensemble.RandomForestClassifier(n_estimators=100, n_jobs=3, criterion='entropy'),\n",
    "           #ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=3, criterion='gini'),\n",
    "           #ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=3, criterion='entropy'),\n",
    "           #ensemble.GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]\n",
    "\n",
    "    print \"Creating train and test sets for blending.\"\n",
    "\n",
    "    dataset_blend_train = numpy.zeros((df_train.shape[0], len(clfs)))\n",
    "    dataset_blend_test = numpy.zeros((df_test.shape[0], len(clfs)))\n",
    "\n",
    "    for j, clf in enumerate(clfs):\n",
    "        print j, clf\n",
    "        dataset_blend_test_j = numpy.zeros((df_test.shape[0], len(skf)))\n",
    "        for i, (train, test) in enumerate(skf):\n",
    "            print \"Fold\", i\n",
    "            X_train = df_train.loc[train, :]\n",
    "            y_train = label[train]\n",
    "            X_test = df_train.loc[test, :]\n",
    "            y_test = label[test]\n",
    "            cal_clf = calibrated_classifier(X_train, y_train, '', na, clf)\n",
    "            cal_clf.fit(X_train.fillna(-999), y_train)\n",
    "            y_submission = cal_clf.predict_proba(X_test.fillna(-999))[:,1]\n",
    "            dataset_blend_train[test, j] = y_submission\n",
    "            dataset_blend_test_j[:, i] = cal_clf.predict_proba(df_test.fillna(-999))[:,1]\n",
    "        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)\n",
    "\n",
    "    print\n",
    "    print \"Blending.\"\n",
    "    clf = linear_model.LogisticRegression()\n",
    "    clf.fit(dataset_blend_train, label)\n",
    "    y_submission = clf.predict_proba(dataset_blend_test)[:,1]\n",
    "\n",
    "    print \"Linear stretch of predictions to [0,1]\"\n",
    "    y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())\n",
    "\n",
    "    print \"Saving Results.\"\n",
    "    numpy.savetxt(fname='blend_cal.csv', X=y_submission, fmt='%0.9f')\n",
    "    numpy.savetxt(fname='dataset_blend_train_cal', X=dataset_blend_train, fmt='%0.9f')\n",
    "    numpy.savetxt(fname='dataset_blend_test_cal', X=dataset_blend_test, fmt='%0.9f')\n",
    "    return y_submission\n",
    "\n",
    "#~4 hours (3:37)\n",
    "train = blending(-999)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "a = numpy.loadtxt(fname='blend.csv')\n",
    "\n",
    "ids_df = pandas.DataFrame(ids)\n",
    "preds_df = pandas.DataFrame(a) #concat needs DataFrames, not a bare ndarray\n",
    "\n",
    "new_df = pandas.concat([ids_df, preds_df], axis=1)\n",
    "new_df.columns = ['ID', 'PredictedProb'] #supplies the header line the file needs\n",
    "\n",
    "new_df.to_csv(\"preds_blend\", index = False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}