{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"#import xgboost\n",
"import pandas\n",
"from sklearn import preprocessing, cross_validation, grid_search, ensemble, linear_model, calibration\n",
"import numpy\n",
"import math\n",
"#import graphviz"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with open(\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/train.csv/train.csv\") as f:\n",
" df_train = pandas.read_csv(f)\n",
" df_train_processed = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with open(\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/test.csv/test.csv\") as f:\n",
" df_test = pandas.read_csv(f)\n",
" df_test_processed = False"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def feature_engineering(dataframe):\n",
" #if 'non-na_count' not in dataframe:\n",
" new_df = pandas.DataFrame(dataframe.count(axis = 1), columns = ['non-na_count'])\n",
" df_train = pandas.concat([dataframe, new_df], axis=1)\n",
" #count NAs in row"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def feature_selection(dataframe, threshold):\n",
" list_98 = ['v17', 'v46', 'v26', 'v63', 'v71']\n",
" list_95 = ['v11', 'v25', 'v29', 'v83', 'v41', 'v43', 'v89', 'v64', 'v92', 'v97', 'v108']\n",
" list_90 = ['v8', 'v10', 'v13', 'v15', 'v20', 'v32', 'v33', 'v34', 'v54', 'v67', 'v109']\n",
" list_85 = ['v1', 'v44', 'v55', 'v60', 'v61', 'v76', 'v77', 'v94', 'v105', 'v111', 'v119']\n",
" list_80 = ['v4', 'v9', 'v14', 'v35', 'v51', 'v80', 'v87', 'v101', 'v121']\n",
" list_75 = ['v23', 'v49', 'v65', 'v85', 'v93']\n",
" list_70 = ['v2', 'v7', 'v18', 'v27', 'v48', 'v59', 'v73', 'v84', 'v123']\n",
" list_65 = ['v36', 'v47', 'v117']\n",
" list_60 = ['v45', 'v70', 'v86', 'v98']\n",
" list_55 = ['v5', 'v31', 'v42', 'v58', 'v102']\n",
" list_50 = ['v19', 'v37', 'v38', 'v57', 'v82', 'v95', 'v96', 'v99', 'v103', 'v104', 'v130', 'v106']\n",
" corr_list = [list_98, list_95, list_90, list_85, list_80, list_75, list_70, list_65, list_60, list_55, list_50]\n",
" for elem in corr_list[:threshold]:\n",
" dataframe.drop(elem, axis = 1, inplace = True)"
]
},
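{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Added note: threshold slices corr_list from the top, so it counts how many\n",
"#correlation tiers to drop rather than naming a percentage. A hedged usage\n",
"#sketch: 3 would drop list_98, list_95 and list_90, while the 15 passed to\n",
"#preprocess() below covers all eleven tiers.\n",
"#feature_selection(df_train, 3)"
]
},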
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def preprocess(threshold):\n",
" #consider dropping: ['v112', 'v125', 'v74', 'v1', 'v110', 'v47']\n",
" global df_train_processed\n",
" global df_test_processed\n",
" if not df_train_processed:\n",
" df_train_processed = True\n",
" #feature_engineering(df_train)\n",
" #feature_selection(df_train, threshold)\n",
" label = df_train['target'].values\n",
" df_train.drop(['target', 'ID'], axis = 1, inplace = True)\n",
" df_train.drop('v22', axis = 1, inplace = True) #'v22' has over 16k values\n",
" \n",
" numeric_columns = df_train.describe().columns.tolist() \n",
"\n",
" dtypes = df_train.dtypes\n",
" non_numeric = list()\n",
" for index, elem in enumerate(dtypes):\n",
" if elem != 'float64':\n",
" non_numeric.append(dtypes.index[index])\n",
" \n",
" if not df_test_processed:\n",
" df_test_processed = True\n",
" #feature_engineering(df_test)\n",
" #feature_selection(df_test, threshold)\n",
" ids = df_test['ID'].values\n",
" df_test.drop('ID', axis = 1, inplace = True)\n",
" df_test.drop('v22', axis = 1, inplace = True)\n",
" \n",
" #remove_correlations()\n",
" return non_numeric, label, ids\n",
"\n",
"non_numeric, label, ids = preprocess(15)"
]
},
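{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Added sketch: the dtype loop in preprocess() can also be written with\n",
"#pandas' select_dtypes, assuming the non-float64 columns are exactly the\n",
"#categorical ones. Left commented out so the pipeline is unchanged.\n",
"#non_numeric = df_train.select_dtypes(exclude = ['float64']).columns.tolist()"
]
},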
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def label_encode(dataframe):\n",
" le = preprocessing.LabelEncoder()\n",
" for elem in non_numeric:\n",
" if elem in dataframe.columns:\n",
" dataframe[elem] = le.fit_transform(dataframe[elem])\n",
" #print len(le.classes_)\n",
"\n",
"label_encode(df_train)\n",
"label_encode(df_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def remove_correlations(lower):\n",
" corr = df_train.corr()\n",
" drop = list()\n",
" for row in xrange(corr.shape[0]):\n",
" for column in xrange(corr.shape[1]):\n",
" if lower < corr.ix[row, column] and row < column:\n",
" drop.append((row, column))\n",
" return drop\n",
" \n",
"drop_tuples = remove_correlations(0.98)"
]
},
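{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Added sketch: remove_correlations() only returns (row, column) index pairs,\n",
"#so translating them into column names and dropping one member of each pair\n",
"#is left to the caller. This helper is an assumption about the intended use,\n",
"#and it assumes every column is numeric after label encoding so that the\n",
"#corr() matrix indices line up with dataframe.columns.\n",
"def drop_correlated_columns(dataframe, pairs):\n",
" to_drop = list(set(dataframe.columns[column] for row, column in pairs))\n",
" dataframe.drop(to_drop, axis = 1, inplace = True)\n",
"\n",
"#drop_correlated_columns(df_train, drop_tuples)\n",
"#drop_correlated_columns(df_test, drop_tuples)"
]
},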
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def ids_label_save():\n",
" numpy.savetxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/label\", label)\n",
" numpy.savetxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/ids\", ids)\n",
"\n",
"ids_label_save()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def ids_label_load():\n",
" label = numpy.loadtxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/label\")\n",
" ids = numpy.loadtxt(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/ids\")\n",
"\n",
"ids_label_load()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def onehot(dataframe):\n",
" enc = preprocessing.OneHotEncoder()\n",
" onehot_df_start = pandas.DataFrame(enc.fit_transform(dataframe[[non_numeric[0]]]).todense())\n",
" onehot_df_start.columns = map(lambda x: non_numeric[0] + '_' + str(x), onehot_df_start.columns.tolist())\n",
" for elem in non_numeric[1:]:\n",
" if elem in dataframe.columns:\n",
" onehot_df = pandas.DataFrame(enc.fit_transform(dataframe[[elem]]).todense())\n",
" onehot_df.columns = map(lambda x: elem + '_' + str(x), onehot_df.columns.tolist())\n",
" dataframe.drop(elem, axis = 1, inplace = True)\n",
" #print len(onehot_df_start.columns)\n",
" onehot_df_start = onehot_df_start.join(onehot_df)\n",
" #print len(onehot_df_start.columns)\n",
" dataframe = dataframe.join(onehot_df_start)\n",
" return dataframe\n",
"\n",
"#I got worse results with one-hot encoding v. label encoding\n",
"#df_train = onehot(df_train)\n",
"#df_test = onehot(df_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def save_dataframe(dataframe, path):\n",
" dataframe.to_csv(path)\n",
" \n",
"def load_dataframe(path)\n",
" return pandas.read_csv(path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"save_dataframe(df_train, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_train_label_encoded\")\n",
"save_dataframe(df_test, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_test_label_encoded\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df_train = load_dataframe(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_train_onehot.csv\")\n",
"df_test = load_dataframe(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/df_test_onehot.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def train_test_columns():\n",
" train_columns = set(df_train.columns.tolist())\n",
" test_columns = set(df_test.columns.tolist())\n",
" #train_columns - test_columns\n",
" df_train.drop(train_columns - test_columns, axis = 1, inplace = True)\n",
" \n",
"train_test_columns()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#this may not current work\n",
"def boost_load():\n",
" bst.load_model(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/xgboost_model_1000rounds\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def store_logit(output, path, column):\n",
" def logit(x):\n",
" return (math.log(x) - math.log(1 - x))\n",
"\n",
" #create new column for df_train that are the logit(prediction) from earlier model\n",
" logit_list = list()\n",
" for elem in output.tolist():\n",
" try:\n",
" logit_list.append(logit(elem[1]))\n",
" except ValueError:\n",
" logit_list.append(10)\n",
" pandas.DataFrame(logit_list, columns=[column]).to_csv(path)"
]
},
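{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Added illustration: logit is the inverse of the sigmoid, so stored columns\n",
"#can be mapped back to probabilities. It also shows what the fallback value\n",
"#10 in store_logit implies: sigmoid(10) is roughly 0.99995, i.e. a\n",
"#near-certain positive prediction.\n",
"def sigmoid(x):\n",
" return 1.0 / (1.0 + math.exp(-x))\n",
"\n",
"#sigmoid(10) #~0.9999546"
]
},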
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def attach_logit(dataframe, paths):\n",
" values = pandas.read_csv(paths[0])[[1]]\n",
" for path in paths[1:]:\n",
" new_df = pandas.read_csv(path)[[1]]\n",
" values = pandas.concat([values, new_df], axis=1)\n",
" return pandas.concat([dataframe, values], axis=1)\n",
" \n",
"train_logit_list = ['logit_column_extra_trees_train', 'logit_column_logistic_train']\n",
"df_train = attach_logit(df_train, train_logit_list)\n",
"test_logit_list = ['logit_column_extra_trees_test', 'logit_column_logistic_test']\n",
"df_test = attach_logit(df_test, test_logit_list)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def xgboost(na, path):\n",
" dtrain = xgboost.DMatrix(df_train.fillna(na), label = label, missing = na)\n",
" dtest = xgboost.DMatrix(df_test.fillna(na), label = label, missing = na)\n",
" \n",
" param = {'max_depth':10,\n",
" 'eta':0.1,\n",
" 'objective':'binary:logistic',\n",
" #'base_score': 0.76,\n",
" 'eval_metric': 'logloss',\n",
" 'subsample': 0.75, \n",
" 'colsample_bytree': 0.7}\n",
" \n",
" num_round = 20 #45-60 min for 1000\n",
" #watchlist = [(dtest,'eval'), (dtrain,'train')]\n",
" \n",
" bst = xgboost.train(param, dtrain, num_round)#, watchlist)\n",
" bst.save_model(\"/media/sunshine/Windows/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/\" + path)\n",
" output = bst.predict(dtest)\n",
" return output\n",
" \n",
"xgboost(-999, \"xgboost_model_1000rounds\")"
]
},
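{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#Added note: with the binary:logistic objective, bst.predict returns a 1-D\n",
"#array of positive-class probabilities, while store_logit and results_save\n",
"#elsewhere in this notebook index output[i][1] as predict_proba would give.\n",
"#A hedged adapter:\n",
"#output = numpy.column_stack([1 - output, output])"
]
},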
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def xgboost_plot():\n",
" #xgboost.plot_importance(bst)\n",
" #xgboost.plot_tree(bst, num_trees=2)\n",
" #xgb.to_graphviz(bst, num_trees=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def xgboost_cv():\n",
" bst_cv = xgboost.cv(param, dtrain, num_boost_round = 10, nfold = 5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def sklearn_ml_model(algorithm, criterion = 'entropy'):\n",
" params = {}\n",
" if algorithm == 'logistic':\n",
" model = linear_model.LogisticRegression()\n",
" elif algorithm == 'random_forest':\n",
" params = {'n_estimators':25,\n",
" 'n_jobs':3,\n",
" 'max_features':50,\n",
" 'criterion':criterion,\n",
" 'min_samples_split':4,\n",
" 'max_depth':50,\n",
" 'min_samples_leaf':4}\n",
" model = ensemble.RandomForestClassifier(**params)\n",
" elif algorithm == 'extra_trees':\n",
" #n_estimators=850,max_features= 60,criterion= 'entropy',min_samples_split= 4,\n",
" # max_depth= 40, min_samples_leaf= 2, n_jobs = -1) \n",
" params = {'n_estimators':25,\n",
" 'n_jobs':3,\n",
" 'max_features':45,\n",
" 'criterion':criterion,\n",
" 'min_samples_split':4,\n",
" 'max_depth':50,\n",
" 'min_samples_leaf':4}\n",
" model = ensemble.ExtraTreesClassifier(**params)\n",
" elif algorithm == 'gradient_boosting':\n",
" model = ensemble.GradientBoostingClassifier()\n",
" elif algorithm == 'naive_bayes':\n",
" from sklearn.naive_bayes import BernoulliNB\n",
" model = BernoulliNB()\n",
" elif algorithm == 'svm':\n",
" from sklearn import svm\n",
" model = svm.NuSVC(nu = 0.1)\n",
" return model, params"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def na_find(train_dataframe, test_dataframe, na):\n",
" if na == 'median':\n",
" train_na = train_dataframe.median()\n",
" test_na = test_dataframe.median()\n",
" else:\n",
" train_na = test_na = na\n",
" return train_na, test_na"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def sklearn_ml(algorithm, na, logit_path):\n",
" train_na, test_na = na_find(df_train, df_test, na)\n",
" \n",
" model, params = sklearn_ml_model(algorithm)\n",
" \n",
" model.fit(df_train.fillna(train_na), label)\n",
" output_train = model.predict_proba(df_train.fillna(train_na))\n",
" store_logit(output_train, logit_path + '_train', algorithm)\n",
" output_test = model.predict_proba(df_test.fillna(test_na))\n",
" store_logit(output_test, logit_path + '_test', algorithm)\n",
" results_save(output_test, 'predictions_' + algorithm + str(params.items()) + '.csv')\n",
" return output_test"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#2-3 minutes\n",
"#sklearn_ml('logistic', 'median', \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_logistic_11Mar\")\n",
"\n",
"#sklearn_ml('random_forest', -999, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_rf\")\n",
"\n",
"#~100 minutes\n",
"sklearn_ml('extra_trees', -999, \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_extra_trees_11Mar\")\n",
"\n",
"#\n",
"#sklearn_ml('naive_bayes', 'median', \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/logit_column_nb\")\n",
"#sklearn_ml('gradient_boosting', -999, )\n",
"#sklearn_ml('svm', 'median')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def cross_validation_sklearn(algorithm, na):\n",
" #manual\n",
" #Xtrain, Xtest, ytrain, ytest = cross_validation.train_test_split(df_train, df_test, test_size=0.20, random_state=0)\n",
" model, params = sklearn_ml_model(algorithm)\n",
" print model\n",
" scores = cross_validation.cross_val_score(model, df_train.fillna(na), label, cv = 5, scoring = 'log_loss')\n",
" print \"Accuracy: %0.4f (+/- %0.4f)\" % (scores.mean(), scores.std() * 2)\n",
" return scores\n",
" \n",
"#scores = cross_validation_sklearn('logistic', df_train.median())\n",
"#scores = cross_validation_sklearn('naive_bayes', df_train.median())\n",
"scores = cross_validation_sklearn('extra_trees', -999)\n",
"#scores = cross_validation_sklearn('svm', df_train.median())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def results_save(output, path):\n",
" #with open('/home/sunshine/xgboost_labelencode_100rounds_base_score0.76', 'w') as f:\n",
" with open('C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/' + path, 'w') as f:\n",
" f.write('ID,PredictedProb\\n')\n",
" for index in xrange(len(output)):\n",
" f.write('{ID},{value}\\n'.format(ID = int(ids[index]), value = output[index][1]))\n",
" \n",
"results_save(output_clf, 'predictions_extratrees_250features_calibrated_with_validation_set')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def report(grid_scores, n_top=3):\n",
" #http://scikit-learn.org/stable/auto_examples/model_selection/randomized_search.html#example-model-selection-randomized-search-py\n",
" import operator\n",
" top_scores = sorted(grid_scores, key=operator.itemgetter(1), reverse=True)[:n_top]\n",
" for i, score in enumerate(top_scores):\n",
" print \"Model with rank: {0}\".format(i + 1)\n",
" print \"Mean validation score: {0:.3f} (std: {1:.3f})\".format(\n",
" score.mean_validation_score,\n",
" numpy.std(score.cv_validation_scores))\n",
" print \"Parameters: {0}\".format(score.parameters)\n",
" print \"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#random grid search\n",
"def grid_search_sklearn(algorithm, random, na):\n",
" from scipy.stats import randint as sp_randint\n",
" if algorithm == 'logistic':\n",
" model = linear_model.LogisticRegression()\n",
" params = {\n",
" 'penalty': ['l1', 'l2'], \n",
" 'C': [1, 0.1, 0.01],\n",
" 'fit_intercept': [True, False],\n",
" 'n_jobs': [3]}\n",
" if algorithm == 'extra_trees':\n",
" model = ensemble.ExtraTreesClassifier()\n",
" params = {'n_estimators': sp_randint(25, 200),\n",
" \"max_depth\": sp_randint(10, 50),\n",
" \"max_features\": sp_randint(10, 110),\n",
" \"min_samples_split\": sp_randint(3, 6),\n",
" \"min_samples_leaf\": sp_randint(3, 6),\n",
" \"bootstrap\": [True, False],\n",
" \"criterion\": [\"gini\", \"entropy\"],\n",
" 'n_jobs': [3]}\n",
" n_iter_search = 6\n",
" \n",
" if random == 'random':\n",
" search = grid_search.RandomizedSearchCV(model, param_distributions=params, n_iter=n_iter_search)\n",
" else:\n",
" search = grid_search.GridSearchCV(model, param_distributions=params, n_iter=n_iter_search)\n",
" search.fit(df_train.fillna(na), label)\n",
" report(search.grid_scores_)\n",
"\n",
"#grid_search_sklearn('logistic', 'random', df_train.median()) #~10 minutes\n",
"grid_search_sklearn('extra_trees', 'random', -999) #~10 minutes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def calibrated_classifier(dataframe, index, algorithm, na, model = None):\n",
" from sklearn.metrics import log_loss\n",
"\n",
" #https://github.com/christophebourguignat/notebooks/blob/master/Calibration.ipynb\n",
" xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(dataframe, index, test_size=0.20, random_state=0)\n",
" #xtrain_valid, xtest, ytrain_valid, ytest = cross_validation.train_test_split(dataframe, label, test_size=0.20, random_state=0)\n",
" #xtrain, xvalid, ytrain, yvalid = cross_validation.train_test_split(xtrain_valid, ytrain_valid, test_size=0.25, random_state=0)\n",
" if model == None:\n",
" model, params = sklearn_ml_model(algorithm)\n",
" train_na, test_na = na_find(xtrain, xtest, na)\n",
" #model.fit(xtrain.fillna(na), ytrain)\n",
" #ypreds = model.predict_proba(xtest.fillna(na))\n",
" # in our case, 'isotonic' works better than default 'sigmoid'\n",
" calibrated_clf = calibration.CalibratedClassifierCV(model, method='isotonic', cv=5) #cv = 5\n",
" calibrated_clf.fit(xtrain.fillna(train_na), ytrain)\n",
" #ypreds = calibrated_clf.predict_proba(xtest.fillna(test_na))\n",
" #print \"%.4f\" % log_loss(ytest, ypreds, eps=1e-15, normalize=True)\n",
" return calibrated_clf\n",
"\n",
"#~5 minutes for nfeatures == 25\n",
"#calibrated_clf = calibrated_classifier(df_train, label, \"extra_trees\", -999)\n",
"#calibrated_clf = calibrated_classifier(df_train, label, \"logistic\", 'median')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def bagging_classifier(algorithm, na):\n",
" from sklearn.metrics import log_loss\n",
" xtrain, xtest, ytrain, ytest = cross_validation.train_test_split(df_train, label, test_size=0.20, random_state=0)\n",
" \n",
" model, params = sklearn_ml_model(algorithm)\n",
" \n",
" clfbag = ensemble.BaggingClassifier(model, n_estimators=5)\n",
" clfbag.fit(Xtrain.fillna(na), ytrain)\n",
" ypreds = clfbag.predict_proba(Xtest.fillna(na))\n",
" print \"%.4f\" % log_loss(ytest, ypreds, eps=1e-15, normalize=True)\n",
" return clfbag\n",
" \n",
"clf_bag = bagging_classifier(\"extra_trees\", -999)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def average_predictions(paths):\n",
" #with open(paths) as f:\n",
" values = pandas.read_csv(paths[0], names = '0', skiprows = 1)\n",
" for index, path in enumerate(paths[1:]):\n",
" new_df = pandas.read_csv(path, names = str(index + 1), skiprows = 1)\n",
" values = pandas.concat([values, new_df], axis=1)\n",
" values.mean(axis = 1).to_csv(\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/average.csv\")\n",
"\n",
"files = [\"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/extratrees\",\n",
" \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/preds_blend.csv\",\n",
" \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/xgboost_labelencode_100rounds\", \n",
" \"C:/Users/ThatVoice/Desktop/Kaggle/BNP_Paribas_Cardiff/predictions_extratrees_250features_calibrated\"]\n",
"average_predictions(files)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def stacking(algorithm, na):\n",
" x_a, x_b, y_a, y_b = cross_validation.train_test_split(df_train, label, test_size=0.50, random_state=0)\n",
" model, params = sklearn_ml_model(algorithm)\n",
" \n",
" #split training set into folds, train on folds, predict out of fold\n",
" x_a_na, x_b_na = na_find(x_a, x_b, na)\n",
" model.fit(x_a.fillna(x_a_na), y_a)\n",
" x_b_preds = model.predict_proba(x_b.fillna(x_b_na))\n",
" model.fit(x_b.fillna(x_b_na), y_b)\n",
" x_a_preds = model.predict_proba(x_a.fillna(x_a_na))\n",
" \n",
" #make new column of predictions for training set\n",
" x_a_df = pandas.DataFrame(x_a_preds[:,1], columns = [algorithm])\n",
" x_b_df = pandas.DataFrame(x_b_preds[:,1], columns = [algorithm])\n",
" new_df = pandas.concat([x_a_df, x_b_df], axis=0)\n",
" return new_df\n",
" \n",
" train_na, test_na = na_find(df_train, df_test, na)\n",
" model.fit(df_train.fillna(train_na), label)\n",
" test_preds = model.predict_proba(df_test.fillna(test_na))\n",
" \n",
" #make new column of predictions for test set\n",
" test_df = pandas.DataFrame(test_preds[:,1], columns = [algorithm])\n",
"\n",
" def combiner(algorithm):\n",
" model, params = sklearn_ml_model(algorithm)\n",
" #train_na, test_na = na_find(df_train, df_test, na)\n",
" model.fit(new_df, label)\n",
" final_preds = model.predict_proba(test_df)\n",
" return final_preds\n",
" \n",
" return combiner('logistic') #try logistic regression here\n",
"\n",
"\n",
"output = stacking('extra_trees', -999)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from __future__ import division\n",
"\n",
"def blending(na):\n",
" numpy.random.seed(0) # seed to shuffle the train set\n",
" \n",
" global label, df_train, df_test\n",
" #print len(df_train), len(label)\n",
" n_folds = 10\n",
" verbose = True\n",
" shuffle = False\n",
"\n",
" #X, y, X_submission = load_data.load()\n",
"\n",
" if shuffle:\n",
" idx = np.random.permutation(label.size)\n",
" df_train = df_train[idx]\n",
" label = label[idx]\n",
"\n",
" skf = list(cross_validation.StratifiedKFold(label, n_folds))\n",
"\n",
" clfs = [ensemble.RandomForestClassifier(n_estimators=100, n_jobs=3, criterion='gini')]#,\n",
" #ensemble.RandomForestClassifier(n_estimators=100, n_jobs=3, criterion='entropy'),\n",
" #ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=3, criterion='gini'),\n",
" #ensemble.ExtraTreesClassifier(n_estimators=100, n_jobs=3, criterion='entropy'),\n",
" #ensemble.GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=50)]\n",
"\n",
" print \"Creating train and test sets for blending.\"\n",
" \n",
" dataset_blend_train = numpy.zeros((df_train.shape[0], len(clfs)))\n",
" dataset_blend_test = numpy.zeros((df_test.shape[0], len(clfs)))\n",
" \n",
" for j, clf in enumerate(clfs):\n",
" print j, clf\n",
" dataset_blend_test_j = numpy.zeros((df_test.shape[0], len(skf)))\n",
" for i, (train, test) in enumerate(skf):\n",
" print \"Fold\", i\n",
" X_train = df_train.loc[train, :]\n",
" y_train = label[train]\n",
" X_test = df_train.loc[test, :]\n",
" y_test = label[test]\n",
" cal_clf = calibrated_classifier(X_train, y_train, '', na, clf)\n",
" cal_clf.fit(X_train.fillna(-999), y_train)\n",
" y_submission = cal_clf.predict_proba(X_test.fillna(-999))[:,1]\n",
" dataset_blend_train[test, j] = y_submission\n",
" dataset_blend_test_j[:, i] = cal_clf.predict_proba(df_test.fillna(-999))[:,1]\n",
" dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)\n",
"\n",
" print\n",
" print \"Blending.\"\n",
" clf = linear_model.LogisticRegression()\n",
" clf.fit(dataset_blend_train, label)\n",
" y_submission = clf.predict_proba(dataset_blend_test)[:,1]\n",
"\n",
" print \"Linear stretch of predictions to [0,1]\"\n",
" y_submission = (y_submission - y_submission.min()) / (y_submission.max() - y_submission.min())\n",
"\n",
" print \"Saving Results.\"\n",
" numpy.savetxt(fname='blend_cal.csv', X=y_submission, fmt='%0.9f')\n",
" numpy.savetxt(fname='dataset_blend_train_cal', X=dataset_blend_train, fmt='%0.9f')\n",
" numpy.savetxt(fname='dataset_blend_test_cal', X=dataset_blend_test, fmt='%0.9f')\n",
" return y_submission\n",
"\n",
"#~4 hours 3:37\n",
"train = blending(-999)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"a = numpy.loadtxt(fname='blend.csv')\n",
"\n",
"ids_df = pandas.DataFrame(ids)\n",
"\n",
"new_df = pandas.concat([ids_df, a], axis=1)\n",
"\n",
"new_df.to_csv(\"preds_blend\", index = False)\n",
"#still need first line of text"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}