#### linear regression
import os

import numpy as np
import pandas as pd

os.chdir("/Users/shuozhang/Desktop/data")
nycmodel = pd.read_csv('nycmodeldata.csv', sep='\t', index_col=False, dtype={'zipcode': 'S10'})

# one-hot encode zipcode, then drop the raw column and the stray CSV index column
add_dummies = pd.get_dummies(nycmodel['zipcode'])
add_dummies = add_dummies.applymap(int)  # np.int is deprecated; plain int works
nycmodel = pd.concat([nycmodel, add_dummies], axis=1)
nycmodel.drop(['zipcode', 'Unnamed: 0'], inplace=True, axis=1)

target = nycmodel[['count']]
data = nycmodel[[col for col in nycmodel.columns if col != 'count']]

# sklearn.cross_validation was renamed sklearn.model_selection in sklearn 0.18
import sklearn.cross_validation as cv
x_train, x_test, y_train, y_test = cv.train_test_split(data, target, test_size=0.2, random_state=0)

from sklearn import linear_model
ols = linear_model.LinearRegression()
ols.fit(x_train, y_train)
print('training R^2: %.2f' % ols.score(x_train, y_train))
print('testing R^2: %.2f' % ols.score(x_test, y_test))

from sklearn.metrics import mean_squared_error
print('training MSE:', mean_squared_error(y_train, ols.predict(x_train)))
print('testing MSE:', mean_squared_error(y_test, ols.predict(x_test)))

#### ridge regression
from __future__ import print_function
from __future__ import division

from bayes_opt import BayesianOptimization
from sklearn import linear_model
from sklearn.cross_validation import cross_val_score

data = x_train
target = y_train

#### Bayesian Optimization
def Ridgecv(alpha):
    # the 'mean_squared_error' scorer returns negated MSE, so maximizing it
    # minimizes the error ('neg_mean_squared_error' in sklearn 0.18+)
    return cross_val_score(linear_model.Ridge(alpha=float(alpha), random_state=2),
                           data, target, 'mean_squared_error', cv=5).mean()

if __name__ == "__main__":
    RidgeBO = BayesianOptimization(Ridgecv, {'alpha': (0, 8)})
    RidgeBO.maximize(init_points=2, n_iter=10)
    print('Final Results')
    print('Ridge: %f' % RidgeBO.res['max']['max_val'])  # bayes_opt <= 0.6 API

# refit with the best alpha found by the search
ridge = linear_model.Ridge(alpha=0.3985)
ridge.fit(x_train, y_train)
print('training R^2: %.2f' % ridge.score(x_train, y_train))
print('testing R^2: %.2f' % ridge.score(x_test, y_test))
print('training MSE:', mean_squared_error(y_train, ridge.predict(x_train)))
print('testing MSE:', mean_squared_error(y_test, ridge.predict(x_test)))

#### randomforest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

RFR = RandomForestRegressor(max_features=14, n_estimators=300)
RFR.fit(x_train, y_train)
print('training MSE:', mean_squared_error(y_train, RFR.predict(x_train)))
print('testing MSE:', mean_squared_error(y_test, RFR.predict(x_test)))

RFR1 = RandomForestRegressor(max_features=14, n_estimators=500)
RFR1.fit(x_train, y_train)
print('training MSE:', mean_squared_error(y_train, RFR1.predict(x_train)))
print('testing MSE:', mean_squared_error(y_test, RFR1.predict(x_test)))

#### xgboost
#### Bayesian Optimization
from __future__ import print_function
from __future__ import division

import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.cross_validation import cross_val_score

def xgboostcv(max_depth, learning_rate, n_estimators, gamma, min_child_weight,
              subsample, colsample_bytree, silent=True, nthread=-1):
    return cross_val_score(xgb.XGBRegressor(max_depth=int(max_depth),
                                            learning_rate=learning_rate,
                                            n_estimators=int(n_estimators),
                                            silent=silent,
                                            nthread=nthread,
                                            gamma=gamma,
                                            min_child_weight=min_child_weight,
                                            subsample=subsample,
                                            colsample_bytree=colsample_bytree),
                           x_train, y_train, 'mean_squared_error', cv=5).mean()

if __name__ == "__main__":
    xgboostBO = BayesianOptimization(xgboostcv,
                                     {'max_depth': (3, 14),
                                      'learning_rate': (0.01, 0.2),
                                      'n_estimators': (50, 1000),
                                      'gamma': (0.01, 1.),  # bounds must be (low, high)
                                      'min_child_weight': (1, 10),
                                      'subsample': (0.5, 1),
                                      'colsample_bytree': (0.5, 1)})
    xgboostBO.maximize(init_points=2, n_iter=28)
    print('-' * 53)
    print('Final Results')
    print('XGBOOST: %f' % xgboostBO.res['max']['max_val'])
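#### note: newer API equivalents (sketch, not part of the original pipeline)
# The searches above target sklearn < 0.18 (sklearn.cross_validation) and
# bayes_opt <= 0.6 (.res['max']). A minimal sketch of the same ridge search
# against the newer interfaces, assuming sklearn >= 0.18 and bayes_opt 1.x:
from bayes_opt import BayesianOptimization
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score  # replaces sklearn.cross_validation

def ridge_cv_new(alpha):
    # 'neg_mean_squared_error' replaces 'mean_squared_error'; both return
    # negated MSE, so maximizing still minimizes the error
    return cross_val_score(Ridge(alpha=float(alpha), random_state=2),
                           data, target, scoring='neg_mean_squared_error', cv=5).mean()

optimizer = BayesianOptimization(f=ridge_cv_new, pbounds={'alpha': (0, 8)}, random_state=1)
optimizer.maximize(init_points=2, n_iter=10)
print(optimizer.max)  # {'target': ..., 'params': ...} replaces .res['max']['max_val']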
# refit xgboost with the best hyperparameters found by the search
XGB = xgb.XGBRegressor(max_depth=14, learning_rate=0.1186, n_estimators=463, silent=True,
                       nthread=-1, gamma=1.0, min_child_weight=6.1929, subsample=0.9675,
                       colsample_bytree=0.8544)
XGB.fit(x_train, y_train)  # fit on the training set only; refitting on x_test would leak test labels

from sklearn.metrics import mean_squared_error
print('training MSE:', mean_squared_error(y_train, XGB.predict(x_train)))
print('testing MSE:', mean_squared_error(y_test, XGB.predict(x_test)))

#### feature importance
feature_importance = list(zip(x_train.columns, RFR.feature_importances_))
dtype = [('feature', 'S10'), ('importance', 'float')]
feature_importance = np.array(feature_importance, dtype=dtype)
feature_sort = np.sort(feature_importance, order='importance')[::-1]
df = pd.DataFrame(feature_sort[:20])  # keep the 20 most important features

import pylab as plt

x = np.arange(1, 21)
y = df['importance']
LABELS = df['feature']
plt.figure()
plt.bar(x, y, align='center')
plt.xticks(x, LABELS)
plt.xlabel('Feature')
plt.ylabel('RFR Importance')
plt.title('RFR importance analysis of top 20 features')
plt.show()

#### ensemble: use linear regression to combine two models, randomforest and xgboost
pred_y_train_rf = pd.DataFrame(RFR.predict(x_train), columns=['pred_y_trainrf'])
pred_y_test_rf = pd.DataFrame(RFR.predict(x_test), columns=['pred_y_testrf'])
pred_y_train_xgb = pd.DataFrame(XGB.predict(x_train), columns=['pred_y_trainxgb'])
pred_y_test_xgb = pd.DataFrame(XGB.predict(x_test), columns=['pred_y_testxgb'])

from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import math

# fit the meta-model on the two base models' training predictions
pred_y_train_com = pd.concat([pred_y_train_rf, pred_y_train_xgb], axis=1)
ols = linear_model.LinearRegression(fit_intercept=False)
ols.fit(pred_y_train_com, y_train)
print('training R^2: %.2f' % ols.score(pred_y_train_com, y_train))
print('training RMSE:', math.sqrt(mean_squared_error(y_train, ols.predict(pred_y_train_com))))

# combine the test-set predictions with the fitted weights
pred_y_test_com = pd.concat([pred_y_test_rf, pred_y_test_xgb], axis=1)
pred_y_ensemble = ols.predict(pred_y_test_com)
print('testing RMSE:', math.sqrt(mean_squared_error(y_test, pred_y_ensemble)))

# align y_test's index with the RangeIndex of the prediction frames before concatenating
pred_y_final = pd.concat([y_test.reset_index(drop=True), pred_y_test_rf, pred_y_test_xgb,
                          pd.DataFrame(pred_y_ensemble, columns=['pred_y_ensemble'])], axis=1)
pred_y_final = pred_y_final.applymap(int)
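#### note: out-of-fold stacking (sketch, an alternative not in the original pipeline)
# The combiner above is fit on predictions the base models made for rows they
# were trained on, so its weights can favor whichever model overfits more. A
# common alternative is to fit the combiner on out-of-fold predictions; a
# minimal sketch, assuming sklearn >= 0.18:
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LinearRegression

# each row is predicted by a clone of the model that never saw it during fitting
oof_rf = cross_val_predict(RFR, x_train, y_train.values.ravel(), cv=5)
oof_xgb = cross_val_predict(XGB, x_train, y_train.values.ravel(), cv=5)
stack_train = np.column_stack([oof_rf, oof_xgb])

combiner = LinearRegression(fit_intercept=False)
combiner.fit(stack_train, y_train)
print('blend weights:', combiner.coef_)  # relative contribution of RFR and XGB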