Last active
August 15, 2018 21:46
-
-
Save accessnash/3c9086ff3a03c25faac7fd2842ea614d to your computer and use it in GitHub Desktop.
Machine Learning with Tree-Based Models in Python: Ch 2: Bias-variance trade-off, Ensemble learning - Datacamp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import train_test_split
# DecisionTreeRegressor is used below but was never imported in this script.
from sklearn.tree import DecisionTreeRegressor

# Seed shared by the split and the estimators so that runs are reproducible.
SEED = 1

# NOTE(review): X and y are not defined anywhere in this script; they are
# presumably preloaded by the course environment -- confirm before running.
# Hold out 30% of the data as the test set; the remaining 70% is for training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED
)

# Regression tree dt, limited to depth 4. A float min_samples_leaf is read by
# scikit-learn as a fraction of the training samples, so each leaf must hold
# at least 26% of them.
dt = DecisionTreeRegressor(max_depth=4, min_samples_leaf=0.26, random_state=SEED)
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of dt on the training set. The scorer
# 'neg_mean_squared_error' yields negated MSEs, hence the leading minus sign
# to recover one positive MSE per fold. n_jobs=-1 runs the folds in parallel.
MSE_CV_scores = -cross_val_score(
    dt,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# CV RMSE: square root of the mean of the per-fold MSEs.
RMSE_CV = MSE_CV_scores.mean() ** 0.5

print('CV RMSE: {:.2f}'.format(RMSE_CV))
from sklearn.metrics import mean_squared_error as MSE

# Fit dt on the full training set and predict that same set, so the error
# below is the training error (to be compared with the CV error).
dt.fit(X_train, y_train)
y_pred_train = dt.predict(X_train)

# Training RMSE. The exponent is written as 0.5 rather than (1/2): under
# Python 2 integer division 1/2 evaluates to 0, which would make this 1.0.
RMSE_train = MSE(y_train, y_pred_train) ** 0.5

print('Train RMSE: {:.2f}'.format(RMSE_train))
# Ensemble learning
from sklearn.linear_model import LogisticRegression
# accuracy_score is used below but was never imported in this script.
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): the classifiers below are fitted on whatever X_train/y_train
# are in scope. The split earlier in this script feeds a regressor, so this
# section presumably expects a classification dataset to be loaded instead --
# confirm before running the script top to bottom.

# Three individual classifiers to compare.
lr = LogisticRegression(random_state=SEED)
knn = KNeighborsClassifier(n_neighbors=27)
# Rebinds dt: from here on it is a classification tree, not the regressor.
dt = DecisionTreeClassifier(min_samples_leaf=0.13, random_state=SEED)

# (display name, estimator) pairs.
classifiers = [
    ('Logistic Regression', lr),
    ('K Nearest Neighbours', knn),
    ('Classification Tree', dt),
]

# Fit each classifier on the training set and report its test-set accuracy.
for clf_name, clf in classifiers:
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('{:s} : {:.3f}'.format(clf_name, accuracy))
from sklearn.ensemble import VotingClassifier
# accuracy_score is used below but was never imported in this script.
from sklearn.metrics import accuracy_score

# Combine the (name, estimator) pairs in `classifiers` into one voting
# ensemble; no voting mode is passed, so scikit-learn's default applies.
vc = VotingClassifier(estimators=classifiers)

# Fit the ensemble on the training set and predict the test set.
vc.fit(X_train, y_train)
y_pred = vc.predict(X_test)

# Test-set accuracy of the ensemble.
accuracy = accuracy_score(y_test, y_pred)
print('Voting Classifier: {:.3f}'.format(accuracy))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment