Created
August 11, 2016 10:56
-
-
Save vvarma/b9acc635b687b1b6748b69813493b4bf to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Importing a few necessary libraries | |
| import matplotlib.pyplot as pl | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn import datasets | |
| from sklearn.cross_validation import train_test_split | |
| from sklearn.tree import DecisionTreeRegressor | |
| from sklearn.metrics import make_scorer, mean_squared_error | |
| from sklearn.grid_search import GridSearchCV | |
| from sklearn import tree | |
| from sklearn.externals.six import StringIO | |
| import pydot | |
# Create our client's feature set for which we will be predicting a selling price
# (one row of 13 values, in the same column order as the Boston Housing features)
CLIENT_FEATURES = [[11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]]
# Load the Boston Housing dataset into the city_data variable
city_data = datasets.load_boston()
# Initialize the housing prices (regression target) and housing features (input matrix)
housing_prices = city_data.target
housing_features = city_data.data
print "Boston Housing dataset loaded successfully!"
| # # Statistical Analysis and Data Exploration | |
| total_houses, total_features = housing_features.shape | |
| # Minimum housing value in the dataset | |
| minimum_price = housing_prices.min(axis=0) | |
| # Maximum housing value in the dataset | |
| maximum_price = housing_prices.max(axis=0) | |
| # Mean house value of the dataset | |
| mean_price = housing_prices.mean(axis=0) | |
| # Median house value of the dataset | |
| median_price = None | |
| sorted_prices = housing_prices.copy() # I was sorting housing prices all along | |
| sorted_prices.sort(axis=0) | |
| if (total_houses % 2 == 0): | |
| median_price = (sorted_prices[total_houses / 2 - 1] + sorted_prices[total_houses / 2]) / 2 | |
| else: | |
| median_price = sorted_prices[total_houses / 2] | |
| # Standard deviation of housing values of the dataset | |
| std_dev = housing_prices.std(axis=0) | |
| # Show the calculated statistics | |
| print "Boston Housing dataset statistics (in $1000's):\n" | |
| print "Total number of houses:", total_houses | |
| print "Total number of features:", total_features | |
| print "Minimum house price:", minimum_price | |
| print "Maximum house price:", maximum_price | |
| print "Mean house price: {0:.3f}".format(mean_price) | |
| print "Median house price:", median_price | |
| print "Standard deviation of house price: {0:.3f}".format(std_dev) | |
| client_feat = pd.DataFrame(CLIENT_FEATURES, columns=city_data.feature_names) | |
def shuffle_split_data(X, y):
    """ Shuffles and splits data into 70% training and 30% testing subsets,
        then returns the training and testing subsets. """
    # train_test_split shuffles before splitting; hold out 30% for testing
    split = train_test_split(X, y, test_size=0.3)
    X_train, X_test, y_train, y_test = split
    # NOTE: return order is both training subsets first, then both testing
    # subsets -- different from train_test_split's own output order.
    return X_train, y_train, X_test, y_test
| # Test shuffle_split_data | |
| try: | |
| X_train, y_train, X_test, y_test = shuffle_split_data(housing_features, housing_prices) | |
| print "Successfully shuffled and split the data!" | |
| except Exception, e: | |
| print "Something went wrong with shuffling and splitting the data." + str(e) | |
def performance_metric(y_true, y_predict):
    """ Calculates and returns the total error between true and predicted values
        based on a performance metric chosen by the student. """
    # Chosen metric: mean squared error between truth and prediction
    return mean_squared_error(y_true, y_predict)
| # Test performance_metric | |
| try: | |
| total_error = performance_metric(y_train, y_train) | |
| print "Successfully performed a metric calculation!" | |
| except Exception, e: | |
| print "Something went wrong with performing a metric calculation." + str(e) | |
def fit_model(X, y):
    """ Tunes a decision tree regressor model using GridSearchCV on the input data X
        and target labels y and returns this optimal model. """
    # Create a decision tree regressor object
    regressor = DecisionTreeRegressor()

    # Set up the parameters we wish to tune
    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}

    # Make an appropriate scoring function. greater_is_better=False makes
    # make_scorer negate the MSE so GridSearchCV's "higher score wins"
    # convention selects the model with the LOWEST error.
    scoring_function = make_scorer(mean_squared_error, greater_is_better=False)

    # BUG FIX: the original built scoring_function but never used it, passing
    # the deprecated string 'mean_squared_error' to GridSearchCV instead.
    # Pass the explicit scorer as clearly intended.
    reg = GridSearchCV(regressor, param_grid=parameters, scoring=scoring_function, verbose=0)

    # Fit the learner to the data to obtain the optimal model with tuned parameters
    reg.fit(X, y)
    return reg.best_estimator_
| # Test fit_model on entire dataset | |
| try: | |
| reg = fit_model(housing_features, housing_prices) | |
| dot_data = StringIO() | |
| tree.export_graphviz(reg, out_file=dot_data) | |
| graph = pydot.graph_from_dot_data(dot_data.getvalue()) | |
| graph.write_pdf("boston.pdf") | |
| print reg.feature_importances_ | |
| print reg | |
| print "Successfully fit a model!" | |
| except Exception, e: | |
| print "Something went wrong with fitting a model." + str(e) | |
| def learning_curves(X_train, y_train, X_test, y_test): | |
| """ Calculates the performance of several models with varying sizes of training data. | |
| The learning and testing error rates for each model are then plotted. """ | |
| print "Creating learning curve graphs for max_depths of 1, 3, 6, and 10. . ." | |
| # Create the figure window | |
| fig = pl.figure(figsize=(10, 8)) | |
| # We will vary the training set size so that we have 50 different sizes | |
| sizes = np.rint(np.linspace(1, len(X_train), 50)).astype(int) | |
| train_err = np.zeros(len(sizes)) | |
| test_err = np.zeros(len(sizes)) | |
| # Create four different models based on max_depth | |
| for k, depth in enumerate([1, 3, 6, 10]): | |
| for i, s in enumerate(sizes): | |
| # Setup a decision tree regressor so that it learns a tree with max_depth = depth | |
| regressor = DecisionTreeRegressor(max_depth=depth) | |
| # Fit the learner to the training data | |
| regressor.fit(X_train[:s], y_train[:s]) | |
| # Find the performance on the training set | |
| train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s])) | |
| # Find the performance on the testing set | |
| test_err[i] = performance_metric(y_test, regressor.predict(X_test)) | |
| # Subplot the learning curve graph | |
| ax = fig.add_subplot(2, 2, k + 1) | |
| ax.plot(sizes, test_err, lw=2, label='Testing Error') | |
| ax.plot(sizes, train_err, lw=2, label='Training Error') | |
| ax.legend() | |
| ax.set_title('max_depth = %s' % (depth)) | |
| ax.set_xlabel('Number of Data Points in Training Set') | |
| ax.set_ylabel('Total Error') | |
| ax.set_xlim([0, len(X_train)]) | |
| # Visual aesthetics | |
| fig.suptitle('Decision Tree Regressor Learning Performances', fontsize=18, y=1.03) | |
| fig.tight_layout() | |
| fig.show() | |
| def model_complexity(X_train, y_train, X_test, y_test): | |
| """ Calculates the performance of the model as model complexity increases. | |
| The learning and testing errors rates are then plotted. """ | |
| print "Creating a model complexity graph. . . " | |
| # We will vary the max_depth of a decision tree model from 1 to 14 | |
| max_depth = np.arange(1, 14) | |
| train_err = np.zeros(len(max_depth)) | |
| test_err = np.zeros(len(max_depth)) | |
| for i, d in enumerate(max_depth): | |
| # Setup a Decision Tree Regressor so that it learns a tree with depth d | |
| regressor = DecisionTreeRegressor(max_depth=d) | |
| # Fit the learner to the training data | |
| regressor.fit(X_train, y_train) | |
| # Find the performance on the training set | |
| train_err[i] = performance_metric(y_train, regressor.predict(X_train)) | |
| # Find the performance on the testing set | |
| test_err[i] = performance_metric(y_test, regressor.predict(X_test)) | |
| # Plot the model complexity graph | |
| pl.figure(figsize=(7, 5)) | |
| pl.title('Decision Tree Regressor Complexity Performance') | |
| pl.plot(max_depth, test_err, lw=2, label='Testing Error') | |
| pl.plot(max_depth, train_err, lw=2, label='Training Error') | |
| pl.legend() | |
| pl.xlabel('Maximum Depth') | |
| pl.ylabel('Total Error') | |
| pl.show() | |
# # Analyzing Model Performance
# Plot error vs. training-set size for several depths, then error vs. depth
learning_curves(X_train, y_train, X_test, y_test)
model_complexity(X_train, y_train, X_test, y_test)
# Predict the client's selling price with the GridSearchCV-tuned tree
sale_price = reg.predict(CLIENT_FEATURES)
print "Predicted value of client's home: {0:.3f}".format(sale_price[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment