Skip to content

Instantly share code, notes, and snippets.

@vvarma
Created August 11, 2016 10:56
Show Gist options
  • Select an option

  • Save vvarma/b9acc635b687b1b6748b69813493b4bf to your computer and use it in GitHub Desktop.

Select an option

Save vvarma/b9acc635b687b1b6748b69813493b4bf to your computer and use it in GitHub Desktop.
# Importing a few necessary libraries
import matplotlib.pyplot as pl
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn import tree
from sklearn.externals.six import StringIO
import pydot
# Create our client's feature set for which we will be predicting a selling price
# (one row of the 13 Boston Housing features, in dataset column order).
CLIENT_FEATURES = [[11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]]

# Load the Boston Housing dataset into the city_data variable
city_data = datasets.load_boston()
# Initialize the housing prices (targets) and housing features
housing_prices = city_data.target
housing_features = city_data.data
# Single-argument print(...) produces identical output under Python 2 and 3,
# unlike the Python-2-only "print ..." statement form used originally.
print("Boston Housing dataset loaded successfully!")
# # Statistical Analysis and Data Exploration
total_houses, total_features = housing_features.shape

# Minimum housing value in the dataset
minimum_price = housing_prices.min(axis=0)
# Maximum housing value in the dataset
maximum_price = housing_prices.max(axis=0)
# Mean house value of the dataset
mean_price = housing_prices.mean(axis=0)
# Median house value of the dataset. np.median handles both even-length
# (average of the two middle values) and odd-length cases, replacing the
# manual copy/sort/index implementation.
median_price = np.median(housing_prices)
# Standard deviation of housing values of the dataset
std_dev = housing_prices.std(axis=0)

# Show the calculated statistics. Each print takes a single pre-formatted
# string, so the output is byte-identical under Python 2 and Python 3.
print("Boston Housing dataset statistics (in $1000's):\n")
print("Total number of houses: {0}".format(total_houses))
print("Total number of features: {0}".format(total_features))
print("Minimum house price: {0}".format(minimum_price))
print("Maximum house price: {0}".format(maximum_price))
print("Mean house price: {0:.3f}".format(mean_price))
print("Median house price: {0}".format(median_price))
print("Standard deviation of house price: {0:.3f}".format(std_dev))
client_feat = pd.DataFrame(CLIENT_FEATURES, columns=city_data.feature_names)
def shuffle_split_data(X, y, random_state=None):
    """Shuffle and split data into 70% training and 30% testing subsets.

    Args:
        X: feature matrix.
        y: target values aligned with X.
        random_state: optional seed forwarded to train_test_split so the
            split can be made reproducible. The default of None preserves
            the original non-deterministic behaviour for existing callers.

    Returns:
        Tuple (X_train, y_train, X_test, y_test). Note the ordering:
        train_test_split returns (X_train, X_test, y_train, y_test) and
        this function deliberately reorders the pairs, as callers expect.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state)
    return X_train, y_train, X_test, y_test
# Test shuffle_split_data. "except Exception, e" is Python-2-only syntax;
# "except Exception as e" is valid on Python 2.6+ and Python 3 alike.
try:
    X_train, y_train, X_test, y_test = shuffle_split_data(housing_features, housing_prices)
    print("Successfully shuffled and split the data!")
except Exception as e:
    print("Something went wrong with shuffling and splitting the data." + str(e))
def performance_metric(y_true, y_predict):
    """Return the mean squared error between true and predicted values.

    Computed directly with numpy — mean of squared differences — which is
    equivalent to sklearn.metrics.mean_squared_error for 1-D inputs but
    avoids pulling in sklearn for a one-line calculation.

    Args:
        y_true: array-like of true target values.
        y_predict: array-like of predicted values, same length as y_true.

    Returns:
        The mean squared error as a (numpy) float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_predict = np.asarray(y_predict, dtype=float)
    return np.mean((y_true - y_predict) ** 2)
# Test performance_metric (comparing y_train with itself should give zero
# error). "except ... as e" replaces the Python-2-only comma syntax.
try:
    total_error = performance_metric(y_train, y_train)
    print("Successfully performed a metric calculation!")
except Exception as e:
    print("Something went wrong with performing a metric calculation." + str(e))
def fit_model(X, y):
    """Tune a DecisionTreeRegressor with GridSearchCV and return the best model.

    Args:
        X: training feature matrix.
        y: target values aligned with X.

    Returns:
        The fitted DecisionTreeRegressor with the best max_depth found by
        the grid search (reg.best_estimator_).
    """
    # Create a decision tree regressor object
    regressor = DecisionTreeRegressor()
    # Candidate tree depths to search over
    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}
    # MSE is an error (lower is better); greater_is_better=False negates it
    # so GridSearchCV, which always maximizes its score, minimizes the error.
    scoring_function = make_scorer(mean_squared_error, greater_is_better=False)
    # BUG FIX: scoring_function was previously built but never used — the
    # string 'mean_squared_error' was passed instead, silently bypassing the
    # greater_is_better handling above. Pass the scorer object itself.
    reg = GridSearchCV(regressor, param_grid=parameters,
                       scoring=scoring_function, verbose=0)
    # Fit the learner to the data to obtain the optimal model with tuned parameters
    reg.fit(X, y)
    return reg.best_estimator_
# Test fit_model on the entire dataset, then export the fitted tree to a PDF
# via graphviz/pydot for visual inspection.
try:
    reg = fit_model(housing_features, housing_prices)
    dot_data = StringIO()
    tree.export_graphviz(reg, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("boston.pdf")
    print(reg.feature_importances_)
    print(reg)
    print("Successfully fit a model!")
except Exception as e:
    print("Something went wrong with fitting a model." + str(e))
def learning_curves(X_train, y_train, X_test, y_test):
    """Plot learning curves (error vs. training-set size) for four tree depths.

    For max_depth in {1, 3, 6, 10}, a DecisionTreeRegressor is fit on growing
    prefixes of the training data; training and testing errors are plotted in
    a 2x2 grid of subplots, one subplot per depth.
    """
    print("Creating learning curve graphs for max_depths of 1, 3, 6, and 10. . .")
    # Create the figure window
    fig = pl.figure(figsize=(10, 8))
    # 50 training-set sizes, evenly spaced from 1 sample up to the full set
    sizes = np.rint(np.linspace(1, len(X_train), 50)).astype(int)
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))
    for k, depth in enumerate([1, 3, 6, 10]):
        # The hyper-parameter is fixed per subplot, so construct the
        # regressor once per depth instead of once per training size;
        # each .fit() call below retrains it from scratch regardless.
        regressor = DecisionTreeRegressor(max_depth=depth)
        for i, s in enumerate(sizes):
            # Fit the learner on the first s training points
            regressor.fit(X_train[:s], y_train[:s])
            # Error on the (truncated) training set
            train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))
            # Error on the full testing set
            test_err[i] = performance_metric(y_test, regressor.predict(X_test))
        # Subplot the learning curve graph for this depth
        ax = fig.add_subplot(2, 2, k + 1)
        ax.plot(sizes, test_err, lw=2, label='Testing Error')
        ax.plot(sizes, train_err, lw=2, label='Training Error')
        ax.legend()
        ax.set_title('max_depth = %s' % (depth))
        ax.set_xlabel('Number of Data Points in Training Set')
        ax.set_ylabel('Total Error')
        ax.set_xlim([0, len(X_train)])
    # Visual aesthetics
    fig.suptitle('Decision Tree Regressor Learning Performances', fontsize=18, y=1.03)
    fig.tight_layout()
    fig.show()
def model_complexity(X_train, y_train, X_test, y_test):
    """Plot training/testing error as tree depth (model complexity) grows.

    Fits one DecisionTreeRegressor per max_depth in 1..13 on the full
    training set and plots both error curves against depth.
    """
    print("Creating a model complexity graph. . . ")
    # Vary the max_depth of the decision tree model from 1 to 13 inclusive
    max_depth = np.arange(1, 14)
    train_err = np.zeros(len(max_depth))
    test_err = np.zeros(len(max_depth))
    for i, d in enumerate(max_depth):
        # Setup and fit a Decision Tree Regressor of depth d
        regressor = DecisionTreeRegressor(max_depth=d)
        regressor.fit(X_train, y_train)
        # Error on the training set
        train_err[i] = performance_metric(y_train, regressor.predict(X_train))
        # Error on the testing set
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))
    # Plot the model complexity graph
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    pl.plot(max_depth, test_err, lw=2, label='Testing Error')
    pl.plot(max_depth, train_err, lw=2, label='Training Error')
    pl.legend()
    pl.xlabel('Maximum Depth')
    pl.ylabel('Total Error')
    pl.show()
# # Analyzing Model Performance
learning_curves(X_train, y_train, X_test, y_test)
model_complexity(X_train, y_train, X_test, y_test)
# Predict the selling price of the client's home with the tuned model.
# reg is the best estimator produced by the fit_model try-block above.
sale_price = reg.predict(CLIENT_FEATURES)
print("Predicted value of client's home: {0:.3f}".format(sale_price[0]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment