Skip to content

Instantly share code, notes, and snippets.

@vvarma
Created August 11, 2016 10:56
Show Gist options
  • Select an option

  • Save vvarma/b9acc635b687b1b6748b69813493b4bf to your computer and use it in GitHub Desktop.

Select an option

Save vvarma/b9acc635b687b1b6748b69813493b4bf to your computer and use it in GitHub Desktop.
# Importing a few necessary libraries
import matplotlib.pyplot as pl
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.grid_search import GridSearchCV
from sklearn import tree
from sklearn.externals.six import StringIO
import pydot
# Create our client's feature set for which we will be predicting a selling price
# (one row of the 13 Boston Housing features, in dataset column order).
CLIENT_FEATURES = [[11.95, 0.00, 18.100, 0, 0.6590, 5.6090, 90.00, 1.385, 24, 680.0, 20.20, 332.09, 12.13]]

# Load the Boston Housing dataset into the city_data variable
city_data = datasets.load_boston()
# Initialize the housing prices (targets) and housing features
housing_prices = city_data.target
housing_features = city_data.data
# Single-argument print(...) produces identical output under Python 2 and 3,
# unlike the Python-2-only "print ..." statement form used originally.
print("Boston Housing dataset loaded successfully!")
# # Statistical Analysis and Data Exploration
total_houses, total_features = housing_features.shape

# Minimum housing value in the dataset
minimum_price = housing_prices.min(axis=0)
# Maximum housing value in the dataset
maximum_price = housing_prices.max(axis=0)
# Mean house value of the dataset
mean_price = housing_prices.mean(axis=0)
# Median house value of the dataset. np.median handles both even-length
# (average of the two middle values) and odd-length cases, replacing the
# manual copy/sort/index implementation.
median_price = np.median(housing_prices)
# Standard deviation of housing values of the dataset
std_dev = housing_prices.std(axis=0)

# Show the calculated statistics. Each print takes a single pre-formatted
# string, so the output is byte-identical under Python 2 and Python 3.
print("Boston Housing dataset statistics (in $1000's):\n")
print("Total number of houses: {0}".format(total_houses))
print("Total number of features: {0}".format(total_features))
print("Minimum house price: {0}".format(minimum_price))
print("Maximum house price: {0}".format(maximum_price))
print("Mean house price: {0:.3f}".format(mean_price))
print("Median house price: {0}".format(median_price))
print("Standard deviation of house price: {0:.3f}".format(std_dev))
client_feat = pd.DataFrame(CLIENT_FEATURES, columns=city_data.feature_names)
def shuffle_split_data(X, y, random_state=None):
    """Shuffle and split data into 70% training and 30% testing subsets.

    Args:
        X: feature matrix.
        y: target values aligned with X.
        random_state: optional seed forwarded to train_test_split so the
            split can be made reproducible. The default of None preserves
            the original non-deterministic behaviour for existing callers.

    Returns:
        Tuple (X_train, y_train, X_test, y_test). Note the ordering:
        train_test_split returns (X_train, X_test, y_train, y_test) and
        this function deliberately reorders the pairs, as callers expect.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=random_state)
    return X_train, y_train, X_test, y_test
# Test shuffle_split_data. "except Exception, e" is Python-2-only syntax;
# "except Exception as e" is valid on Python 2.6+ and Python 3 alike.
try:
    X_train, y_train, X_test, y_test = shuffle_split_data(housing_features, housing_prices)
    print("Successfully shuffled and split the data!")
except Exception as e:
    print("Something went wrong with shuffling and splitting the data." + str(e))
def performance_metric(y_true, y_predict):
    """Return the mean squared error between true and predicted values.

    Computed directly with numpy — mean of squared differences — which is
    equivalent to sklearn.metrics.mean_squared_error for 1-D inputs but
    avoids pulling in sklearn for a one-line calculation.

    Args:
        y_true: array-like of true target values.
        y_predict: array-like of predicted values, same length as y_true.

    Returns:
        The mean squared error as a (numpy) float.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_predict = np.asarray(y_predict, dtype=float)
    return np.mean((y_true - y_predict) ** 2)
# Test performance_metric (comparing y_train with itself should give zero
# error). "except ... as e" replaces the Python-2-only comma syntax.
try:
    total_error = performance_metric(y_train, y_train)
    print("Successfully performed a metric calculation!")
except Exception as e:
    print("Something went wrong with performing a metric calculation." + str(e))
def fit_model(X, y):
    """Tune a DecisionTreeRegressor with GridSearchCV and return the best model.

    Args:
        X: training feature matrix.
        y: target values aligned with X.

    Returns:
        The fitted DecisionTreeRegressor with the best max_depth found by
        the grid search (reg.best_estimator_).
    """
    # Create a decision tree regressor object
    regressor = DecisionTreeRegressor()
    # Candidate tree depths to search over
    parameters = {'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)}
    # MSE is an error (lower is better); greater_is_better=False negates it
    # so GridSearchCV, which always maximizes its score, minimizes the error.
    scoring_function = make_scorer(mean_squared_error, greater_is_better=False)
    # BUG FIX: scoring_function was previously built but never used — the
    # string 'mean_squared_error' was passed instead, silently bypassing the
    # greater_is_better handling above. Pass the scorer object itself.
    reg = GridSearchCV(regressor, param_grid=parameters,
                       scoring=scoring_function, verbose=0)
    # Fit the learner to the data to obtain the optimal model with tuned parameters
    reg.fit(X, y)
    return reg.best_estimator_
# Test fit_model on the entire dataset, then export the fitted tree to a PDF
# via graphviz/pydot for visual inspection.
try:
    reg = fit_model(housing_features, housing_prices)
    dot_data = StringIO()
    tree.export_graphviz(reg, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("boston.pdf")
    print(reg.feature_importances_)
    print(reg)
    print("Successfully fit a model!")
except Exception as e:
    print("Something went wrong with fitting a model." + str(e))
def learning_curves(X_train, y_train, X_test, y_test):
    """Plot learning curves (error vs. training-set size) for four tree depths.

    For max_depth in {1, 3, 6, 10}, a DecisionTreeRegressor is fit on growing
    prefixes of the training data; training and testing errors are plotted in
    a 2x2 grid of subplots, one subplot per depth.
    """
    print("Creating learning curve graphs for max_depths of 1, 3, 6, and 10. . .")
    # Create the figure window
    fig = pl.figure(figsize=(10, 8))
    # 50 training-set sizes, evenly spaced from 1 sample up to the full set
    sizes = np.rint(np.linspace(1, len(X_train), 50)).astype(int)
    train_err = np.zeros(len(sizes))
    test_err = np.zeros(len(sizes))
    for k, depth in enumerate([1, 3, 6, 10]):
        # The hyper-parameter is fixed per subplot, so construct the
        # regressor once per depth instead of once per training size;
        # each .fit() call below retrains it from scratch regardless.
        regressor = DecisionTreeRegressor(max_depth=depth)
        for i, s in enumerate(sizes):
            # Fit the learner on the first s training points
            regressor.fit(X_train[:s], y_train[:s])
            # Error on the (truncated) training set
            train_err[i] = performance_metric(y_train[:s], regressor.predict(X_train[:s]))
            # Error on the full testing set
            test_err[i] = performance_metric(y_test, regressor.predict(X_test))
        # Subplot the learning curve graph for this depth
        ax = fig.add_subplot(2, 2, k + 1)
        ax.plot(sizes, test_err, lw=2, label='Testing Error')
        ax.plot(sizes, train_err, lw=2, label='Training Error')
        ax.legend()
        ax.set_title('max_depth = %s' % (depth))
        ax.set_xlabel('Number of Data Points in Training Set')
        ax.set_ylabel('Total Error')
        ax.set_xlim([0, len(X_train)])
    # Visual aesthetics
    fig.suptitle('Decision Tree Regressor Learning Performances', fontsize=18, y=1.03)
    fig.tight_layout()
    fig.show()
def model_complexity(X_train, y_train, X_test, y_test):
    """Plot training/testing error as tree depth (model complexity) grows.

    Fits one DecisionTreeRegressor per max_depth in 1..13 on the full
    training set and plots both error curves against depth.
    """
    print("Creating a model complexity graph. . . ")
    # Vary the max_depth of the decision tree model from 1 to 13 inclusive
    max_depth = np.arange(1, 14)
    train_err = np.zeros(len(max_depth))
    test_err = np.zeros(len(max_depth))
    for i, d in enumerate(max_depth):
        # Setup and fit a Decision Tree Regressor of depth d
        regressor = DecisionTreeRegressor(max_depth=d)
        regressor.fit(X_train, y_train)
        # Error on the training set
        train_err[i] = performance_metric(y_train, regressor.predict(X_train))
        # Error on the testing set
        test_err[i] = performance_metric(y_test, regressor.predict(X_test))
    # Plot the model complexity graph
    pl.figure(figsize=(7, 5))
    pl.title('Decision Tree Regressor Complexity Performance')
    pl.plot(max_depth, test_err, lw=2, label='Testing Error')
    pl.plot(max_depth, train_err, lw=2, label='Training Error')
    pl.legend()
    pl.xlabel('Maximum Depth')
    pl.ylabel('Total Error')
    pl.show()
# # Analyzing Model Performance
learning_curves(X_train, y_train, X_test, y_test)
model_complexity(X_train, y_train, X_test, y_test)
# Predict the selling price of the client's home with the tuned model.
# reg is the best estimator produced by the fit_model try-block above.
sale_price = reg.predict(CLIENT_FEATURES)
print("Predicted value of client's home: {0:.3f}".format(sale_price[0]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment