Decision Tree
Decision trees are supervised, probabilistic machine learning classifiers that are often used as decision support tools. Like any other classifier, they can predict the label of a sample, and they do so by examining the probabilistic outcomes of the sample's features.
Decision trees are one of the oldest and most widely used machine learning algorithms, perhaps even pre-dating machine learning itself; following a sequence of cause-and-effect decisions comes very naturally to people.
Decision trees are a good tool to use when you want backing evidence to support a decision.
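A minimal sketch of that idea with scikit-learn (toy data, not from the scripts below): the classifier gives both a hard prediction and the per-class probabilities behind it.

from sklearn.tree import DecisionTreeClassifier
X = [[0, 0], [0, 1], [1, 0], [1, 1]]  # toy feature vectors
y = [0, 0, 1, 1]                      # toy labels
model = DecisionTreeClassifier().fit(X, y)
print(model.predict([[1, 0]]))        # hard label -> [1]
print(model.predict_proba([[1, 0]]))  # class probabilities -> [[0. 1.]]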
""" | |
Use decision trees to peruse The Mushroom Data Set, drawn from the Audobon | |
Society Field Guide to North American Mushrooms (1981). The data set details | |
mushrooms described in terms of many physical characteristics, such as cap size | |
and stalk length, along with a classification of poisonous or edible. | |
As a standard disclaimer, if you eat a random mushroom you find, you are doing | |
so at your own risk. | |
""" | |
import pandas as pd

# dataset is here:
# https://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.names
#
# : Load up the mushroom dataset into dataframe 'X'
# Header information is on the dataset's website at the UCI ML Repo
#
colNames = ['label', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
            'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color', 'stalk-shape',
            'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
            'stalk-color-above-ring', 'stalk-color-below-ring', 'veil-type',
            'veil-color', 'ring-number', 'ring-type', 'spore-print-color', 'population',
            'habitat']
X = pd.read_csv("Datasets/agaricus-lepiota.data", header=None, na_values='?',
                names=colNames)
#
# : Go ahead and drop any row with a nan
#
X.dropna(axis=0, inplace=True)
print(X.shape)

#
# : Copy the labels out of the dataset into variable 'y', then remove
# them from X. Encode the labels: poisonous -> 0, edible -> 1
y = X[X.columns[0]].copy()
X.drop(X.columns[0], axis=1, inplace=True)
y = y.map({'p': 0, 'e': 1})

#
# : Encode the entire dataset using dummies
#
X = pd.get_dummies(X)
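# INFO: get_dummies one-hot encodes each categorical column, so e.g. an
# 'odor' column with values 'a' and 'n' becomes indicator columns 'odor_a'
# and 'odor_n' (illustrative names; the actual columns depend on the data).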
#
# : Split data into test / train sets
#
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=7)

#
# : Create a DT classifier. No need to set any parameters
#
from sklearn import tree
model = tree.DecisionTreeClassifier()

#
# : train the classifier on the training data / labels:
#
model.fit(X_train, y_train)

# : score the classifier on the testing data / labels:
score = model.score(X_test, y_test)
print("High-Dimensionality Score: ", round((score * 100), 3))

# RESULT:
# top two features to consider when deciding if a mushroom is edible or not:
# Odor, and Gill Size
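# One way to verify those top features is the tree's feature_importances_
# attribute (a sketch, not part of the original script; assumes 'model' and
# the dummy-encoded 'X' from above):
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(5))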
#
# output a .DOT file
# .DOT files can be rendered to .PNGs with Graphviz (e.g. `brew install graphviz`,
# then `dot -Tpng tree.dot -o tree.png`). If you can't install it, paste the
# file's contents into http://webgraphviz.com/.
tree.export_graphviz(model, out_file='tree.dot', feature_names=X.columns)
""" | |
Revisite UCI's wheat-seeds dataset with decision trees, to benchmark how long | |
it takes to train and predict with decision trees relative to the speed of | |
KNeighbors and SVC, as well as compare the decision boundary plots produced by it. | |
""" | |
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time

#
# INFO: Parameters. You can adjust them.
iterations = 100

#
# INFO: Set this to False if you want to draw the full square matrix;
# the upper triangle mirrors the lower one, so by default it is skipped.
FAST_DRAW = True
def drawPlots(model, X_train, X_test, y_train, y_test, wintitle='Figure 1'):
    # INFO: A convenience function to break any higher-dimensional space down
    # and view cross sections of it.
    mpl.style.use('ggplot')  # Look Pretty

    padding = 3
    resolution = 0.5
    max_2d_score = 0
    score = 0

    y_colors = ['#ff0000', '#00ff00', '#0000ff']
    my_cmap = mpl.colors.ListedColormap(['#ffaaaa', '#aaffaa', '#aaaaff'])
    colors = [y_colors[i] for i in y_train]
    num_columns = len(X_train.columns)

    fig = plt.figure()
    fig.canvas.manager.set_window_title(wintitle)

    cnt = 0
    for col in range(num_columns):
        for row in range(num_columns):
            # Easy out: skip the upper triangle
            if FAST_DRAW and col > row:
                cnt += 1
                continue

            ax = plt.subplot(num_columns, num_columns, cnt + 1)
            plt.xticks(())
            plt.yticks(())

            # Intersection: label the diagonal with the feature name
            if col == row:
                plt.text(0.5, 0.5, X_train.columns[row], verticalalignment='center',
                         horizontalalignment='center', fontsize=12)
                cnt += 1
                continue

            # Only select two features to display, then train the model
            # (.ix was removed from pandas; .iloc is the positional indexer)
            X_train_bag = X_train.iloc[:, [row, col]]
            X_test_bag = X_test.iloc[:, [row, col]]
            model.fit(X_train_bag, y_train)

            # Create a mesh to plot in
            x_min, x_max = X_train_bag.iloc[:, 0].min() - padding, X_train_bag.iloc[:, 0].max() + padding
            y_min, y_max = X_train_bag.iloc[:, 1].min() - padding, X_train_bag.iloc[:, 1].max() + padding
            xx, yy = np.meshgrid(np.arange(x_min, x_max, resolution),
                                 np.arange(y_min, y_max, resolution))

            # Plot Boundaries
            plt.xlim(xx.min(), xx.max())
            plt.ylim(yy.min(), yy.max())

            # Prepare the contour: predict a class for every mesh point
            Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=my_cmap, alpha=0.8)
            plt.scatter(X_train_bag.iloc[:, 0], X_train_bag.iloc[:, 1], c=colors, alpha=0.5)

            score = round(model.score(X_test_bag, y_test) * 100, 3)
            plt.text(0.5, 0, "Score: {0}".format(score), transform=ax.transAxes,
                     horizontalalignment='center', fontsize=8)
            max_2d_score = max(max_2d_score, score)
            cnt += 1

    print("Max 2D Score: ", max_2d_score)
    fig.set_tight_layout(True)
def benchmark(model, X_train, X_test, y_train, y_test, wintitle='Figure 1'):
    print('\n\n' + wintitle + ' Results')

    # the only purpose of doing many iterations is to get a more accurate
    # count of the time it took for each classifier
    s = time.time()
    for i in range(iterations):
        #
        # : train the classifier on the training data / labels:
        #
        model.fit(X_train, y_train)
    print("{0} Iterations Training Time: ".format(iterations), time.time() - s)

    scoreBch = 0
    s = time.time()
    for i in range(iterations):
        #
        # : score the classifier on the testing data / labels:
        #
        scoreBch = model.score(X_test, y_test)
    print("{0} Iterations Scoring Time: ".format(iterations), time.time() - s)
    print("High-Dimensionality Score: ", round((scoreBch * 100), 3))
#
# : Load up the wheat dataset into dataframe 'df'
#
df = pd.read_csv("Datasets/wheat.data", index_col='id')

# INFO: An easy way to show which rows have nans in them
print(df[pd.isnull(df).any(axis=1)])

#
# : Go ahead and drop any row with a nan
#
df.dropna(axis=0, inplace=True)

#
# INFO: In the future, you might try setting the nan values to the mean
# of their column; since the labels are available, that mean should be
# computed per class rather than across all classes.
#
# : Copy the labels out of the dataframe into variable 'labels', then remove
# them from df. Encode the labels -- canadian:0, kama:1, and rosa:2
#
labels = df.wheat_type.copy()                  # copy 'y' values out
df.drop(['wheat_type'], axis=1, inplace=True)  # drop output column
labels = labels.map({'canadian': 0, 'kama': 1, 'rosa': 2})

#
# : Split data into test / train sets
#
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.3,
                                                    random_state=7)
#
# : Create a decision tree classifier
#
from sklearn import tree

"""
Reminder -- DecisionTreeClassifier's actual defaults include:
DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None,
                       min_samples_split=2, min_samples_leaf=1,
                       min_weight_fraction_leaf=0.0, max_features=None,
                       max_leaf_nodes=None, class_weight=None,
                       random_state=None)
Here, max_depth and random_state are overridden.
"""
model = tree.DecisionTreeClassifier(max_depth=6, random_state=2)
model.fit(X_train, y_train)

benchmark(model, X_train, X_test, y_train, y_test, 'Tree')
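# The docstring above mentions comparing against KNeighbors and SVC; the same
# harness can time them as well (a sketch, not part of the original script --
# both classifiers are left at their scikit-learn defaults):
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
benchmark(KNeighborsClassifier(), X_train, X_test, y_train, y_test, 'KNeighbors')
benchmark(SVC(), X_train, X_test, y_train, y_test, 'SVC')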
drawPlots(model, X_train, X_test, y_train, y_test, 'Tree')
plt.show()