isauravmanitripathi · April 15, 2019 19:35
diff --git a/basic.ipynb b/basic.ipynb
 From preamble import *
 %matplotlib inline
 Introduction
 Why Machine Learning?
 Problems Machine Learning Can Solve
 Knowing Your Task and Knowing Your Data
 Why Python?
 scikit-learn
 Installing scikit-learn
 Essential Libraries and Tools
 Jupyter Notebook
 NumPy
 In [2]:
 import numpy as np

 # Two rows by three columns, built from nested Python lists
 x = np.array([
     [1, 2, 3],
     [4, 5, 6],
 ])
 print("x:\n{}".format(x))
 x:
 [[1 2 3]
 [4 5 6]]
 SciPy
 In [3]:
 from scipy import sparse

 # Dense 4x4 identity matrix: ones on the main diagonal, zeros elsewhere
 eye = np.identity(4)
 print("NumPy array:\n", eye)
 NumPy array:
 [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]]
 In [4]:
 # Convert the dense NumPy array `eye` to a SciPy sparse matrix in CSR format.
 # Only the nonzero entries are stored, which is why the printed form lists
 # just the (row, column) positions that hold a value.
 sparse_matrix = sparse.csr_matrix(eye)
 print("\nSciPy sparse CSR matrix:\n", sparse_matrix)
 SciPy sparse CSR matrix:
   (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
 In [5]:
 # Build the same identity matrix directly in COO form: every stored value in
 # `data` is paired with its position (row_indices[k], col_indices[k]).
 row_indices = np.arange(4)
 col_indices = np.arange(4)
 data = np.ones(4)
 eye_coo = sparse.coo_matrix((data, (row_indices, col_indices)))
 print("COO representation:\n", eye_coo)
 COO representation:
   (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0
 matplotlib
 In [6]:
 %matplotlib inline
 import matplotlib.pyplot as plt

 # Generate 100 evenly spaced numbers from -10 to 10 (both endpoints included)
 x = np.linspace(-10, 10, 100)
 # Create a second array by taking the sine of every value in x
 y = np.sin(x)
 # The plot function makes a line chart of one array against another;
 # marker="x" additionally draws an "x" at each data point
 plt.plot(x, y, marker="x")
 Out[6]:
 [<matplotlib.lines.Line2D at 0x7f1d73e83ba8>]

 pandas
 In [7]:
 import pandas as pd

 # Small example table of people: one key per column, one list entry per row
 data = dict(
     Name=["John", "Anna", "Peter", "Linda"],
     Location=["New York", "Paris", "Berlin", "London"],
     Age=[24, 13, 53, 33],
 )

 # The dict keys become the column labels of the DataFrame
 data_pandas = pd.DataFrame(data)
 # display() renders the DataFrame as a formatted table in the notebook
 display(data_pandas)
 Name	Location	Age
 0	John	New York	24
 1	Anna	Paris	13
 2	Peter	Berlin	53
 3	Linda	London	33
 In [8]:
 # Boolean-mask selection: `data_pandas.Age > 30` is evaluated per row, and
 # only the rows where it is True are kept and displayed
 display(data_pandas[data_pandas.Age > 30])
 Name	Location	Age
 2	Peter	Berlin	53
 3	Linda	London	33
 mglearn
 Python 2 versus Python 3
 Versions Used in this Book
 In [9]:
 # Report the interpreter and library versions this notebook was run with
 import sys

 import IPython
 import matplotlib
 import numpy as np
 import pandas as pd
 import scipy as sp
 import sklearn

 print("Python version:", sys.version)
 print("pandas version:", pd.__version__)
 print("matplotlib version:", matplotlib.__version__)
 print("NumPy version:", np.__version__)
 print("SciPy version:", sp.__version__)
 print("IPython version:", IPython.__version__)
 print("scikit-learn version:", sklearn.__version__)
 Python version: 3.7.0 (default, Jun 28 2018, 13:15:42) 
 [GCC 7.2.0]
 pandas version: 0.23.4
 matplotlib version: 3.0.0
 NumPy version: 1.15.2
 SciPy version: 1.1.0
 IPython version: 6.4.0
 scikit-learn version: 0.21.dev0
 A First Application: Classifying Iris Species
 sepal_petal
 Meet the Data
 In [10]:
 # Load the iris dataset that ships with scikit-learn; the returned object is
 # kept in `iris_dataset` and inspected in the cells that follow
 from sklearn.datasets import load_iris
 iris_dataset = load_iris()
 In [11]:
 # iris_dataset is dictionary-like: list the keys it provides
 print("Keys of iris_dataset:\n", iris_dataset.keys())
 Keys of iris_dataset:
 dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
 In [12]:
 # DESCR holds a text description of the dataset; print only its first
 # 193 characters and mark the truncation with "..."
 print(iris_dataset['DESCR'][:193] + "\n...")
 .. _iris_dataset:

 Iris plants dataset
 --------------------

 **Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, pre
 ...
 In [13]:
 # Names of the classes (iris species) that the integer labels stand for
 print("Target names:", iris_dataset['target_names'])
 Target names: ['setosa' 'versicolor' 'virginica']
 In [14]:
 # Text description of each feature (one per column of the data array)
 print("Feature names:\n", iris_dataset['feature_names'])
 Feature names:
 ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
 In [15]:
 # Check which container type holds the feature values
 print("Type of data:", type(iris_dataset['data']))
 Type of data: <class 'numpy.ndarray'>
 In [16]:
 # Shape is (number of samples, number of features)
 print("Shape of data:", iris_dataset['data'].shape)
 Shape of data: (150, 4)
 In [17]:
 # Peek at the first five samples (rows) of the feature array
 print("First five rows of data:\n", iris_dataset['data'][:5])
 First five rows of data:
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
 In [18]:
 # Check which container type holds the class labels
 print("Type of target:", type(iris_dataset['target']))
 Type of target: <class 'numpy.ndarray'>
 In [19]:
 # The target is one-dimensional: a single label per sample
 print("Shape of target:", iris_dataset['target'].shape)
 Shape of target: (150,)
 In [20]:
 # Class labels are stored as integers; they are used elsewhere in this
 # notebook as indices into iris_dataset['target_names']
 print("Target:\n", iris_dataset['target'])
 Target:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
 Measuring Success: Training and Testing Data
 In [21]:
 from sklearn.model_selection import train_test_split

 # Split features and labels into a training part and a test part;
 # random_state=0 pins the random seed so the split is the same on every run
 X_train, X_test, y_train, y_test = train_test_split(
     iris_dataset['data'],
     iris_dataset['target'],
     random_state=0,
 )
 In [22]:
 # Sizes of the training portion produced by train_test_split
 print("X_train shape:", X_train.shape)
 print("y_train shape:", y_train.shape)
 X_train shape: (112, 4)
 y_train shape: (112,)
 In [23]:
 # Sizes of the held-out test portion produced by train_test_split
 print("X_test shape:", X_test.shape)
 print("y_test shape:", y_test.shape)
 X_test shape: (38, 4)
 y_test shape: (38,)
 First Things First: Look at Your Data
 In [24]:
 # Wrap the training features in a DataFrame whose columns are labelled with
 # the strings in iris_dataset.feature_names
 iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
 # Scatter matrix of the DataFrame, with points colored by their class
 # label in y_train; the histograms use 20 bins
 pd.plotting.scatter_matrix(
     iris_dataframe,
     c=y_train,
     figsize=(15, 15),
     marker='o',
     hist_kwds={'bins': 20},
     s=60,
     alpha=.8,
     cmap=mglearn.cm3,
 )
 Out[24]:
 array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c9a3ef0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c9520f0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c9794e0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c91f978>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c8c7e48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c8f8320>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c8a27f0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c84acf8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c84ad30>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c822710>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c7cec18>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c7fe160>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c7a5668>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c74db70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c77f0b8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c7265c0>]],
      dtype=object)

 Building Your First Model: k-Nearest Neighbors
 In [25]:
 # Create a k-nearest-neighbors classifier configured with n_neighbors=1
 from sklearn.neighbors import KNeighborsClassifier
 knn = KNeighborsClassifier(n_neighbors=1)
 In [26]:
 # Fit the classifier on the training set. The call is the last expression
 # in the cell, so its return value is what the notebook shows as output.
 knn.fit(X_train, y_train)
 Out[26]:
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform')
 Making Predictions
 In [27]:
 # One new sample with four feature values, stored as a single row of a
 # two-dimensional array (shape (1, 4))
 X_new = np.array([[5, 2.9, 1, 0.2]])
 print("X_new.shape:", X_new.shape)
 X_new.shape: (1, 4)
 In [28]:
 # Predict the class of the new sample, then translate the integer label
 # into a species name by indexing the target_names array with it
 prediction = knn.predict(X_new)
 print("Prediction:", prediction)
 print(
     "Predicted target name:",
     iris_dataset['target_names'][prediction],
 )
 Prediction: [0]
 Predicted target name: ['setosa']
 Evaluating the Model
 In [29]:
 # Predict a class label for every sample in the held-out test set
 y_pred = knn.predict(X_test)
 print("Test set predictions:\n", y_pred)
 Test set predictions:
 [2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
 2]
 In [30]:
 # Accuracy computed by hand: the fraction of test predictions that equal
 # the true labels (mean of the element-wise comparison)
 print("Test set score: {:.2f}".format(np.mean(y_pred == y_test)))
 Test set score: 0.97
 In [31]:
 # Same figure obtained from the classifier's own score method, which takes
 # the test features and the true labels directly
 print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))
 Test set score: 0.97
 Summary and Outlook
 In [32]:
 # Condensed recap of the whole workflow: split the data, fit the model,
 # report its score on the test set
 X_train, X_test, y_train, y_test = train_test_split(
     iris_dataset['data'],
     iris_dataset['target'],
     random_state=0,
 )

 # fit() returns the fitted estimator, so creation and training can be chained
 knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

 print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))
 Test set score: 0.97
	From preamble import *
	%matplotlib inline
	Introduction
	Why Machine Learning?
	Problems Machine Learning Can Solve
	Knowing Your Task and Knowing Your Data
	Why Python?
	scikit-learn
	Installing scikit-learn
	Essential Libraries and Tools
	Jupyter Notebook
	NumPy
	In [2]:
	import numpy as np

	x = np.array([[1, 2, 3], [4, 5, 6]])
	print("x:\n{}".format(x))
	x:
	[[1 2 3]
	[4 5 6]]
	SciPy
	In [3]:
	from scipy import sparse

	# Create a 2D NumPy array with a diagonal of ones, and zeros everywhere else
	eye = np.eye(4)
	print("NumPy array:\n", eye)
	NumPy array:
	[[1. 0. 0. 0.]
	[0. 1. 0. 0.]
	[0. 0. 1. 0.]
	[0. 0. 0. 1.]]
	In [4]:
	# Convert the NumPy array to a SciPy sparse matrix in CSR format
	# Only the nonzero entries are stored
	sparse_matrix = sparse.csr_matrix(eye)
	print("\nSciPy sparse CSR matrix:\n", sparse_matrix)
	SciPy sparse CSR matrix:
	(0, 0) 1.0
	(1, 1) 1.0
	(2, 2) 1.0
	(3, 3) 1.0
	In [5]:
	data = np.ones(4)
	row_indices = np.arange(4)
	col_indices = np.arange(4)
	eye_coo = sparse.coo_matrix((data, (row_indices, col_indices)))
	print("COO representation:\n", eye_coo)
	COO representation:
	(0, 0) 1.0
	(1, 1) 1.0
	(2, 2) 1.0
	(3, 3) 1.0
	matplotlib
	In [6]:
	%matplotlib inline
	import matplotlib.pyplot as plt

	# Generate 100 evenly spaced numbers from -10 to 10 (both endpoints included)
	x = np.linspace(-10, 10, 100)
	# Create a second array by taking the sine of every value in x
	y = np.sin(x)
	# The plot function makes a line chart of one array against another;
	# marker="x" additionally draws an "x" at each data point
	plt.plot(x, y, marker="x")
	Out[6]:
	[<matplotlib.lines.Line2D at 0x7f1d73e83ba8>]

	pandas
	In [7]:
	import pandas as pd

	# create a simple dataset of people
	data = {'Name': ["John", "Anna", "Peter", "Linda"],
	'Location' : ["New York", "Paris", "Berlin", "London"],
	'Age' : [24, 13, 53, 33]
	}

	data_pandas = pd.DataFrame(data)
	# IPython.display allows "pretty printing" of dataframes
	# in the Jupyter notebook
	display(data_pandas)
	Name Location Age
	0 John New York 24
	1 Anna Paris 13
	2 Peter Berlin 53
	3 Linda London 33
	In [8]:
	# Select all rows that have an age column greater than 30
	display(data_pandas[data_pandas.Age > 30])
	Name Location Age
	2 Peter Berlin 53
	3 Linda London 33
	mglearn
	Python 2 versus Python 3
	Versions Used in this Book
	In [9]:
	import sys
	print("Python version:", sys.version)

	import pandas as pd
	print("pandas version:", pd.__version__)

	import matplotlib
	print("matplotlib version:", matplotlib.__version__)

	import numpy as np
	print("NumPy version:", np.__version__)

	import scipy as sp
	print("SciPy version:", sp.__version__)

	import IPython
	print("IPython version:", IPython.__version__)

	import sklearn
	print("scikit-learn version:", sklearn.__version__)
	Python version: 3.7.0 (default, Jun 28 2018, 13:15:42)
	[GCC 7.2.0]
	pandas version: 0.23.4
	matplotlib version: 3.0.0
	NumPy version: 1.15.2
	SciPy version: 1.1.0
	IPython version: 6.4.0
	scikit-learn version: 0.21.dev0
	A First Application: Classifying Iris Species
	sepal_petal
	Meet the Data
	In [10]:
	from sklearn.datasets import load_iris
	iris_dataset = load_iris()
	In [11]:
	print("Keys of iris_dataset:\n", iris_dataset.keys())
	Keys of iris_dataset:
	dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
	In [12]:
	print(iris_dataset['DESCR'][:193] + "\n...")
	.. _iris_dataset:

	Iris plants dataset
	--------------------

	Data Set Characteristics:

	:Number of Instances: 150 (50 in each of three classes)
	:Number of Attributes: 4 numeric, pre
	...
	In [13]:
	print("Target names:", iris_dataset['target_names'])
	Target names: ['setosa' 'versicolor' 'virginica']
	In [14]:
	print("Feature names:\n", iris_dataset['feature_names'])
	Feature names:
	['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
	In [15]:
	print("Type of data:", type(iris_dataset['data']))
	Type of data: <class 'numpy.ndarray'>
	In [16]:
	print("Shape of data:", iris_dataset['data'].shape)
	Shape of data: (150, 4)
	In [17]:
	print("First five rows of data:\n", iris_dataset['data'][:5])
	First five rows of data:
	[[5.1 3.5 1.4 0.2]
	[4.9 3. 1.4 0.2]
	[4.7 3.2 1.3 0.2]
	[4.6 3.1 1.5 0.2]
	[5. 3.6 1.4 0.2]]
	In [18]:
	print("Type of target:", type(iris_dataset['target']))
	Type of target: <class 'numpy.ndarray'>
	In [19]:
	print("Shape of target:", iris_dataset['target'].shape)
	Shape of target: (150,)
	In [20]:
	print("Target:\n", iris_dataset['target'])
	Target:
	[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
	0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
	1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
	2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
	2 2]
	Measuring Success: Training and Testing Data
	In [21]:
	from sklearn.model_selection import train_test_split
	X_train, X_test, y_train, y_test = train_test_split(
	iris_dataset['data'], iris_dataset['target'], random_state=0)
	In [22]:
	print("X_train shape:", X_train.shape)
	print("y_train shape:", y_train.shape)
	X_train shape: (112, 4)
	y_train shape: (112,)
	In [23]:
	print("X_test shape:", X_test.shape)
	print("y_test shape:", y_test.shape)
	X_test shape: (38, 4)
	y_test shape: (38,)
	First Things First: Look at Your Data
	In [24]:
	# create dataframe from data in X_train
	# label the columns using the strings in iris_dataset.feature_names
	iris_dataframe = pd.DataFrame(X_train, columns=iris_dataset.feature_names)
	# create a scatter matrix from the dataframe, color by y_train
	pd.plotting.scatter_matrix(iris_dataframe, c=y_train, figsize=(15, 15),
	marker='o', hist_kwds={'bins': 20}, s=60,
	alpha=.8, cmap=mglearn.cm3)
	Out[24]:
	array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c9a3ef0>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c9520f0>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c9794e0>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c91f978>],
	[<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c8c7e48>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c8f8320>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c8a27f0>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c84acf8>],
	[<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c84ad30>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c822710>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c7cec18>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c7fe160>],
	[<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c7a5668>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c74db70>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c77f0b8>,
	<matplotlib.axes._subplots.AxesSubplot object at 0x7f1e5c7265c0>]],
	dtype=object)

	Building Your First Model: k-Nearest Neighbors
	In [25]:
	from sklearn.neighbors import KNeighborsClassifier
	knn = KNeighborsClassifier(n_neighbors=1)
	In [26]:
	knn.fit(X_train, y_train)
	Out[26]:
	KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
	metric_params=None, n_jobs=None, n_neighbors=1, p=2,
	weights='uniform')
	Making Predictions
	In [27]:
	X_new = np.array([[5, 2.9, 1, 0.2]])
	print("X_new.shape:", X_new.shape)
	X_new.shape: (1, 4)
	In [28]:
	prediction = knn.predict(X_new)
	print("Prediction:", prediction)
	print("Predicted target name:",
	iris_dataset['target_names'][prediction])
	Prediction: [0]
	Predicted target name: ['setosa']
	Evaluating the Model
	In [29]:
	y_pred = knn.predict(X_test)
	print("Test set predictions:\n", y_pred)
	Test set predictions:
	[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2 1 0
	2]
	In [30]:
	print("Test set score: {:.2f}".format(np.mean(y_pred == y_test)))
	Test set score: 0.97
	In [31]:
	print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))
	Test set score: 0.97
	Summary and Outlook
	In [32]:
	X_train, X_test, y_train, y_test = train_test_split(
	iris_dataset['data'], iris_dataset['target'], random_state=0)

	knn = KNeighborsClassifier(n_neighbors=1)
	knn.fit(X_train, y_train)

	print("Test set score: {:.2f}".format(knn.score(X_test, y_test)))
	Test set score: 0.97
No results found