#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 21 08:29:57 2017
@author: tp53
"""
import pandas as pd
import numpy as np

# read data and data wrangling
# a row for the response was added to the csv
# metf = metformin, n = normalized, r = raw data from csv,
# before doing transpose (T) and other manipulations
metf_n_r_data = pd.read_csv("./data/metf_n_resp.csv")
thiog_n_data = pd.read_csv("./data/THIOG_normalized.csv")
res_csv = pd.read_csv('./data/response.csv')
res_data = res_csv['METFORMIN']
# build the target
target = res_data
# transpose metf_n_r_data: features in columns, samples (patient data) in rows
metf_n_r_t_data = metf_n_r_data.T
# drop the target column and keep the rest of the sample data, e.g.
# predictors = metf_n_data.drop(['response'], axis=1).to_numpy()
# work on a copy of metf_n_r_t_data (shorter name, safe for experiments)
df0 = metf_n_r_t_data.copy(deep=True)
df_no_r = df0.drop(df0.columns[[0]], axis=1)
# as_matrix() is deprecated; to_numpy() is the current equivalent
predictors = df_no_r.to_numpy()
# the first row of predictors appears to hold labels rather than numeric
# values, so drop it before doing any numeric work
preds_wo_labels = predictors[1:]
# np.matrix is deprecated; a plain float ndarray does the same job
preds_wo_labels = np.asarray(preds_wo_labels, dtype=float)
'''
preds_wo_labels.head() won't work: preds_wo_labels is a numpy.ndarray,
not a DataFrame, so wrap it in a DataFrame for convenience
'''
preds_wo_labels_df = pd.DataFrame(data=preds_wo_labels)
preds_wo_labels
# covariance between samples: np.cov treats each row as a variable,
# so this is an n_samples x n_samples matrix
cov_matrix = np.cov(preds_wo_labels)
'''
let's calculate eigenvalues and eigenvectors of the covariance matrix
'''
eig_vals, eig_vecs = np.linalg.eig(cov_matrix)
eig_vals
#eig_vecs
preds_wo_labels[0]
target
# pair each eigenvalue with its eigenvector
eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
# np.linalg.eig does not guarantee ordering, so sort by decreasing eigenvalue
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
eig_pairs[0][0]
# correlate the leading eigenvector with the response
a = eig_pairs[0][1]
resp_np = np.array(target)
np.corrcoef(a, resp_np)
# mean and standard deviation of the first feature column
c1 = preds_wo_labels[:, 0].T
c0 = np.mean(preds_wo_labels[:, 0].T)
c0_std = np.std(c1)
preds_wo_labels_df.head()
# visually confirm that the list is sorted by decreasing eigenvalues
print('Eigenvalues in descending order:')
for i in eig_pairs:
    print(i)
'''
http://blog.districtdatalabs.com/principal-component-analysis-with-python
To get a better idea of how the principal components describe the variance
in the data, we will look at the explained variance ratio of the first
two principal components.
'''
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca
# fit on the unstandardized feature frame and report the variance explained
pca.fit_transform(preds_wo_labels_df)
print(pca.explained_variance_ratio_)
# explained variance of all components on standardized data
X_std = StandardScaler().fit_transform(preds_wo_labels_df)
pca = PCA().fit(X_std)
import matplotlib.pyplot as plt
# scree plot: cumulative explained variance vs. number of components
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()
pca = PCA(n_components=21)
pca.fit_transform(preds_wo_labels_df)
print(pca.explained_variance_ratio_)
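'''
The choice of 21 components above is not explained in the gist; one hedged
way to pick the count programmatically is to keep enough components to
reach a target fraction of cumulative explained variance (the 0.90
threshold below is an assumption, not from the original script).
'''
cum_var = np.cumsum(PCA().fit(X_std).explained_variance_ratio_)
n_keep = int(np.searchsorted(cum_var, 0.90) + 1)
print('components needed for 90% cumulative variance:', n_keep)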
# let's do t-SNE (TSNE, StandardScaler, and PCA are already imported above)
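'''
TSNE is imported above but never actually applied in this gist. The sketch
below is one hedged example of how the standardized matrix X_std could be
embedded in 2-D; the perplexity value is an assumption (it must stay below
the number of samples, which is small here) and is not from the original.
'''
tsne = TSNE(n_components=2, perplexity=5, random_state=0)
X_tsne = tsne.fit_transform(X_std)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=resp_np)
plt.title('t-SNE embedding colored by METFORMIN response')
plt.show()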
'''
Summarizing the PCA approach
Listed below are the 6 general steps for performing a principal component
analysis (a hedged sketch implementing them on this dataset follows the list):
1. Take the whole dataset consisting of d-dimensional samples, ignoring the class labels
2. Compute the d-dimensional mean vector (i.e., the means for every dimension of the whole dataset)
3. Compute the scatter matrix (alternatively, the covariance matrix) of the whole data set
4. Compute the eigenvectors (e_1, e_2, ..., e_d) and corresponding eigenvalues (lambda_1, lambda_2, ..., lambda_d)
5. Sort the eigenvectors by decreasing eigenvalues and choose the k eigenvectors with the largest eigenvalues to form a d x k dimensional matrix W (where every column represents an eigenvector)
6. Use this d x k eigenvector matrix to transform the samples onto the new subspace. This can be summarized by the equation y = W^T x (where x is a d x 1 vector representing one sample, and y is the transformed k x 1 sample in the new subspace)
'''
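'''
A minimal sketch of the six steps above, applied to the float matrix
preds_wo_labels (samples in rows, features in columns). Two assumptions:
the feature-space covariance (rowvar=False) is used here, unlike the
sample-space covariance computed earlier, and k = 2 is an illustrative
choice. With many features this direct eigendecomposition is expensive;
sklearn's PCA above is the practical route.
'''
X = preds_wo_labels                    # step 1: d-dimensional samples, labels ignored
mean_vec = X.mean(axis=0)              # step 2: d-dimensional mean vector
cov = np.cov(X, rowvar=False)          # step 3: covariance matrix of the features
vals, vecs = np.linalg.eig(cov)        # step 4: eigenvalues and eigenvectors
order = np.argsort(vals)[::-1]         # step 5: sort by decreasing eigenvalue
W = vecs[:, order[:2]]                 #         and keep the top k = 2 eigenvectors
Y = (X - mean_vec) @ W                 # step 6: project the samples, y = W^T x
Y.shape                                # (n_samples, 2)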
'''
let's plot the data
'''
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.mplot3d import proj3d
from matplotlib.patches import FancyArrowPatch

class Arrow3D(FancyArrowPatch):
    def __init__(self, xs, ys, zs, *args, **kwargs):
        FancyArrowPatch.__init__(self, (0, 0), (0, 0), *args, **kwargs)
        self._verts3d = xs, ys, zs

    def draw(self, renderer):
        xs3d, ys3d, zs3d = self._verts3d
        xs, ys, zs = proj3d.proj_transform(xs3d, ys3d, zs3d, renderer.M)
        self.set_positions((xs[0], ys[0]), (xs[1], ys[1]))
        FancyArrowPatch.draw(self, renderer)

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111, projection='3d')
plt.rcParams['legend.fontsize'] = 10
'''
separate the samples into class1 (response = 0) and class2 (response = 1);
the original gist never defined class1_sample/class2_sample, so they are
built here from a 3-component PCA projection, split on the 0/1 response
'''
proj3 = PCA(n_components=3).fit_transform(X_std)
class1_sample = proj3[resp_np == 0].T
class2_sample = proj3[resp_np == 1].T
ax.plot(class1_sample[0, :], class1_sample[1, :], class1_sample[2, :], 'o', markersize=8, color='blue', alpha=0.5, label='class1')
ax.plot(class2_sample[0, :], class2_sample[1, :], class2_sample[2, :], '^', markersize=8, alpha=0.5, color='red', label='class2')
plt.title('Samples for class 1 and class 2')
ax.legend(loc='upper right')
plt.show()