Created
June 21, 2021 13:56
-
-
Save sjtalkar/e4f5448f9593e83a458f5d2164985073 to your computer and use it in GitHub Desktop.
Creating a Biplot (University of Michigan Masters Unsupervised Learning)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def biplot(score, coeff, maxdim, pcax, pcay, labels=None):
    """
    Draw a 2-D biplot of PCA-transformed data.

    score  - the transformed data returned by pca (the data expressed in the new axes)
    coeff  - the loadings taken from pca.components_
    maxdim - maximum number of original features to draw arrows for
    pcax   - 1-based index of the principal component on the x axis
    pcay   - 1-based index of the principal component on the y axis
    labels - optional feature names; when None, arrows are tagged "Var1", "Var2", ...

    For the features of interest, this plots the correlation between the
    original features and the selected principal components as arrows
    (cosine similarity / angles between axes), showing how the data relates
    to the ORIGINAL features in the positive and negative directions.
    """
    zoom = 0.5
    # Convert the 1-based PC numbers into 0-based column indices.
    x_idx = pcax - 1
    y_idx = pcay - 1
    xs = score[:, x_idx]
    ys = score[:, y_idx]
    n_arrows = min(coeff.shape[0], maxdim)

    # Rescale the point cloud so it spans the [-zoom, zoom] viewing window.
    window_width = 2.0 * zoom
    sx = window_width / (xs.max() - xs.min())
    sy = window_width / (ys.max() - ys.min())

    fig = plt.gcf()
    fig.set_size_inches(9, 9)
    plt.scatter(xs * sx, ys * sy, s=9)

    # Push each label slightly beyond its arrow tip so the two don't overlap.
    text_scale_factor = 1.3
    for i in range(n_arrows):
        dx = coeff[i, x_idx]
        dy = coeff[i, y_idx]
        plt.arrow(0, 0, dx, dy,
                  color='b', alpha=0.9, head_width=0.03 * zoom)
        tag = "Var" + str(i + 1) if labels is None else labels[i]
        plt.text(dx * text_scale_factor, dy * text_scale_factor,
                 tag, color='g', ha='center', va='center')

    plt.xlim(-zoom, zoom)
    plt.ylim(-zoom, zoom)
    plt.xlabel("PC{}".format(pcax))
    plt.ylabel("PC{}".format(pcay))
    plt.grid()
# Render the biplot for the first two principal components over the chosen
# subset of features, then report the variance captured by each component.
plt.figure()
feature_subset = slice(0, feature_subset_count, 1)
# Rows of components_ are PCs; transpose so rows become the original features.
loadings = np.transpose(pca.components_[0:2, feature_subset])
biplot(X_pca, loadings, feature_subset_count, 1, 2,
       labels=feature_names[feature_subset])

print("explained_variance_ratio:", pca.explained_variance_ratio_)
print("sum of explained variance ratios:", np.sum(pca.explained_variance_ratio_))
print("singular values:", pca.singular_values_)

# The variances of the PCs are given by the squares of the singular values of
# X*, divided by n-1, since they are the eigenvalues of the (n-1)S matrix,
# where S is the correlation matrix of X.
n_samples = scaled_features.shape[0]
print(pca.singular_values_ ** 2 / (n_samples - 1))
print(n_samples)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment