Created
June 21, 2021 13:56
-
-
Save sjtalkar/e4f5448f9593e83a458f5d2164985073 to your computer and use it in GitHub Desktop.
Creating a Biplot (University of Michigan Masters Unsupervised Learning)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def biplot(score, coeff, maxdim, pcax, pcay, labels=None):
    """
    Draw a 2-D biplot of PCA-transformed data.

    score  - the transformed data returned by pca (the data expressed in the new axes)
    coeff  - the loadings taken from pca.components_
    maxdim - maximum number of original features to draw arrows for
    pcax   - 1-based index of the principal component on the x axis
    pcay   - 1-based index of the principal component on the y axis
    labels - optional feature names; when None, arrows are tagged "Var1", "Var2", ...

    For the features of interest, this plots the correlation between the
    original features and the selected principal components as arrows
    (cosine similarity / angles between axes), showing how the data relates
    to the ORIGINAL features in the positive and negative directions.
    """
    zoom = 0.5
    # Convert the 1-based PC numbers into 0-based column indices.
    x_idx = pcax - 1
    y_idx = pcay - 1
    xs = score[:, x_idx]
    ys = score[:, y_idx]
    n_arrows = min(coeff.shape[0], maxdim)

    # Rescale the point cloud so it spans the [-zoom, zoom] viewing window.
    window_width = 2.0 * zoom
    sx = window_width / (xs.max() - xs.min())
    sy = window_width / (ys.max() - ys.min())

    fig = plt.gcf()
    fig.set_size_inches(9, 9)
    plt.scatter(xs * sx, ys * sy, s=9)

    # Push each label slightly beyond its arrow tip so the two don't overlap.
    text_scale_factor = 1.3
    for i in range(n_arrows):
        dx = coeff[i, x_idx]
        dy = coeff[i, y_idx]
        plt.arrow(0, 0, dx, dy,
                  color='b', alpha=0.9, head_width=0.03 * zoom)
        tag = "Var" + str(i + 1) if labels is None else labels[i]
        plt.text(dx * text_scale_factor, dy * text_scale_factor,
                 tag, color='g', ha='center', va='center')

    plt.xlim(-zoom, zoom)
    plt.ylim(-zoom, zoom)
    plt.xlabel("PC{}".format(pcax))
    plt.ylabel("PC{}".format(pcay))
    plt.grid()
# Render the biplot for the first two principal components over the chosen
# subset of features, then report the variance captured by each component.
plt.figure()
feature_subset = slice(0, feature_subset_count, 1)
# Rows of components_ are PCs; transpose so rows become the original features.
loadings = np.transpose(pca.components_[0:2, feature_subset])
biplot(X_pca, loadings, feature_subset_count, 1, 2,
       labels=feature_names[feature_subset])

print("explained_variance_ratio:", pca.explained_variance_ratio_)
print("sum of explained variance ratios:", np.sum(pca.explained_variance_ratio_))
print("singular values:", pca.singular_values_)

# The variances of the PCs are given by the squares of the singular values of
# X*, divided by n-1, since they are the eigenvalues of the (n-1)S matrix,
# where S is the correlation matrix of X.
n_samples = scaled_features.shape[0]
print(pca.singular_values_ ** 2 / (n_samples - 1))
print(n_samples)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment