- PCA_armadillo: From 3D rendering to 2D plot
- PCA_kidney: Reduce the dense kidney clinical-study feature set to its two main components
PCA - Principal Component Analysis
Principal Component Analysis
PCA_armadillo.py
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import datetime
from plyfile import PlyData
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- needed for projection='3d' on older matplotlib versions
from sklearn.decomposition import PCA
""" | |
The goal is to reduce the dimensionality of a 3D scan from three to two using PCA to cast a shadow of the data onto its two most | |
important principal components. Then render the resulting 2D scatter plot. | |
The scan is a real life armadillo sculpture scanned using a Cyberware 3030 MS 3D scanner at Stanford University. | |
The sculpture is available as part of their 3D Scanning Repository, and is a very dense 3D mesh consisting of 172974 vertices! | |
The PLY file is available at https://graphics.stanford.edu/data/3Dscanrep/ | |
""" | |
# Keep only 1 out of every 100 data samples. If things run too
# slowly, try increasing this number. If things run too fast,
# try decreasing it... =)
reduce_factor = 100

# Look pretty...
matplotlib.style.use('ggplot')

# Load up the scanned armadillo. The PLY axes are re-mapped
# (z -> x, x -> y, y -> z), presumably so the model displays upright.
plyfile = PlyData.read('Datasets/stanford_armadillo.ply')
armadillo = pd.DataFrame({
    'x': plyfile['vertex']['z'][::reduce_factor],
    'y': plyfile['vertex']['x'][::reduce_factor],
    'z': plyfile['vertex']['y'][::reduce_factor]
})
def do_PCA(armadillo):
    #
    # Train a PCA on the armadillo dataframe, then drop one dimension
    # (reduce it down to 2D) and project the armadillo into the 2D
    # principal component feature space.
    #
    pca = PCA(n_components=2)
    pca.fit(armadillo)
    reducedArmadillo = pca.transform(armadillo)
    return reducedArmadillo
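# Optional sanity check (an addition, not part of the original gist): a fitted
# sklearn PCA exposes explained_variance_ratio_, which reports the share of the
# total variance each retained component captures. A minimal sketch:
_pca_check = PCA(n_components=2).fit(armadillo)
print('Variance captured by the 2 components:',
      _pca_check.explained_variance_ratio_.sum())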
def do_RandomizedPCA(armadillo):
    #
    # Train a randomized-solver PCA on the armadillo dataframe, then
    # drop one dimension (reduce it down to 2D) and project the
    # armadillo into the 2D principal component feature space.
    #
    # NOTE: SKLearn deprecated the standalone RandomizedPCA class, but the
    # randomized (truncated) SVD method is still available through the
    # svd_solver parameter of PCA. For details, check out the full docs:
    # http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
    #
    pca = PCA(n_components=2, svd_solver='randomized')
    pca.fit(armadillo)
    reducedArmadillo = pca.transform(armadillo)
    return reducedArmadillo
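# Optional comparison (an addition, not part of the original gist): the full and
# randomized SVD solvers should produce essentially the same projection, though
# individual components may come back with their signs flipped. A rough sketch:
import numpy as np
full_proj = do_PCA(armadillo)
rand_proj = do_RandomizedPCA(armadillo)
print('Projections agree up to sign (loose tolerance):',
      np.allclose(np.abs(full_proj), np.abs(rand_proj), atol=1e-3))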
# Render the original armadillo
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.set_title('Armadillo 3D')
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
ax.scatter(armadillo.x, armadillo.y, armadillo.z, c='green', marker='.', alpha=0.75)
# Time the execution of PCA 5000x.
# PCA is run 5000 times in order to reduce the chance that rogue
# processes skew the measured execution speed.
t1 = datetime.datetime.now()
for i in range(5000): pca = do_PCA(armadillo)
time_delta = datetime.datetime.now() - t1

# Render the newly transformed PCA armadillo!
if pca is not None:
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title('PCA, build time: ' + str(time_delta))
    ax.scatter(pca[:, 0], pca[:, 1], c='blue', marker='.', alpha=0.75)
# Time the execution of RandomizedPCA 5000x
t1 = datetime.datetime.now()
for i in range(5000): rpca = do_RandomizedPCA(armadillo)
time_delta = datetime.datetime.now() - t1

# Render the newly transformed RandomizedPCA armadillo!
if rpca is not None:
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_title('RandomizedPCA, build time: ' + str(time_delta))
    ax.scatter(rpca[:, 0], rpca[:, 1], c='red', marker='.', alpha=0.75)

plt.show()
PCA_kidney.py
import pandas as pd
import matplotlib.pyplot as plt
import PCA_kidney_helper as helper
from sklearn.decomposition import PCA
"""
The UCI Chronic Kidney Disease dataset is a collection of samples taken from patients in India
over a two-month period, some of whom were in the early stages of the disease.
Available here: https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease
Reduce the dataset to two principal components by running it through PCA.
"""
# Look pretty...
plt.style.use('ggplot')

scaleFeatures = True

# Load up the dataset and remove any and all
# rows that have a NaN.
#
df = pd.read_csv('Datasets/kidney_disease.csv', index_col='id')
df.dropna(inplace=True)
# Create some color-coded labels; the actual label feature
# will be removed prior to executing PCA, since it's unsupervised.
# You're only labeling by color so you can see the effects of PCA.
labelsCol = ['red' if i == 'ckd' else 'green' for i in df.classification]

# Get rid of the label column, then one-hot encode the nominal columns
df.drop(labels=['classification'], axis=1, inplace=True)
df = pd.get_dummies(df, columns=['rbc', 'pc', 'pcc', 'ba', 'htn',
                                 'dm', 'cad', 'appet', 'pe', 'ane'])
print(df.dtypes)
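# A quick way to see what get_dummies produced (an addition, not in the original
# code): list the newly created one-hot columns, assuming none of the original
# column names contain an underscore. The exact names depend on the values
# present in each nominal column -- likely something like 'htn_yes' / 'htn_no'.
print([col for col in df.columns if '_' in col])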
""" | |
After adding in all the other features and properly encoding those that | |
need to be, you should notice that your separation between CKD and Non-CKD | |
patients increases significantly: by taking many secondary features and combining them, machine | |
learning is usually able to come up with more informative descriptions of | |
your data. | |
""" | |
# Print out and check your dataframe's dtypes.
#
# You can either take a look at the dataset webpage in the attribute info
# section: https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease
# or you can peek through the dataframe by printing a few rows.
# The wc, rc and pcv columns come in as object (string) dtype even though
# they hold numeric values, so convert them explicitly:
df.wc = pd.to_numeric(df.wc, errors='coerce')
df.rc = pd.to_numeric(df.rc, errors='coerce')
df.pcv = pd.to_numeric(df.pcv, errors='coerce')
print(df.dtypes)
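# Note (an addition, not part of the original code): errors='coerce' turns any
# value that cannot be parsed as a number into NaN, and sklearn's PCA will fail
# on NaNs. Check whether the conversion introduced any; if it did, the earlier
# dropna / label-building steps would need revisiting so df and labelsCol stay
# in sync.
print('NaNs introduced by to_numeric:', df.isnull().sum().sum())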
# PCA operates based on variance. The variable with the greatest
# variance will dominate. Peek into your data using a
# command that will check the variance of every feature in your dataset,
# and print out the results. Also print out the results of running .describe
# on your dataset.
#
print(df.var())
print(df.describe())

if scaleFeatures: df = helper.scaleFeatures(df)
# Run PCA on your dataset and reduce it to 2 components
#
pca = PCA(n_components=2)
pca.fit(df)
T = pca.transform(df)
# Plot the transformed data as a scatter plot. Recall that transforming
# the data will result in a NumPy NDArray. You can either use MatPlotLib
# to graph it directly, or you can convert it to a DataFrame and have pandas
# do it for you.
#
# Since we transformed via PCA, we no longer have column names. We know we
# are in P.C. space, so we'll just define the coordinates accordingly:
ax = helper.drawVectors(T, pca.components_, df.columns.values, plt, scaleFeatures)
T = pd.DataFrame(T)
T.columns = ['component1', 'component2']
T.plot.scatter(x='component1', y='component2', marker='o', c=labelsCol, alpha=0.75, ax=ax)
plt.show()
PCA_kidney_helper.py
import math
import pandas as pd
from sklearn import preprocessing
# A note on SKLearn .transform() calls:
#
# Any time you transform your data, you lose the column header names.
# This actually makes complete sense. There are essentially two types
# of transformations: those that change the scale of your features,
# and those that change your features entirely. Changing the scale would
# be like changing centimeters to inches. Changing the features would
# be like using PCA to reduce 300 columns to 30. In either case, the
# original columns' units have been altered or no longer exist, so it's
# up to you to rename your columns after ANY transformation. Due to
# this, SKLearn returns an NDArray from *transform() calls.
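# For example (an illustrative sketch only; 'some_df' is a hypothetical
# DataFrame, not one defined in this project): a scale-only transform keeps the
# same number of columns, so you can re-wrap the returned NDArray and restore
# the original names yourself:
#
#   arr = preprocessing.StandardScaler().fit_transform(some_df)
#   some_df_scaled = pd.DataFrame(arr, columns=some_df.columns)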
def scaleFeatures(df):
    # SKLearn has many different methods for transforming your
    # features by scaling them (this is a type of pre-processing):
    # RobustScaler, Normalizer, MinMaxScaler, MaxAbsScaler, StandardScaler...
    # http://scikit-learn.org/stable/modules/classes.html#module-sklearn.preprocessing
    #
    # However, in order for PCA to be effective, there are a few requirements
    # that must be met, and they drive the selection of your scaler.
    # PCA requires your data to be standardized -- in other words, each
    # feature's mean is 0 and its variance is ~1.
    #
    # SKLearn's Normalizer rescales each sample (row) to unit norm rather than
    # standardizing each feature, so it's inappropriate to use here (depending
    # on your data). MinMaxScaler and MaxAbsScaler both fail to set a unit
    # variance, so you won't be using them either. RobustScaler can work,
    # again depending on your data (watch for outliers). For these reasons
    # we're going to use the StandardScaler. Get familiar with it by visiting
    # these two pages:
    #
    # http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-scaler
    #
    # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler
    #
    # ---------
    # Feature scaling is the type of transformation that only changes the
    # scale and not the number of features, so we'll keep the original dataset
    # column names. However, we'll keep in mind that the _units_ have been
    # altered:
    scaled = preprocessing.StandardScaler().fit_transform(df)
    scaled = pd.DataFrame(scaled, columns=df.columns)
    print("New Variances:\n", scaled.var())
    print("New Describe:\n", scaled.describe())
    return scaled
def drawVectors(transformed_features, components_, columns, plt, scaled):
    if not scaled:
        return plt.axes()  # No cheating ;-)

    num_columns = len(columns)

    # This function projects your *original* features (columns)
    # onto your principal component feature-space, so that you can
    # visualize how "important" each one was in the
    # multi-dimensional scaling.

    # Scale the principal components by the max value in
    # the transformed set belonging to that component
    xvector = components_[0] * max(transformed_features[:, 0])
    yvector = components_[1] * max(transformed_features[:, 1])

    ## Visualize projections

    # Sort each column by its length. These are your *original*
    # columns, not the principal components.
    important_features = {columns[i]: math.sqrt(xvector[i]**2 + yvector[i]**2) for i in range(num_columns)}
    important_features = sorted(zip(important_features.values(), important_features.keys()), reverse=True)
    print("Features by importance:\n", important_features)

    ax = plt.axes()
    for i in range(num_columns):
        # Use an arrow to project each original feature as a
        # labeled vector onto your principal component axes
        plt.arrow(0, 0, xvector[i], yvector[i], color='b', width=0.0005, head_width=0.02, alpha=0.75)
        plt.text(xvector[i] * 1.2, yvector[i] * 1.2, list(columns)[i], color='b', alpha=0.75)

    return ax