Created
August 9, 2013 06:32
-
-
Save aaronsaunders/6191558 to your computer and use it in GitHub Desktop.
Example code for doing PCA manually
[email protected]. http://www.bimcore.emory.edu/bbseries/PCA.ppt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(lattice) | |
my.wines <- read.csv("http://www.bimcore.emory.edu/wine.csv", header=TRUE) | |
# Look at the correlations | |
library(gclus) | |
my.abs <- abs(cor(my.wines)) | |
my.colors <- dmat.color(my.abs) | |
my.ordered <- order.single(cor(my.wines)) | |
cpairs(my.wines, my.ordered, panel.colors=my.colors, gap=0.5) | |
# Do the PCA | |
my.prc <- prcomp(my.wines, center=TRUE, scale=TRUE) | |
screeplot(my.prc, main="Scree Plot", xlab="Components") | |
screeplot(my.prc, main="Scree Plot", type="line" ) | |
# DotPlot PC1 | |
load <- my.prc$rotation | |
sorted.loadings <- load[order(load[, 1]), 1] | |
myTitle <- "Loadings Plot for PC1" | |
myXlab <- "Variable Loadings" | |
dotplot(sorted.loadings, main=myTitle, xlab=myXlab, cex=1.5, col="red") | |
# DotPlot PC2 | |
sorted.loadings <- load[order(load[, 2]), 2] | |
myTitle <- "Loadings Plot for PC2" | |
myXlab <- "Variable Loadings" | |
dotplot(sorted.loadings, main=myTitle, xlab=myXlab, cex=1.5, col="red") | |
# Now draw the BiPlot | |
biplot(my.prc, cex=c(1, 0.7)) | |
# Apply the Varimax Rotation | |
my.var <- varimax(my.prc$rotation) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Example code for doing PCA manually | |
# [email protected]. | |
# | |
library(calibrate) | |
my.classes <- read.csv("http://www.bimcore.emory.edu/marks.dat") | |
plot(my.classes, cex=0.9, col="blue", | |
main="Plot of Physics Scores vs. Stat Scores") | |
options(digits=3) | |
par(mfrow=c(1, 1)) | |
# Scale the data | |
standardize <- function(x) {(x - mean(x))} | |
my.scaled.classes <- apply(my.classes, 2, function(x) (x-mean(x))) | |
plot(my.scaled.classes, cex=0.9, col="blue", | |
main="Plot of Physics Scores vs. Stat Scores", | |
sub="Mean Scaled", xlim=c(-30, 30)) | |
# Find Eigen values of covariance matrix | |
my.cov <- cov(my.scaled.classes) | |
my.eigen <- eigen(my.cov) | |
rownames(my.eigen$vectorsmy) <- c("Physics","Stats") | |
colnames(my.eigen$vectors) <- c("PC1","PC") | |
# Note that the sum of the eigen values equals the total variance of the data | |
sum(my.eigen$values) | |
var(my.scaled.classes[, 1]) + var(my.scaled.classes[, 2]) | |
# The Eigen vectors are the principal components. We see to what extent each variable contributes | |
loadings <- my.eigen$vectors | |
# Let's plot them | |
pc1.slope <- my.eigen$vectors[1, 1] / my.eigen$vectors[2, 1] | |
pc2.slope <- my.eigen$vectors[1, 2] / my.eigen$vectors[2, 2] | |
abline(0, pc1.slope, col="red") | |
abline(0, pc2.slope, col="green") | |
textxy( 12, 10, "(-0.710, -0.695)", cx=0.9, dcol="red") | |
textxy(-12, 10, "(0.695, -0.719)", cx=0.9, dcol="green") | |
# See how much variation each eigenvector accounts for | |
pc1.var <- 100 * round(my.eigen$values[1] / sum(my.eigen$values), digits=2) | |
pc2.var <- 100 * round(my.eigen$values[2] / sum(my.eigen$values), digits=2) | |
xlab <- paste("PC1 - ", pc1.var, " % of variation", sep="") | |
ylab <- paste("PC2 - ", pc2.var, " % of variation", sep="") | |
# Multiply the scaled data by the eigen vectors (principal components) | |
scores <- my.scaled.classes %*% loadings | |
sd <- sqrt(my.eigen$values) | |
rownames(loadings) <- colnames(my.classes) | |
plot(scores, ylim=c(-10, 10), | |
main="Data in terms of EigenVectors / PCs", | |
xlab=xlab, ylab=ylab) | |
abline(0, 0, col="red") | |
abline(0, 90, col="green") | |
# Correlation BiPlot | |
scores.min <- min(scores[, 1:2]) | |
scores.max <- max(scores[, 1:2]) | |
plot(scores[, 1]/sd[1], scores[, 2]/sd[2], type="n" | |
main="My First BiPlot", xlab=xlab, ylab=ylab, ) | |
rownames(scores) <- seq(1:nrow(scores)) | |
abline(0, 0, col="red") | |
abline(0, 90, col="green") | |
# This is to make the size of the lines more apparent | |
factor <- 5 | |
# First plot the variables as vectors | |
arrows(0, 0, loadings[, 1] * sd[1] / factor, | |
loadings[, 2] * sd[2] / factor, length=0.1, | |
lwd=2, angle=20, col="red") | |
text(loadings[,1]*sd[1]/factor*1.2,loadings[,2]*sd[2]/factor*1.2,rownames(loadings), col="red", cex=1.2) | |
# Second plot the scores as points | |
text(scores[, 1] / sd[1], scores[, 2] / sd[2], rownames(scores), | |
col="blue", cex=0.7) | |
somelabs <- paste(round(my.classes[, 1], digits=1), | |
round(my.classes[, 2], digits=1), sep=" , ") | |
#identify(scores[, 1] / sd[1], scores[, 2] / sd[2], labels=somelabs, cex=0.8) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment