Skip to content

Instantly share code, notes, and snippets.

@aaronsaunders
Created August 9, 2013 06:32
Show Gist options
  • Save aaronsaunders/6191558 to your computer and use it in GitHub Desktop.
Save aaronsaunders/6191558 to your computer and use it in GitHub Desktop.
library(lattice)
my.wines <- read.csv("http://www.bimcore.emory.edu/wine.csv", header=TRUE)
# Look at the correlations
library(gclus)
my.abs <- abs(cor(my.wines))
my.colors <- dmat.color(my.abs)
my.ordered <- order.single(cor(my.wines))
cpairs(my.wines, my.ordered, panel.colors=my.colors, gap=0.5)
# Do the PCA
my.prc <- prcomp(my.wines, center=TRUE, scale=TRUE)
screeplot(my.prc, main="Scree Plot", xlab="Components")
screeplot(my.prc, main="Scree Plot", type="line" )
# DotPlot PC1
load <- my.prc$rotation
sorted.loadings <- load[order(load[, 1]), 1]
myTitle <- "Loadings Plot for PC1"
myXlab <- "Variable Loadings"
dotplot(sorted.loadings, main=myTitle, xlab=myXlab, cex=1.5, col="red")
# DotPlot PC2
sorted.loadings <- load[order(load[, 2]), 2]
myTitle <- "Loadings Plot for PC2"
myXlab <- "Variable Loadings"
dotplot(sorted.loadings, main=myTitle, xlab=myXlab, cex=1.5, col="red")
# Now draw the BiPlot
biplot(my.prc, cex=c(1, 0.7))
# Apply the Varimax Rotation
my.var <- varimax(my.prc$rotation)
# Example code for doing PCA manually
# [email protected].
#
library(calibrate)
my.classes <- read.csv("http://www.bimcore.emory.edu/marks.dat")
plot(my.classes, cex=0.9, col="blue",
main="Plot of Physics Scores vs. Stat Scores")
options(digits=3)
par(mfrow=c(1, 1))
# Scale the data
standardize <- function(x) {(x - mean(x))}
my.scaled.classes <- apply(my.classes, 2, function(x) (x-mean(x)))
plot(my.scaled.classes, cex=0.9, col="blue",
main="Plot of Physics Scores vs. Stat Scores",
sub="Mean Scaled", xlim=c(-30, 30))
# Find Eigen values of covariance matrix
my.cov <- cov(my.scaled.classes)
my.eigen <- eigen(my.cov)
rownames(my.eigen$vectorsmy) <- c("Physics","Stats")
colnames(my.eigen$vectors) <- c("PC1","PC")
# Note that the sum of the eigen values equals the total variance of the data
sum(my.eigen$values)
var(my.scaled.classes[, 1]) + var(my.scaled.classes[, 2])
# The Eigen vectors are the principal components. We see to what extent each variable contributes
loadings <- my.eigen$vectors
# Let's plot them
pc1.slope <- my.eigen$vectors[1, 1] / my.eigen$vectors[2, 1]
pc2.slope <- my.eigen$vectors[1, 2] / my.eigen$vectors[2, 2]
abline(0, pc1.slope, col="red")
abline(0, pc2.slope, col="green")
textxy( 12, 10, "(-0.710, -0.695)", cx=0.9, dcol="red")
textxy(-12, 10, "(0.695, -0.719)", cx=0.9, dcol="green")
# See how much variation each eigenvector accounts for
pc1.var <- 100 * round(my.eigen$values[1] / sum(my.eigen$values), digits=2)
pc2.var <- 100 * round(my.eigen$values[2] / sum(my.eigen$values), digits=2)
xlab <- paste("PC1 - ", pc1.var, " % of variation", sep="")
ylab <- paste("PC2 - ", pc2.var, " % of variation", sep="")
# Multiply the scaled data by the eigen vectors (principal components)
scores <- my.scaled.classes %*% loadings
sd <- sqrt(my.eigen$values)
rownames(loadings) <- colnames(my.classes)
plot(scores, ylim=c(-10, 10),
main="Data in terms of EigenVectors / PCs",
xlab=xlab, ylab=ylab)
abline(0, 0, col="red")
abline(0, 90, col="green")
# Correlation BiPlot
scores.min <- min(scores[, 1:2])
scores.max <- max(scores[, 1:2])
plot(scores[, 1]/sd[1], scores[, 2]/sd[2], type="n"
main="My First BiPlot", xlab=xlab, ylab=ylab, )
rownames(scores) <- seq(1:nrow(scores))
abline(0, 0, col="red")
abline(0, 90, col="green")
# This is to make the size of the lines more apparent
factor <- 5
# First plot the variables as vectors
arrows(0, 0, loadings[, 1] * sd[1] / factor,
loadings[, 2] * sd[2] / factor, length=0.1,
lwd=2, angle=20, col="red")
text(loadings[,1]*sd[1]/factor*1.2,loadings[,2]*sd[2]/factor*1.2,rownames(loadings), col="red", cex=1.2)
# Second plot the scores as points
text(scores[, 1] / sd[1], scores[, 2] / sd[2], rownames(scores),
col="blue", cex=0.7)
somelabs <- paste(round(my.classes[, 1], digits=1),
round(my.classes[, 2], digits=1), sep=" , ")
#identify(scores[, 1] / sd[1], scores[, 2] / sd[2], labels=somelabs, cex=0.8)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment