Created
March 15, 2013 04:19
-
-
Save jmdeldin/5167453 to your computer and use it in GitHub Desktop.
Getting familiar with LDA in R. See http://www.cs.umt.edu/~dougr/PattRec_files/PattRecProjLDA.htm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Jon-Michael Deldin | |
## Pattern Recognition, SP 2013 | |
## LDA Project | |
## | |
## Usage: | |
## Rscript lda.R | |
## | |
## clear existing vars | |
rm(list = ls(all=TRUE)) | |
## load the LDA library | |
library(MASS) | |
## create the directory to download the data (like `mkdir -p` on *nix) | |
mkdirp <- function(directory) { | |
dir.create(directory, recursive=TRUE, showWarnings=FALSE) | |
} | |
## download a file if it doesn't exist | |
maybeDownload <- function(filename, url) { | |
if (!file.exists(filename)) { | |
download.file(url, filename) | |
} | |
} | |
## runs lda() and returns an LDA object | |
getLda <- function(data, train, classes, tolerance) { | |
return(lda(data[train,], classes[train], tol=tolerance)) | |
} | |
## returns both projections | |
projectLda <- function(data, ldaObj) { | |
mat <- as.matrix(data) | |
first <- mat %*% ldaObj$scaling[,1] | |
second <- mat %*% ldaObj$scaling[,2] | |
return(list(first=first, second=second)) | |
} | |
## run LDA | |
runLda <- function(data, train, classes, colorVector, pchSize, cexSize, mytol=NULL) { | |
if (is.null(mytol)) { | |
mytol <- 1e-04 | |
} | |
z <- getLda(data, train, classes, mytol) | |
## calculate the accuracy | |
actual <- classes[-train] | |
predicted <- predict(z, data[-train,])$class | |
failures <- length(which(actual != predicted)) | |
n <- length(actual) | |
accuracy <- (n - failures)/n | |
print(paste("ACCURACY = ", accuracy*100, "%", sep='')) | |
## plot the projections | |
projections <- projectLda(data, z) | |
p1 <- projections$first | |
p2 <- projections$second | |
plot(p1, p2, | |
cex=cexSize, | |
col=colorVector, | |
xlab="Linear Discriminant 1", | |
ylab="Linear Discriminant 2", | |
pch=pchSize) | |
return(c(z, accuracy)) | |
} | |
## run MDS | |
runMds <- function(data, colorVector, pchSize, cexSize) { | |
points <- cmdscale(dist(as.matrix(data))) | |
plot(points, | |
col=colorVector, | |
xlab="Eigen 1", | |
ylab="Eigen 2", | |
pch=pchSize, | |
cex=cexSize) | |
return(points) | |
} | |
## run PCA | |
runPca <- function(data, colorVector, pchSize, cexSize) { | |
prData <- prcomp(data) | |
x <- prData$x[,1] | |
y <- prData$x[,2] | |
plot(x, y, | |
col=colorVector, | |
xlab='Principal Component 1', | |
ylab='Principal Component 2', | |
cex=cexSize, | |
pch=pchSize) | |
return(prData) | |
} | |
## make a triptych of LDA, MDA, and PCA | |
makeTriptych <- function(data, classes, colors, filename, train=NULL, tol=NULL) { | |
if (is.null(train)) { | |
train <- sample(1:150, 125) | |
} | |
pchSize <- 0 | |
cexSize <- 0.4 | |
linePos <- 0.50 | |
pdf(paste('fig', filename, sep='/'), width=6.6, height=2.2) | |
par(mar=c(2.5, 2.5, 2.5, .1), mgp=c(1.5, .5, 0)) | |
layout(matrix(1:3, 1, 3), widths=c(1, 1, 1)) | |
lda <- runLda(data, train, classes, colors, pchSize, cexSize, mytol=tol) | |
mtext("(A)", line=linePos) | |
mds <- runMds(data, colors, pchSize, cexSize) | |
mtext("(B)", line=linePos) | |
pca <- runPca(data, colors, pchSize, cexSize) | |
mtext("(C)", line=linePos) | |
dev.off() | |
return(list(lda=lda, mds=mds, pca=pca)) | |
} | |
## make a density plot from LDA's projections' | |
runCancerDensity <- function(data, ldaObj, benignIndexes, malignantIndexes) { | |
projections <- projectLda(data, ldaObj) | |
dens <- density(projections$first) | |
xRange <- range(dens$x) | |
benignDens <- density(projections$first[benignIndexes]) | |
malignantDens <- density(projections$first[malignantIndexes]) | |
pdf('fig/cancer-density-plot.pdf', width=6.6, height=2.2) | |
par(mar=c(2.5, 2.5, 2.5, .1), mgp=c(1.5, .5, 0)) | |
plot(benignDens$x, benignDens$y, | |
type="l", | |
col="blue", | |
xlim=xRange, | |
xlab="Linear Discriminant 1", | |
ylab="Density") | |
lines(malignantDens$x, malignantDens$y, col="red") | |
} | |
############### | |
## IRIS DATA ## | |
############### | |
runIris <- function() { | |
## make a copy of the iris data | |
mydata <- iris | |
## grab the species column | |
classes <- mydata$Species | |
## remove the species column from the data set | |
mydata$Species <- NULL | |
## Vector of colors | |
myColorKeys <- c('red', 'green', 'blue') | |
## Separate classes into variables | |
setosaIndexes <- which(classes == "setosa") | |
versicolorIndexes <- which(classes == "versicolor") | |
virginicaIndexes <- which(classes == "virginica") | |
## Start building a real color vector by cloning the classes vector | |
myColors <- as.vector(classes) | |
myColors[setosaIndexes] <- myColorKeys[1] | |
myColors[versicolorIndexes] <- myColorKeys[2] | |
myColors[virginicaIndexes] <- myColorKeys[3] | |
makeTriptych(mydata, classes, myColors, 'iris-plots.pdf') | |
} | |
########### | |
## FRUIT ## | |
########### | |
runFruit <- function() { | |
## download the fruit data and read it into a data.frame | |
fruitFn <- 'data/fruit.csv' | |
maybeDownload(fruitFn, 'http://www.cs.umt.edu/~dougr/PattRecFiles/projects/pca/fruit.csv') | |
fruit <- read.csv(fruitFn) | |
## Extract the various fruit types | |
classes <- fruit$Class | |
## Drop the class column | |
fruitData <- fruit[,1:4] | |
## Colors for apples, lemons, oranges, and peaches | |
myColorKeys <- c('red', 'yellow', 'orange', 'burlywood1') | |
## Separate classes into variables so we can color them differently | |
appleIndexes <- which(classes == "apple") | |
lemonIndexes <- which(classes == "lemon") | |
orangeIndexes <- which(classes == "orange") | |
peachIndexes <- which(classes == "peach") | |
## Start building a real color vector and set the colors for matching indexes | |
myColors <- as.vector(classes) | |
myColors[appleIndexes] <- myColorKeys[1] | |
myColors[lemonIndexes] <- myColorKeys[2] | |
myColors[orangeIndexes] <- myColorKeys[3] | |
myColors[peachIndexes] <- myColorKeys[4] | |
train <- sample(1:length(fruitData), length(fruitData)*0.1) | |
makeTriptych(fruitData, classes, myColors, 'fruit-plots.pdf') | |
} | |
########### | |
## MOUSE ## | |
########### | |
runMouse <- function() { | |
## returns the indexes matching either a proximal or distal end | |
fetchExpIndexes <- function(headers, kind) { | |
regexp <- paste("^[A-Z]+", kind, "[0-9]", sep="") | |
return(grep(regexp, headers)) | |
} | |
mouseFn <- 'data/mouse.csv' | |
maybeDownload(mouseFn, 'http://www.cs.umt.edu/~dougr/PattRecFiles/projects/mds/otu_table_L6.txt') | |
## read in the mouse data | |
mouse <- read.csv(mouseFn, sep="\t") | |
## drop the taxon | |
mouse$Taxon <- NULL | |
## save a copy of the headers | |
headers <- colnames(mouse) | |
## there were 98 experiments, but the data is stored in the wrong order, so | |
## transpose the matrix | |
flipped <- data.frame(t(mouse)) | |
## colorize the proximal and distal ends | |
myColorKeys <- c('red', 'blue') | |
myColors <- as.vector(headers) | |
myColors[fetchExpIndexes(headers, "P")] <- myColorKeys[1] | |
myColors[fetchExpIndexes(headers, "D")] <- myColorKeys[2] | |
classes <- as.vector(sapply(headers, function(x) { | |
if (grepl("[A-Z]+P[0-9]", x)) { | |
return('proximal') | |
} | |
return('distal') | |
})) | |
train <- sample(1:length(headers), length(headers)) | |
makeTriptych(flipped, classes, myColors, 'mouse-plots.pdf', train=train, tol=0) | |
} | |
########### | |
## TUMOR ## | |
########### | |
runTumor <- function() { | |
tumorFn <- 'data/tumor.csv' | |
maybeDownload(tumorFn, 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data') | |
## read in tumor data and remove duplicates | |
tumor <- unique(read.csv(tumorFn, header=FALSE, na.strings='?')) | |
for (i in 1:length(tumor)) { | |
nonNAs <- which(!is.na(tumor[,i])) | |
tumor[-nonNAs,i] <- mean(tumor[nonNAs,i]) | |
} | |
## assign headers | |
headers <- c('id', 'thickness', 'uniSize', 'uniShape', 'adhesion', | |
'singleSize', 'bare', 'bland', 'normal', 'mitoses', 'class') | |
colnames(tumor) <- headers | |
## grab the classes | |
classes <- tumor$class | |
## extract the indexes for specific classes | |
benignIndexes <- which(tumor$class == 2) | |
malignantIndexes <- which(tumor$class == 4) | |
## colorize the indexes | |
myColorKeys <- c('blue', 'red') | |
myColors <- as.vector(classes) | |
myColors[benignIndexes] <- myColorKeys[1] | |
myColors[malignantIndexes] <- myColorKeys[2] | |
## drop the id and class columns | |
d <- tumor | |
d$id <- NULL | |
d$class <- NULL | |
# hold 10% | |
train <- sample(1:nrow(d), nrow(d)*0.1) | |
results <- makeTriptych(d, classes, myColors, 'tumor-plots.pdf', train=train, tol=0) | |
runCancerDensity(d, results$lda, benignIndexes, malignantIndexes) | |
} | |
########## | |
## MAIN ## | |
########## | |
mkdirp('data') | |
mkdirp('fig') | |
runIris() | |
runFruit() | |
runMouse() | |
runTumor() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment