Skip to content

Instantly share code, notes, and snippets.

@jmdeldin
Created March 15, 2013 04:19
Show Gist options
  • Save jmdeldin/5167453 to your computer and use it in GitHub Desktop.
Save jmdeldin/5167453 to your computer and use it in GitHub Desktop.
## Jon-Michael Deldin
## Pattern Recognition, SP 2013
## LDA Project
##
## Usage:
## Rscript lda.R
##
## clear existing vars
rm(list = ls(all=TRUE))
## load the LDA library
library(MASS)
## create the directory to download the data (like `mkdir -p` on *nix)
mkdirp <- function(directory) {
dir.create(directory, recursive=TRUE, showWarnings=FALSE)
}
## download a file if it doesn't exist
maybeDownload <- function(filename, url) {
if (!file.exists(filename)) {
download.file(url, filename)
}
}
## runs lda() and returns an LDA object
getLda <- function(data, train, classes, tolerance) {
return(lda(data[train,], classes[train], tol=tolerance))
}
## returns both projections
projectLda <- function(data, ldaObj) {
mat <- as.matrix(data)
first <- mat %*% ldaObj$scaling[,1]
second <- mat %*% ldaObj$scaling[,2]
return(list(first=first, second=second))
}
## run LDA
runLda <- function(data, train, classes, colorVector, pchSize, cexSize, mytol=NULL) {
if (is.null(mytol)) {
mytol <- 1e-04
}
z <- getLda(data, train, classes, mytol)
## calculate the accuracy
actual <- classes[-train]
predicted <- predict(z, data[-train,])$class
failures <- length(which(actual != predicted))
n <- length(actual)
accuracy <- (n - failures)/n
print(paste("ACCURACY = ", accuracy*100, "%", sep=''))
## plot the projections
projections <- projectLda(data, z)
p1 <- projections$first
p2 <- projections$second
plot(p1, p2,
cex=cexSize,
col=colorVector,
xlab="Linear Discriminant 1",
ylab="Linear Discriminant 2",
pch=pchSize)
return(c(z, accuracy))
}
## run MDS
runMds <- function(data, colorVector, pchSize, cexSize) {
points <- cmdscale(dist(as.matrix(data)))
plot(points,
col=colorVector,
xlab="Eigen 1",
ylab="Eigen 2",
pch=pchSize,
cex=cexSize)
return(points)
}
## run PCA
runPca <- function(data, colorVector, pchSize, cexSize) {
prData <- prcomp(data)
x <- prData$x[,1]
y <- prData$x[,2]
plot(x, y,
col=colorVector,
xlab='Principal Component 1',
ylab='Principal Component 2',
cex=cexSize,
pch=pchSize)
return(prData)
}
## make a triptych of LDA, MDA, and PCA
makeTriptych <- function(data, classes, colors, filename, train=NULL, tol=NULL) {
if (is.null(train)) {
train <- sample(1:150, 125)
}
pchSize <- 0
cexSize <- 0.4
linePos <- 0.50
pdf(paste('fig', filename, sep='/'), width=6.6, height=2.2)
par(mar=c(2.5, 2.5, 2.5, .1), mgp=c(1.5, .5, 0))
layout(matrix(1:3, 1, 3), widths=c(1, 1, 1))
lda <- runLda(data, train, classes, colors, pchSize, cexSize, mytol=tol)
mtext("(A)", line=linePos)
mds <- runMds(data, colors, pchSize, cexSize)
mtext("(B)", line=linePos)
pca <- runPca(data, colors, pchSize, cexSize)
mtext("(C)", line=linePos)
dev.off()
return(list(lda=lda, mds=mds, pca=pca))
}
## make a density plot from LDA's projections'
runCancerDensity <- function(data, ldaObj, benignIndexes, malignantIndexes) {
projections <- projectLda(data, ldaObj)
dens <- density(projections$first)
xRange <- range(dens$x)
benignDens <- density(projections$first[benignIndexes])
malignantDens <- density(projections$first[malignantIndexes])
pdf('fig/cancer-density-plot.pdf', width=6.6, height=2.2)
par(mar=c(2.5, 2.5, 2.5, .1), mgp=c(1.5, .5, 0))
plot(benignDens$x, benignDens$y,
type="l",
col="blue",
xlim=xRange,
xlab="Linear Discriminant 1",
ylab="Density")
lines(malignantDens$x, malignantDens$y, col="red")
}
###############
## IRIS DATA ##
###############
runIris <- function() {
## make a copy of the iris data
mydata <- iris
## grab the species column
classes <- mydata$Species
## remove the species column from the data set
mydata$Species <- NULL
## Vector of colors
myColorKeys <- c('red', 'green', 'blue')
## Separate classes into variables
setosaIndexes <- which(classes == "setosa")
versicolorIndexes <- which(classes == "versicolor")
virginicaIndexes <- which(classes == "virginica")
## Start building a real color vector by cloning the classes vector
myColors <- as.vector(classes)
myColors[setosaIndexes] <- myColorKeys[1]
myColors[versicolorIndexes] <- myColorKeys[2]
myColors[virginicaIndexes] <- myColorKeys[3]
makeTriptych(mydata, classes, myColors, 'iris-plots.pdf')
}
###########
## FRUIT ##
###########
runFruit <- function() {
## download the fruit data and read it into a data.frame
fruitFn <- 'data/fruit.csv'
maybeDownload(fruitFn, 'http://www.cs.umt.edu/~dougr/PattRecFiles/projects/pca/fruit.csv')
fruit <- read.csv(fruitFn)
## Extract the various fruit types
classes <- fruit$Class
## Drop the class column
fruitData <- fruit[,1:4]
## Colors for apples, lemons, oranges, and peaches
myColorKeys <- c('red', 'yellow', 'orange', 'burlywood1')
## Separate classes into variables so we can color them differently
appleIndexes <- which(classes == "apple")
lemonIndexes <- which(classes == "lemon")
orangeIndexes <- which(classes == "orange")
peachIndexes <- which(classes == "peach")
## Start building a real color vector and set the colors for matching indexes
myColors <- as.vector(classes)
myColors[appleIndexes] <- myColorKeys[1]
myColors[lemonIndexes] <- myColorKeys[2]
myColors[orangeIndexes] <- myColorKeys[3]
myColors[peachIndexes] <- myColorKeys[4]
train <- sample(1:length(fruitData), length(fruitData)*0.1)
makeTriptych(fruitData, classes, myColors, 'fruit-plots.pdf')
}
###########
## MOUSE ##
###########
runMouse <- function() {
## returns the indexes matching either a proximal or distal end
fetchExpIndexes <- function(headers, kind) {
regexp <- paste("^[A-Z]+", kind, "[0-9]", sep="")
return(grep(regexp, headers))
}
mouseFn <- 'data/mouse.csv'
maybeDownload(mouseFn, 'http://www.cs.umt.edu/~dougr/PattRecFiles/projects/mds/otu_table_L6.txt')
## read in the mouse data
mouse <- read.csv(mouseFn, sep="\t")
## drop the taxon
mouse$Taxon <- NULL
## save a copy of the headers
headers <- colnames(mouse)
## there were 98 experiments, but the data is stored in the wrong order, so
## transpose the matrix
flipped <- data.frame(t(mouse))
## colorize the proximal and distal ends
myColorKeys <- c('red', 'blue')
myColors <- as.vector(headers)
myColors[fetchExpIndexes(headers, "P")] <- myColorKeys[1]
myColors[fetchExpIndexes(headers, "D")] <- myColorKeys[2]
classes <- as.vector(sapply(headers, function(x) {
if (grepl("[A-Z]+P[0-9]", x)) {
return('proximal')
}
return('distal')
}))
train <- sample(1:length(headers), length(headers))
makeTriptych(flipped, classes, myColors, 'mouse-plots.pdf', train=train, tol=0)
}
###########
## TUMOR ##
###########
runTumor <- function() {
tumorFn <- 'data/tumor.csv'
maybeDownload(tumorFn, 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data')
## read in tumor data and remove duplicates
tumor <- unique(read.csv(tumorFn, header=FALSE, na.strings='?'))
for (i in 1:length(tumor)) {
nonNAs <- which(!is.na(tumor[,i]))
tumor[-nonNAs,i] <- mean(tumor[nonNAs,i])
}
## assign headers
headers <- c('id', 'thickness', 'uniSize', 'uniShape', 'adhesion',
'singleSize', 'bare', 'bland', 'normal', 'mitoses', 'class')
colnames(tumor) <- headers
## grab the classes
classes <- tumor$class
## extract the indexes for specific classes
benignIndexes <- which(tumor$class == 2)
malignantIndexes <- which(tumor$class == 4)
## colorize the indexes
myColorKeys <- c('blue', 'red')
myColors <- as.vector(classes)
myColors[benignIndexes] <- myColorKeys[1]
myColors[malignantIndexes] <- myColorKeys[2]
## drop the id and class columns
d <- tumor
d$id <- NULL
d$class <- NULL
# hold 10%
train <- sample(1:nrow(d), nrow(d)*0.1)
results <- makeTriptych(d, classes, myColors, 'tumor-plots.pdf', train=train, tol=0)
runCancerDensity(d, results$lda, benignIndexes, malignantIndexes)
}
##########
## MAIN ##
##########
mkdirp('data')
mkdirp('fig')
runIris()
runFruit()
runMouse()
runTumor()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment