jmdeldin · March 15, 2013 04:19
diff --git a/lda.R b/lda.R
 ## Jon-Michael Deldin
 ## Pattern Recognition, SP 2013
 ## LDA Project
 ##
 ## Usage:
 ##   Rscript lda.R
 ##

 ## clear existing vars
 rm(list = ls(all=TRUE))

 ## load the LDA library
 library(MASS)

 ## create the directory to download the data (like `mkdir -p` on *nix)
 mkdirp <- function(directory) {
  dir.create(directory, recursive=TRUE, showWarnings=FALSE)
 }

 ## download a file if it doesn't exist
 maybeDownload <- function(filename, url) {
  if (!file.exists(filename)) {
    download.file(url, filename)
  }
 }

 ## runs lda() and returns an LDA object
 getLda <- function(data, train, classes, tolerance) {
  return(lda(data[train,], classes[train], tol=tolerance))
 }

 ## returns both projections
 projectLda <- function(data, ldaObj) {
  mat <- as.matrix(data)
  first <- mat %*% ldaObj$scaling[,1]
  second <- mat %*% ldaObj$scaling[,2]
  return(list(first=first, second=second))
 }

 ## run LDA
 runLda <- function(data, train, classes, colorVector, pchSize, cexSize, mytol=NULL) {
  if (is.null(mytol)) {
    mytol <- 1e-04
  }
  z <- getLda(data, train, classes, mytol)

  ## calculate the accuracy
  actual <- classes[-train]
  predicted <- predict(z, data[-train,])$class
  failures <- length(which(actual != predicted))
  n <- length(actual)
  accuracy <- (n - failures)/n
  print(paste("ACCURACY = ", accuracy*100, "%", sep=''))

  ## plot the projections
  projections <- projectLda(data, z)
  p1 <- projections$first
  p2 <- projections$second
  plot(p1, p2,
       cex=cexSize,
       col=colorVector,
       xlab="Linear Discriminant 1",
       ylab="Linear Discriminant 2",
       pch=pchSize)
  return(c(z, accuracy))
 }

 ## run MDS
 runMds <- function(data, colorVector, pchSize, cexSize) {
  points <- cmdscale(dist(as.matrix(data)))
  plot(points,
       col=colorVector,
       xlab="Eigen 1",
       ylab="Eigen 2",
       pch=pchSize,
       cex=cexSize)
  return(points)
 }

 ## run PCA
 runPca <- function(data, colorVector, pchSize, cexSize) {
  prData <- prcomp(data)
  x <- prData$x[,1]
  y <- prData$x[,2]
  plot(x, y,
       col=colorVector,
       xlab='Principal Component 1',
       ylab='Principal Component 2',
       cex=cexSize,
       pch=pchSize)
  return(prData)
 }

 ## make a triptych of LDA, MDA, and PCA
 makeTriptych <- function(data, classes, colors, filename, train=NULL, tol=NULL) {
  if (is.null(train)) {
    train <- sample(1:150, 125)
  }
  pchSize <- 0
  cexSize <- 0.4
  linePos <- 0.50

  pdf(paste('fig', filename, sep='/'), width=6.6, height=2.2)
  par(mar=c(2.5, 2.5, 2.5, .1), mgp=c(1.5, .5, 0))
  layout(matrix(1:3, 1, 3), widths=c(1, 1, 1))

  lda <- runLda(data, train, classes, colors, pchSize, cexSize, mytol=tol)
  mtext("(A)", line=linePos)

  mds <- runMds(data, colors, pchSize, cexSize)
  mtext("(B)", line=linePos)

  pca <- runPca(data, colors, pchSize, cexSize)
  mtext("(C)", line=linePos)

  dev.off()

  return(list(lda=lda, mds=mds, pca=pca))
 }

 ## make a density plot from LDA's projections'
 runCancerDensity <- function(data, ldaObj, benignIndexes, malignantIndexes) {
  projections <- projectLda(data, ldaObj)
  dens <- density(projections$first)
  xRange <- range(dens$x)
  benignDens <- density(projections$first[benignIndexes])
  malignantDens <- density(projections$first[malignantIndexes])

  pdf('fig/cancer-density-plot.pdf', width=6.6, height=2.2)
  par(mar=c(2.5, 2.5, 2.5, .1), mgp=c(1.5, .5, 0))
  plot(benignDens$x, benignDens$y,
       type="l",
       col="blue",
       xlim=xRange,
       xlab="Linear Discriminant 1",
       ylab="Density")
  lines(malignantDens$x, malignantDens$y, col="red")
 }

 ###############
 ## IRIS DATA ##
 ###############
 runIris <- function() {
  ## make a copy of the iris data
  mydata <- iris

  ## grab the species column
  classes <- mydata$Species

  ## remove the species column from the data set
  mydata$Species <- NULL

  ## Vector of colors
  myColorKeys <- c('red', 'green', 'blue')

  ## Separate classes into variables
  setosaIndexes <- which(classes == "setosa")
  versicolorIndexes <- which(classes == "versicolor")
  virginicaIndexes <- which(classes == "virginica")

  ## Start building a real color vector by cloning the classes vector
  myColors <- as.vector(classes)
  myColors[setosaIndexes] <- myColorKeys[1]
  myColors[versicolorIndexes] <- myColorKeys[2]
  myColors[virginicaIndexes] <- myColorKeys[3]

  makeTriptych(mydata, classes, myColors, 'iris-plots.pdf')
 }

 ###########
 ## FRUIT ##
 ###########
 runFruit <- function() {
  ## download the fruit data and read it into a data.frame
  fruitFn <- 'data/fruit.csv'
  maybeDownload(fruitFn, 'http://www.cs.umt.edu/~dougr/PattRecFiles/projects/pca/fruit.csv')
  fruit <- read.csv(fruitFn)

  ## Extract the various fruit types
  classes <- fruit$Class

  ## Drop the class column
  fruitData <- fruit[,1:4]

  ## Colors for apples, lemons, oranges, and peaches
  myColorKeys <- c('red', 'yellow', 'orange', 'burlywood1')

  ## Separate classes into variables so we can color them differently
  appleIndexes  <- which(classes == "apple")
  lemonIndexes  <- which(classes == "lemon")
  orangeIndexes <- which(classes == "orange")
  peachIndexes  <- which(classes == "peach")

  ## Start building a real color vector and set the colors for matching indexes
  myColors <- as.vector(classes)
  myColors[appleIndexes]  <- myColorKeys[1]
  myColors[lemonIndexes]  <- myColorKeys[2]
  myColors[orangeIndexes] <- myColorKeys[3]
  myColors[peachIndexes]  <- myColorKeys[4]

  train <- sample(1:length(fruitData), length(fruitData)*0.1)
  makeTriptych(fruitData, classes, myColors, 'fruit-plots.pdf')
 }

 ###########
 ## MOUSE ##
 ###########
 runMouse <- function() {
  ## returns the indexes matching either a proximal or distal end
  fetchExpIndexes <- function(headers, kind) {
    regexp <- paste("^[A-Z]+", kind, "[0-9]", sep="")
    return(grep(regexp, headers))
  }

  mouseFn <- 'data/mouse.csv'
  maybeDownload(mouseFn, 'http://www.cs.umt.edu/~dougr/PattRecFiles/projects/mds/otu_table_L6.txt')

  ## read in the mouse data
  mouse <- read.csv(mouseFn, sep="\t")

  ## drop the taxon
  mouse$Taxon <- NULL

  ## save a copy of the headers
  headers <- colnames(mouse)

  ## there were 98 experiments, but the data is stored in the wrong order, so
  ## transpose the matrix
  flipped <- data.frame(t(mouse))

  ## colorize the proximal and distal ends
  myColorKeys <- c('red', 'blue')

  myColors <- as.vector(headers)
  myColors[fetchExpIndexes(headers, "P")] <- myColorKeys[1]
  myColors[fetchExpIndexes(headers, "D")] <- myColorKeys[2]

  classes <- as.vector(sapply(headers, function(x) {
    if (grepl("[A-Z]+P[0-9]", x)) {
      return('proximal')
    }
    return('distal')
  }))

  train <- sample(1:length(headers), length(headers))
  makeTriptych(flipped, classes, myColors, 'mouse-plots.pdf', train=train, tol=0)
 }

 ###########
 ## TUMOR ##
 ###########
 runTumor <- function() {
  tumorFn <- 'data/tumor.csv'
  maybeDownload(tumorFn, 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data')

  ## read in tumor data and remove duplicates
  tumor <- unique(read.csv(tumorFn, header=FALSE, na.strings='?'))

  for (i in 1:length(tumor)) {
    nonNAs <- which(!is.na(tumor[,i]))
    tumor[-nonNAs,i] <- mean(tumor[nonNAs,i])
  }

  ## assign headers
  headers <- c('id', 'thickness', 'uniSize', 'uniShape', 'adhesion',
               'singleSize', 'bare', 'bland', 'normal', 'mitoses', 'class')
  colnames(tumor) <- headers

  ## grab the classes
  classes <- tumor$class

  ## extract the indexes for specific classes
  benignIndexes <- which(tumor$class == 2)
  malignantIndexes <- which(tumor$class == 4)

  ## colorize the indexes
  myColorKeys <- c('blue', 'red')
  myColors <- as.vector(classes)
  myColors[benignIndexes] <- myColorKeys[1]
  myColors[malignantIndexes] <- myColorKeys[2]

  ## drop the id and class columns
  d <- tumor
  d$id <- NULL
  d$class <- NULL

  # hold 10%
  train <- sample(1:nrow(d), nrow(d)*0.1)

  results <- makeTriptych(d, classes, myColors, 'tumor-plots.pdf', train=train, tol=0)
  runCancerDensity(d, results$lda, benignIndexes, malignantIndexes)
 }

 ##########
 ## MAIN ##
 ##########
 mkdirp('data')
 mkdirp('fig')
 runIris()
 runFruit()
 runMouse()
 runTumor()
	## Jon-Michael Deldin
	## Pattern Recognition, SP 2013
	## LDA Project
	##
	## Usage:
	## Rscript lda.R
	##

	## clear existing vars
	rm(list = ls(all=TRUE))

	## load the LDA library
	library(MASS)

	## create the directory to download the data (like `mkdir -p` on *nix)
	mkdirp <- function(directory) {
	dir.create(directory, recursive=TRUE, showWarnings=FALSE)
	}

	## download a file if it doesn't exist
	maybeDownload <- function(filename, url) {
	if (!file.exists(filename)) {
	download.file(url, filename)
	}
	}

	## runs lda() and returns an LDA object
	getLda <- function(data, train, classes, tolerance) {
	return(lda(data[train,], classes[train], tol=tolerance))
	}

	## returns both projections
	projectLda <- function(data, ldaObj) {
	mat <- as.matrix(data)
	first <- mat %*% ldaObj$scaling[,1]
	second <- mat %*% ldaObj$scaling[,2]
	return(list(first=first, second=second))
	}

	## run LDA
	runLda <- function(data, train, classes, colorVector, pchSize, cexSize, mytol=NULL) {
	if (is.null(mytol)) {
	mytol <- 1e-04
	}
	z <- getLda(data, train, classes, mytol)

	## calculate the accuracy
	actual <- classes[-train]
	predicted <- predict(z, data[-train,])$class
	failures <- length(which(actual != predicted))
	n <- length(actual)
	accuracy <- (n - failures)/n
	print(paste("ACCURACY = ", accuracy*100, "%", sep=''))

	## plot the projections
	projections <- projectLda(data, z)
	p1 <- projections$first
	p2 <- projections$second
	plot(p1, p2,
	cex=cexSize,
	col=colorVector,
	xlab="Linear Discriminant 1",
	ylab="Linear Discriminant 2",
	pch=pchSize)
	return(c(z, accuracy))
	}

	## run MDS
	runMds <- function(data, colorVector, pchSize, cexSize) {
	points <- cmdscale(dist(as.matrix(data)))
	plot(points,
	col=colorVector,
	xlab="Eigen 1",
	ylab="Eigen 2",
	pch=pchSize,
	cex=cexSize)
	return(points)
	}

	## run PCA
	runPca <- function(data, colorVector, pchSize, cexSize) {
	prData <- prcomp(data)
	x <- prData$x[,1]
	y <- prData$x[,2]
	plot(x, y,
	col=colorVector,
	xlab='Principal Component 1',
	ylab='Principal Component 2',
	cex=cexSize,
	pch=pchSize)
	return(prData)
	}

	## make a triptych of LDA, MDA, and PCA
	makeTriptych <- function(data, classes, colors, filename, train=NULL, tol=NULL) {
	if (is.null(train)) {
	train <- sample(1:150, 125)
	}
	pchSize <- 0
	cexSize <- 0.4
	linePos <- 0.50

	pdf(paste('fig', filename, sep='/'), width=6.6, height=2.2)
	par(mar=c(2.5, 2.5, 2.5, .1), mgp=c(1.5, .5, 0))
	layout(matrix(1:3, 1, 3), widths=c(1, 1, 1))

	lda <- runLda(data, train, classes, colors, pchSize, cexSize, mytol=tol)
	mtext("(A)", line=linePos)

	mds <- runMds(data, colors, pchSize, cexSize)
	mtext("(B)", line=linePos)

	pca <- runPca(data, colors, pchSize, cexSize)
	mtext("(C)", line=linePos)

	dev.off()

	return(list(lda=lda, mds=mds, pca=pca))
	}

	## make a density plot from LDA's projections'
	runCancerDensity <- function(data, ldaObj, benignIndexes, malignantIndexes) {
	projections <- projectLda(data, ldaObj)
	dens <- density(projections$first)
	xRange <- range(dens$x)
	benignDens <- density(projections$first[benignIndexes])
	malignantDens <- density(projections$first[malignantIndexes])

	pdf('fig/cancer-density-plot.pdf', width=6.6, height=2.2)
	par(mar=c(2.5, 2.5, 2.5, .1), mgp=c(1.5, .5, 0))
	plot(benignDens$x, benignDens$y,
	type="l",
	col="blue",
	xlim=xRange,
	xlab="Linear Discriminant 1",
	ylab="Density")
	lines(malignantDens$x, malignantDens$y, col="red")
	}

	###############
	## IRIS DATA ##
	###############
	runIris <- function() {
	## make a copy of the iris data
	mydata <- iris

	## grab the species column
	classes <- mydata$Species

	## remove the species column from the data set
	mydata$Species <- NULL

	## Vector of colors
	myColorKeys <- c('red', 'green', 'blue')

	## Separate classes into variables
	setosaIndexes <- which(classes == "setosa")
	versicolorIndexes <- which(classes == "versicolor")
	virginicaIndexes <- which(classes == "virginica")

	## Start building a real color vector by cloning the classes vector
	myColors <- as.vector(classes)
	myColors[setosaIndexes] <- myColorKeys[1]
	myColors[versicolorIndexes] <- myColorKeys[2]
	myColors[virginicaIndexes] <- myColorKeys[3]

	makeTriptych(mydata, classes, myColors, 'iris-plots.pdf')
	}

	###########
	## FRUIT ##
	###########
	runFruit <- function() {
	## download the fruit data and read it into a data.frame
	fruitFn <- 'data/fruit.csv'
	maybeDownload(fruitFn, 'http://www.cs.umt.edu/~dougr/PattRecFiles/projects/pca/fruit.csv')
	fruit <- read.csv(fruitFn)

	## Extract the various fruit types
	classes <- fruit$Class

	## Drop the class column
	fruitData <- fruit[,1:4]

	## Colors for apples, lemons, oranges, and peaches
	myColorKeys <- c('red', 'yellow', 'orange', 'burlywood1')

	## Separate classes into variables so we can color them differently
	appleIndexes <- which(classes == "apple")
	lemonIndexes <- which(classes == "lemon")
	orangeIndexes <- which(classes == "orange")
	peachIndexes <- which(classes == "peach")

	## Start building a real color vector and set the colors for matching indexes
	myColors <- as.vector(classes)
	myColors[appleIndexes] <- myColorKeys[1]
	myColors[lemonIndexes] <- myColorKeys[2]
	myColors[orangeIndexes] <- myColorKeys[3]
	myColors[peachIndexes] <- myColorKeys[4]

	train <- sample(1:length(fruitData), length(fruitData)*0.1)
	makeTriptych(fruitData, classes, myColors, 'fruit-plots.pdf')
	}

	###########
	## MOUSE ##
	###########
	runMouse <- function() {
	## returns the indexes matching either a proximal or distal end
	fetchExpIndexes <- function(headers, kind) {
	regexp <- paste("^[A-Z]+", kind, "[0-9]", sep="")
	return(grep(regexp, headers))
	}

	mouseFn <- 'data/mouse.csv'
	maybeDownload(mouseFn, 'http://www.cs.umt.edu/~dougr/PattRecFiles/projects/mds/otu_table_L6.txt')

	## read in the mouse data
	mouse <- read.csv(mouseFn, sep="\t")

	## drop the taxon
	mouse$Taxon <- NULL

	## save a copy of the headers
	headers <- colnames(mouse)

	## there were 98 experiments, but the data is stored in the wrong order, so
	## transpose the matrix
	flipped <- data.frame(t(mouse))

	## colorize the proximal and distal ends
	myColorKeys <- c('red', 'blue')

	myColors <- as.vector(headers)
	myColors[fetchExpIndexes(headers, "P")] <- myColorKeys[1]
	myColors[fetchExpIndexes(headers, "D")] <- myColorKeys[2]

	classes <- as.vector(sapply(headers, function(x) {
	if (grepl("[A-Z]+P[0-9]", x)) {
	return('proximal')
	}
	return('distal')
	}))

	train <- sample(1:length(headers), length(headers))
	makeTriptych(flipped, classes, myColors, 'mouse-plots.pdf', train=train, tol=0)
	}

	###########
	## TUMOR ##
	###########
	runTumor <- function() {
	tumorFn <- 'data/tumor.csv'
	maybeDownload(tumorFn, 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data')

	## read in tumor data and remove duplicates
	tumor <- unique(read.csv(tumorFn, header=FALSE, na.strings='?'))

	for (i in 1:length(tumor)) {
	nonNAs <- which(!is.na(tumor[,i]))
	tumor[-nonNAs,i] <- mean(tumor[nonNAs,i])
	}

	## assign headers
	headers <- c('id', 'thickness', 'uniSize', 'uniShape', 'adhesion',
	'singleSize', 'bare', 'bland', 'normal', 'mitoses', 'class')
	colnames(tumor) <- headers

	## grab the classes
	classes <- tumor$class

	## extract the indexes for specific classes
	benignIndexes <- which(tumor$class == 2)
	malignantIndexes <- which(tumor$class == 4)

	## colorize the indexes
	myColorKeys <- c('blue', 'red')
	myColors <- as.vector(classes)
	myColors[benignIndexes] <- myColorKeys[1]
	myColors[malignantIndexes] <- myColorKeys[2]

	## drop the id and class columns
	d <- tumor
	d$id <- NULL
	d$class <- NULL

	# hold 10%
	train <- sample(1:nrow(d), nrow(d)*0.1)

	results <- makeTriptych(d, classes, myColors, 'tumor-plots.pdf', train=train, tol=0)
	runCancerDensity(d, results$lda, benignIndexes, malignantIndexes)
	}

	##########
	## MAIN ##
	##########
	mkdirp('data')
	mkdirp('fig')
	runIris()
	runFruit()
	runMouse()
	runTumor()