Created
February 11, 2015 18:19
-
-
Save hadley/59f1d0c98c567c24fe73 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(class) | |
| library(mass) | |
| library(mva) | |
| tst <- read.table("e:/uni/stats766/puktest.txt", header = TRUE) | |
| tst.v <- tst[,1:7] | |
| tst.g <- tst[,8] | |
| trn <- read.table("e:/uni/stats766/puktrain.txt", header = TRUE) | |
| trn.v <- trn[,1:7] | |
| trn.g <- trn[,8] | |
| all.v <- rbind(tst, trn) | |
| pca <- princomp(all.v[,1:7]) | |
| eqscplot(pca$scores[21:103,], pch=as.character(trn.g), cex=0.7) | |
| points(pca$scores[1:20,], pch=as.character(tst.g), cex=0.7, col="red") | |
| plot(pca$scores, pch=as.character(trn.g), cex=0.7) | |
| plot(trn.v, pch=as.character(trn.g), cex=0.7) | |
| m <- trn.v[grep("M",as.character(trn.g)),] | |
| f <- trn.v[grep("F",as.character(trn.g)),] | |
| mdev <- data.frame(V1 = m$V1 - median(m$V1), V2 = m$V2 - median(m$V2), V3 = m$V3 - median(m$V3), V4 = m$V4 - median(m$V4), V5 = m$V5 - median(m$V5) ,V6 = m$V6 - median(m$V6) ,V7 = m$V7 - median(m$V7)) | |
| mdev <- abs(mdev) | |
| fdev <- data.frame(V1 = f$V1 - median(f$V1), V2 = f$V2 - median(f$V2), V3 = f$V3 - median(f$V3), V4 = f$V4 - median(f$V4), V5 = f$V5 - median(f$V5) ,V6 = f$V6 - median(f$V6) ,V7 = f$V7 - median(f$V7)) | |
| fdev <- abs(fdev) | |
| g <- factor(rep(c("M","F"),c(43,40))) | |
| mfdev <- rbind(mdev, fdev) | |
| plot(trn.v$V3, trn.v$V7, pch=as.character(trn.g), cex=0.7) | |
| abline(v=29.75) | |
| r <- rpart(trn.g ~ trn.v$V1 + trn.v$V2 + trn.v$V3 + trn.v$V4 + trn.v$V5 + trn.v$V6 + trn.v$V7) | |
| rpred <- predict(r, trn.v, type="class") | |
| calcError <- function(pred, act = trn.g) { | |
| group.count <- tapply(act, act, length) | |
| f <- group.count["F"] | |
| m <- group.count["M"] | |
| if (is.na(f)) { | |
| f <- 0 | |
| } | |
| if (is.na(m)) { | |
| m <- 0 | |
| } | |
| false <- act[pred != act] | |
| list(error = false, m = m, f = f) | |
| } | |
| dispError <- function(e) { | |
| f <- e$f | |
| m <- e$m | |
| t <- f + m | |
| false <- e$error | |
| false.count <- tapply(false, false, length) | |
| f.false <- false.count["F"] | |
| m.false <- false.count["M"] | |
| t.false <- f.false + m.false | |
| cat("Incorrect classification: \n") | |
| cat("M: ", m.false, "/", m, " (", format(m.false / m * 100, digits = 3), "%)\n") | |
| cat("F: ", f.false, "/", f, " (", format(f.false / f * 100, digits = 3), "%)\n") | |
| cat("T: ", t.false, "/", t, " (", format(t.false / t * 100, digits = 3), "%)\n") | |
| } | |
| samplevector <- function(x, n) { | |
| size <- dim(x)[1] | |
| if (missing(n)) { | |
| n <- size | |
| } | |
| d <- dim(x)[2] | |
| v <- x[1:7, ] | |
| g <- x[8] | |
| bootstrap <- data.frame(x[floor(runif(1,0,1) * size) + 1, ]) | |
| #bootstrap <- matrix(0, n, d) | |
| for(i in 1:(n-1)) { | |
| i <- floor(runif(1,0,1) * size) + 1 | |
| bootstrap = rbind(bootstrap, x[i, ]) | |
| #bootstrap[i, ] = x[floor(runif(1,0,1) * size) + 1, ] | |
| } | |
| bootstrap | |
| } | |
| samplevectorrand <- function(x, n) { | |
| size <- dim(x)[1] | |
| if (missing(n)) { | |
| n <- size | |
| } | |
| d <- dim(x)[2] | |
| bootstrap <- data.frame(x[floor(runif(1,0,1) * size) + 1, ]) | |
| for(i in 1:(n-1)) { | |
| bootstrap = rbind(bootstrap, x[floor(runif(1,0,1) * size) + 1, ]) | |
| } | |
| bootstrap | |
| } | |
| crossvalidate <- function(train, group, func, ...) { | |
| e <- func(train, group, CV=true, ...) | |
| dispError(e) | |
| } | |
| chv.lda <- function(train, group) { | |
| result1 <- lda(x = train[1:41, ], grouping = group[1:41 ]) | |
| result2 <- lda(x = train[42:83, ], grouping = group[42:83 ]) | |
| pred1 <- predict(result1, train[42:83, ])$class | |
| pred2 <- predict(result2, train[1:41 , ])$class | |
| pred <- rbind(pred2, pred1) | |
| dispError(calcError(pred)) | |
| } | |
| chv.qda <- function(train, group) { | |
| result1 <- qda(x = train[1:41, ], grouping = group[1:41 ]) | |
| result2 <- qda(x = train[42:83, ], grouping = group[42:83 ]) | |
| pred1 <- predict(result1, train[42:83, ])$class | |
| pred2 <- predict(result2, train[1:41 , ])$class | |
| pred <- c(as.character(pred2), as.character(pred1)) | |
| dispError(calcError(pred)) | |
| } | |
| chv.qda <- function(train, group, k) { | |
| pred1 <- knn(train = train[1:41, ], test = train[42:83, ], cl = group[1:41],k) | |
| pred2 <- knn(train = train[42:83, ], test = train[1:41 , ], cl = group[42:83],k) | |
| pred <- c(as.character(pred2), as.character(pred1)) | |
| dispError(calcError(pred)) | |
| } | |
| crossvalidate.lda <- function(train, group) { | |
| crossvalidate(train, group, cv.lda.err) | |
| } | |
| crossvalidate.qda <- function(train, group) { | |
| n <- dim(train)[1] | |
| classes <- vector("numeric", n) | |
| for (i in 1:n) { | |
| result <- qda(x = train[-i, ], grouping = group[-i]) | |
| classes[i] <- as.character(predict(result, train[i, ])$class) | |
| } | |
| dispError(calcError(classes)) | |
| } | |
| crossvalidate.knn <- function(train, group ,k ) { | |
| n <- dim(train)[1] | |
| classes <- vector("numeric", n) | |
| for (i in 1:n) { | |
| classes[i] <- as.character(knn(train = train[-i, ], test = train[i, ], cl = group[-i], k)) | |
| } | |
| dispError(calcError(classes)) | |
| } | |
| resample <- function(train, group, func, ...) { | |
| e <- func(train, group, ...) | |
| dispError(e) | |
| } | |
| resample.lda <- function(train, group) { | |
| resample(train, group, cv.lda.err) | |
| } | |
| resample.qda <- function(train, group) { | |
| resample(train, group, cv.qda.err) | |
| } | |
| resample.knn <- function(train, group, k) { | |
| resample(train, group, cv.knn.err, k = k) | |
| } | |
| clv.lda.err <- function(sample, sampleg, orig = sample, origg = sampleg) { | |
| result <- lda(x = sample, grouping = sampleg, CV = true) | |
| pred <- predict(result, orig)$class | |
| calcError(pred, origg) | |
| } | |
| cv.qda.err <- function(sample, sampleg, orig = sample, origg = sampleg) { | |
| result <- qda(x = sample, grouping = sampleg) | |
| pred <- predict(result, orig)$class | |
| calcError(pred, origg) | |
| } | |
| cv.knn.err <- function(sample, sampleg, orig = sample, origg = sampleg, k) { | |
| pred <- knn(train = sample, test = orig, cl = sampleg, k) | |
| calcError(pred, origg) | |
| } | |
| #want to estimate bias of resubstitution method | |
| bootstrap <- function(train, group, func, n, ...) { | |
| combine <- cbind(train, group) | |
| dif = c(n) | |
| for (i in 1:n) { | |
| sv <- samplevector(combine) | |
| sample <- sv[, 1:7] | |
| sampleg <- sv[, 8] | |
| boot <- func(sample, sampleg, train, group, ...) | |
| orig <- func(sample, sampleg, ...) | |
| boot.e <- length(boot$error) | |
| orig.e <- length(orig$error) | |
| # orig is biased | |
| # boot is unbiased | |
| # orig - boot is estimate of bias | |
| dif[i] = orig.e - boot.e | |
| } | |
| hist(dif) | |
| invisible(dif) | |
| } | |
| bootstrap.knn <- function(train, group, n, k ) { | |
| bootstrap(train, group, cv.knn.err, n, k = k) | |
| } | |
| bootstrap.qda <- function(train, group, n) { | |
| bootstrap(train, group, cv.qda.err, n) | |
| } | |
| bootstrap.qda <- function(train, group, n) { | |
| bootstrap(train, group, cv.qda.err, n) | |
| } | |
| > boot <- bootstrap.lda(trn.v, trn.g, n=1000) | |
| > var(boot) | |
| [1] 3.643763 | |
| > quantile(c(0.025,0.975)) | |
| 0% 25% 50% 75% 100% | |
| 0.0250 0.2625 0.5000 0.7375 0.9750 | |
| > ?quantile | |
| > quantile(boot,c(0.025,0.975)) | |
| 2.5% 97.5% | |
| -2 5 | |
| > mean(boot) | |
| [1] 1.391 | |
| > |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment