Created
August 9, 2018 15:29
-
-
Save hadley/2f8f43a7e1aff3202eff78d8d7502eba to your computer and use it in GitHub Desktop.
The first R script I can find on my computer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(class) | |
library(mass) | |
library(mva) | |
tst <- read.table("e:/uni/stats766/puktest.txt", header = TRUE) | |
tst.v <- tst[,1:7] | |
tst.g <- tst[,8] | |
trn <- read.table("e:/uni/stats766/puktrain.txt", header = TRUE) | |
trn.v <- trn[,1:7] | |
trn.g <- trn[,8] | |
all.v <- rbind(tst, trn) | |
pca <- princomp(all.v[,1:7]) | |
eqscplot(pca$scores[21:103,], pch=as.character(trn.g), cex=0.7) | |
points(pca$scores[1:20,], pch=as.character(tst.g), cex=0.7, col="red") | |
plot(pca$scores, pch=as.character(trn.g), cex=0.7) | |
plot(trn.v, pch=as.character(trn.g), cex=0.7) | |
m <- trn.v[grep("M",as.character(trn.g)),] | |
f <- trn.v[grep("F",as.character(trn.g)),] | |
mdev <- data.frame(V1 = m$V1 - median(m$V1), V2 = m$V2 - median(m$V2), V3 = m$V3 - median(m$V3), V4 = m$V4 - median(m$V4), V5 = m$V5 - median(m$V5) ,V6 = m$V6 - median(m$V6) ,V7 = m$V7 - median(m$V7)) | |
mdev <- abs(mdev) | |
fdev <- data.frame(V1 = f$V1 - median(f$V1), V2 = f$V2 - median(f$V2), V3 = f$V3 - median(f$V3), V4 = f$V4 - median(f$V4), V5 = f$V5 - median(f$V5) ,V6 = f$V6 - median(f$V6) ,V7 = f$V7 - median(f$V7)) | |
fdev <- abs(fdev) | |
g <- factor(rep(c("M","F"),c(43,40))) | |
mfdev <- rbind(mdev, fdev) | |
plot(trn.v$V3, trn.v$V7, pch=as.character(trn.g), cex=0.7) | |
abline(v=29.75) | |
r <- rpart(trn.g ~ trn.v$V1 + trn.v$V2 + trn.v$V3 + trn.v$V4 + trn.v$V5 + trn.v$V6 + trn.v$V7) | |
rpred <- predict(r, trn.v, type="class") | |
calcError <- function(pred, act = trn.g) { | |
group.count <- tapply(act, act, length) | |
f <- group.count["F"] | |
m <- group.count["M"] | |
if (is.na(f)) { | |
f <- 0 | |
} | |
if (is.na(m)) { | |
m <- 0 | |
} | |
false <- act[pred != act] | |
list(error = false, m = m, f = f) | |
} | |
dispError <- function(e) { | |
f <- e$f | |
m <- e$m | |
t <- f + m | |
false <- e$error | |
false.count <- tapply(false, false, length) | |
f.false <- false.count["F"] | |
m.false <- false.count["M"] | |
t.false <- f.false + m.false | |
cat("Incorrect classification: \n") | |
cat("M: ", m.false, "/", m, " (", format(m.false / m * 100, digits = 3), "%)\n") | |
cat("F: ", f.false, "/", f, " (", format(f.false / f * 100, digits = 3), "%)\n") | |
cat("T: ", t.false, "/", t, " (", format(t.false / t * 100, digits = 3), "%)\n") | |
} | |
samplevector <- function(x, n) { | |
size <- dim(x)[1] | |
if (missing(n)) { | |
n <- size | |
} | |
d <- dim(x)[2] | |
v <- x[1:7, ] | |
g <- x[8] | |
bootstrap <- data.frame(x[floor(runif(1,0,1) * size) + 1, ]) | |
#bootstrap <- matrix(0, n, d) | |
for(i in 1:(n-1)) { | |
i <- floor(runif(1,0,1) * size) + 1 | |
bootstrap = rbind(bootstrap, x[i, ]) | |
#bootstrap[i, ] = x[floor(runif(1,0,1) * size) + 1, ] | |
} | |
bootstrap | |
} | |
samplevectorrand <- function(x, n) { | |
size <- dim(x)[1] | |
if (missing(n)) { | |
n <- size | |
} | |
d <- dim(x)[2] | |
bootstrap <- data.frame(x[floor(runif(1,0,1) * size) + 1, ]) | |
for(i in 1:(n-1)) { | |
bootstrap = rbind(bootstrap, x[floor(runif(1,0,1) * size) + 1, ]) | |
} | |
bootstrap | |
} | |
crossvalidate <- function(train, group, func, ...) { | |
e <- func(train, group, CV=true, ...) | |
dispError(e) | |
} | |
chv.lda <- function(train, group) { | |
result1 <- lda(x = train[1:41, ], grouping = group[1:41 ]) | |
result2 <- lda(x = train[42:83, ], grouping = group[42:83 ]) | |
pred1 <- predict(result1, train[42:83, ])$class | |
pred2 <- predict(result2, train[1:41 , ])$class | |
pred <- rbind(pred2, pred1) | |
dispError(calcError(pred)) | |
} | |
chv.qda <- function(train, group) { | |
result1 <- qda(x = train[1:41, ], grouping = group[1:41 ]) | |
result2 <- qda(x = train[42:83, ], grouping = group[42:83 ]) | |
pred1 <- predict(result1, train[42:83, ])$class | |
pred2 <- predict(result2, train[1:41 , ])$class | |
pred <- c(as.character(pred2), as.character(pred1)) | |
dispError(calcError(pred)) | |
} | |
chv.qda <- function(train, group, k) { | |
pred1 <- knn(train = train[1:41, ], test = train[42:83, ], cl = group[1:41],k) | |
pred2 <- knn(train = train[42:83, ], test = train[1:41 , ], cl = group[42:83],k) | |
pred <- c(as.character(pred2), as.character(pred1)) | |
dispError(calcError(pred)) | |
} | |
crossvalidate.lda <- function(train, group) { | |
crossvalidate(train, group, cv.lda.err) | |
} | |
crossvalidate.qda <- function(train, group) { | |
n <- dim(train)[1] | |
classes <- vector("numeric", n) | |
for (i in 1:n) { | |
result <- qda(x = train[-i, ], grouping = group[-i]) | |
classes[i] <- as.character(predict(result, train[i, ])$class) | |
} | |
dispError(calcError(classes)) | |
} | |
crossvalidate.knn <- function(train, group ,k ) { | |
n <- dim(train)[1] | |
classes <- vector("numeric", n) | |
for (i in 1:n) { | |
classes[i] <- as.character(knn(train = train[-i, ], test = train[i, ], cl = group[-i], k)) | |
} | |
dispError(calcError(classes)) | |
} | |
resample <- function(train, group, func, ...) { | |
e <- func(train, group, ...) | |
dispError(e) | |
} | |
resample.lda <- function(train, group) { | |
resample(train, group, cv.lda.err) | |
} | |
resample.qda <- function(train, group) { | |
resample(train, group, cv.qda.err) | |
} | |
resample.knn <- function(train, group, k) { | |
resample(train, group, cv.knn.err, k = k) | |
} | |
clv.lda.err <- function(sample, sampleg, orig = sample, origg = sampleg) { | |
result <- lda(x = sample, grouping = sampleg, CV = true) | |
pred <- predict(result, orig)$class | |
calcError(pred, origg) | |
} | |
cv.qda.err <- function(sample, sampleg, orig = sample, origg = sampleg) { | |
result <- qda(x = sample, grouping = sampleg) | |
pred <- predict(result, orig)$class | |
calcError(pred, origg) | |
} | |
cv.knn.err <- function(sample, sampleg, orig = sample, origg = sampleg, k) { | |
pred <- knn(train = sample, test = orig, cl = sampleg, k) | |
calcError(pred, origg) | |
} | |
#want to estimate bias of resubstitution method | |
bootstrap <- function(train, group, func, n, ...) { | |
combine <- cbind(train, group) | |
dif = c(n) | |
for (i in 1:n) { | |
sv <- samplevector(combine) | |
sample <- sv[, 1:7] | |
sampleg <- sv[, 8] | |
boot <- func(sample, sampleg, train, group, ...) | |
orig <- func(sample, sampleg, ...) | |
boot.e <- length(boot$error) | |
orig.e <- length(orig$error) | |
# orig is biased | |
# boot is unbiased | |
# orig - boot is estimate of bias | |
dif[i] = orig.e - boot.e | |
} | |
hist(dif) | |
invisible(dif) | |
} | |
bootstrap.knn <- function(train, group, n, k ) { | |
bootstrap(train, group, cv.knn.err, n, k = k) | |
} | |
bootstrap.qda <- function(train, group, n) { | |
bootstrap(train, group, cv.qda.err, n) | |
} | |
bootstrap.qda <- function(train, group, n) { | |
bootstrap(train, group, cv.qda.err, n) | |
} | |
> boot <- bootstrap.lda(trn.v, trn.g, n=1000) | |
> var(boot) | |
[1] 3.643763 | |
> quantile(c(0.025,0.975)) | |
0% 25% 50% 75% 100% | |
0.0250 0.2625 0.5000 0.7375 0.9750 | |
> ?quantile | |
> quantile(boot,c(0.025,0.975)) | |
2.5% 97.5% | |
-2 5 | |
> mean(boot) | |
[1] 1.391 | |
> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
your indenting wasn't bad even way back when