Last active
August 29, 2015 14:10
-
-
Save elyase/863b3517769462f4f5c9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| function (x, y = NULL, xtest = NULL, ytest = NULL, ntree = 500, | |
| mtry = if (!is.null(y) && !is.factor(y)) max(floor(ncol(x)/3), | |
| 1) else floor(sqrt(ncol(x))), replace = TRUE, classwt = NULL, | |
| cutoff, strata, sampsize = if (replace) nrow(x) else ceiling(0.632 * | |
| nrow(x)), nodesize = if (!is.null(y) && !is.factor(y)) 5 else 1, | |
| maxnodes = NULL, importance = FALSE, localImp = FALSE, nPerm = 1, | |
| proximity, oob.prox = proximity, norm.votes = TRUE, do.trace = FALSE, | |
| keep.forest = !is.null(y) && is.null(xtest), corr.bias = FALSE, | |
| keep.inbag = FALSE, ...) | |
| { | |
| addclass <- is.null(y) | |
| classRF <- addclass || is.factor(y) | |
| if (!classRF && length(unique(y)) <= 5) { | |
| warning("The response has five or fewer unique values. Are you sure you want to do regression?") | |
| } | |
| if (classRF && !addclass && length(unique(y)) < 2) | |
| stop("Need at least two classes to do classification.") | |
| n <- nrow(x) | |
| p <- ncol(x) | |
| if (n == 0) | |
| stop("data (x) has 0 rows") | |
| x.row.names <- rownames(x) | |
| x.col.names <- if (is.null(colnames(x))) | |
| 1:ncol(x) | |
| else colnames(x) | |
| keep.forest <- keep.forest | |
| testdat <- !is.null(xtest) | |
| if (testdat) { | |
| if (ncol(x) != ncol(xtest)) | |
| stop("x and xtest must have same number of columns") | |
| ntest <- nrow(xtest) | |
| xts.row.names <- rownames(xtest) | |
| } | |
| if (mtry < 1 || mtry > p) | |
| warning("invalid mtry: reset to within valid range") | |
| mtry <- max(1, min(p, round(mtry))) | |
| if (!is.null(y)) { | |
| if (length(y) != n) | |
| stop("length of response must be the same as predictors") | |
| addclass <- FALSE | |
| } | |
| else { | |
| if (!addclass) | |
| addclass <- TRUE | |
| y <- factor(c(rep(1, n), rep(2, n))) | |
| x <- rbind(x, x) | |
| } | |
| if (any(is.na(x))) | |
| stop("NA not permitted in predictors") | |
| if (testdat && any(is.na(xtest))) | |
| stop("NA not permitted in xtest") | |
| if (any(is.na(y))) | |
| stop("NA not permitted in response") | |
| if (!is.null(ytest) && any(is.na(ytest))) | |
| stop("NA not permitted in ytest") | |
| if (is.data.frame(x)) { | |
| xlevels <- lapply(x, mylevels) | |
| ncat <- sapply(xlevels, length) | |
| ncat <- ifelse(sapply(x, is.ordered), 1, ncat) | |
| x <- data.matrix(x) | |
| if (testdat) { | |
| if (!is.data.frame(xtest)) | |
| stop("xtest must be data frame if x is") | |
| xfactor <- which(sapply(xtest, is.factor)) | |
| if (length(xfactor) > 0) { | |
| for (i in xfactor) { | |
| if (any(!levels(xtest[[i]]) %in% xlevels[[i]])) | |
| stop("New factor levels in xtest not present in x") | |
| xtest[[i]] <- factor(xlevels[[i]][match(xtest[[i]], | |
| xlevels[[i]])], levels = xlevels[[i]]) | |
| } | |
| } | |
| xtest <- data.matrix(xtest) | |
| } | |
| } | |
| else { | |
| ncat <- rep(1, p) | |
| xlevels <- as.list(rep(0, p)) | |
| } | |
| maxcat <- max(ncat) | |
| if (maxcat > 53) | |
| stop("Can not handle categorical predictors with more than 53 categories.") | |
| if (classRF) { | |
| nclass <- length(levels(y)) | |
| if (any(table(y) == 0)) | |
| stop("Can't have empty classes in y.") | |
| if (!is.null(ytest)) { | |
| if (!is.factor(ytest)) | |
| stop("ytest must be a factor") | |
| if (!all(levels(y) == levels(ytest))) | |
| stop("y and ytest must have the same levels") | |
| } | |
| if (missing(cutoff)) { | |
| cutoff <- rep(1/nclass, nclass) | |
| } | |
| else { | |
| if (sum(cutoff) > 1 || sum(cutoff) < 0 || !all(cutoff > | |
| 0) || length(cutoff) != nclass) { | |
| stop("Incorrect cutoff specified.") | |
| } | |
| if (!is.null(names(cutoff))) { | |
| if (!all(names(cutoff) %in% levels(y))) { | |
| stop("Wrong name(s) for cutoff") | |
| } | |
| cutoff <- cutoff[levels(y)] | |
| } | |
| } | |
| if (!is.null(classwt)) { | |
| if (length(classwt) != nclass) | |
| stop("length of classwt not equal to number of classes") | |
| if (!is.null(names(classwt))) { | |
| if (!all(names(classwt) %in% levels(y))) { | |
| stop("Wrong name(s) for classwt") | |
| } | |
| classwt <- classwt[levels(y)] | |
| } | |
| if (any(classwt <= 0)) | |
| stop("classwt must be positive") | |
| ipi <- 1 | |
| } | |
| else { | |
| classwt <- rep(1, nclass) | |
| ipi <- 0 | |
| } | |
| } | |
| else addclass <- FALSE | |
| if (missing(proximity)) | |
| proximity <- addclass | |
| if (proximity) { | |
| prox <- matrix(0, n, n) | |
| proxts <- if (testdat) | |
| matrix(0, ntest, ntest + n) | |
| else double(1) | |
| } | |
| else { | |
| prox <- proxts <- double(1) | |
| } | |
| if (localImp) { | |
| importance <- TRUE | |
| impmat <- matrix(0, p, n) | |
| } | |
| else impmat <- double(1) | |
| if (importance) { | |
| if (nPerm < 1) | |
| nPerm <- as.integer(1) | |
| else nPerm <- as.integer(nPerm) | |
| if (classRF) { | |
| impout <- matrix(0, p, nclass + 2) | |
| impSD <- matrix(0, p, nclass + 1) | |
| } | |
| else { | |
| impout <- matrix(0, p, 2) | |
| impSD <- double(p) | |
| names(impSD) <- x.col.names | |
| } | |
| } | |
| else { | |
| impout <- double(p) | |
| impSD <- double(1) | |
| } | |
| nsample <- if (addclass) | |
| 2 * n | |
| else n | |
| Stratify <- length(sampsize) > 1 | |
| if ((!Stratify) && sampsize > nrow(x)) | |
| stop("sampsize too large") | |
| if (Stratify && (!classRF)) | |
| stop("sampsize should be of length one") | |
| if (classRF) { | |
| if (Stratify) { | |
| if (missing(strata)) | |
| strata <- y | |
| if (!is.factor(strata)) | |
| strata <- as.factor(strata) | |
| nsum <- sum(sampsize) | |
| if (length(sampsize) > nlevels(strata)) | |
| stop("sampsize has too many elements.") | |
| if (any(sampsize <= 0) || nsum == 0) | |
| stop("Bad sampsize specification") | |
| if (!is.null(names(sampsize))) { | |
| sampsize <- sampsize[levels(strata)] | |
| } | |
| if (any(sampsize > table(strata))) | |
| stop("sampsize can not be larger than class frequency") | |
| } | |
| else { | |
| nsum <- sampsize | |
| } | |
| nrnodes <- 2 * trunc(nsum/nodesize) + 1 | |
| } | |
| else { | |
| nrnodes <- 2 * trunc(sampsize/max(1, nodesize - 4)) + | |
| 1 | |
| } | |
| if (!is.null(maxnodes)) { | |
| maxnodes <- 2 * maxnodes - 1 | |
| if (maxnodes > nrnodes) | |
| warning("maxnodes exceeds its max value.") | |
| nrnodes <- min(c(nrnodes, max(c(maxnodes, 1)))) | |
| } | |
| x <- t(x) | |
| storage.mode(x) <- "double" | |
| if (testdat) { | |
| xtest <- t(xtest) | |
| storage.mode(xtest) <- "double" | |
| if (is.null(ytest)) { | |
| ytest <- labelts <- 0 | |
| } | |
| else { | |
| labelts <- TRUE | |
| } | |
| } | |
| else { | |
| xtest <- double(1) | |
| ytest <- double(1) | |
| ntest <- 1 | |
| labelts <- FALSE | |
| } | |
| nt <- if (keep.forest) | |
| ntree | |
| else 1 | |
| if (classRF) { | |
| cwt <- classwt | |
| threshold <- cutoff | |
| error.test <- if (labelts) | |
| double((nclass + 1) * ntree) | |
| else double(1) | |
| rfout <- .C("classRF", x = x, xdim = as.integer(c(p, | |
| n)), y = as.integer(y), nclass = as.integer(nclass), | |
| ncat = as.integer(ncat), maxcat = as.integer(maxcat), | |
| sampsize = as.integer(sampsize), strata = if (Stratify) as.integer(strata) else integer(1), | |
| Options = as.integer(c(addclass, importance, localImp, | |
| proximity, oob.prox, do.trace, keep.forest, replace, | |
| Stratify, keep.inbag)), ntree = as.integer(ntree), | |
| mtry = as.integer(mtry), ipi = as.integer(ipi), classwt = as.double(cwt), | |
| cutoff = as.double(threshold), nodesize = as.integer(nodesize), | |
| outcl = integer(nsample), counttr = integer(nclass * | |
| nsample), prox = prox, impout = impout, impSD = impSD, | |
| impmat = impmat, nrnodes = as.integer(nrnodes), ndbigtree = integer(ntree), | |
| nodestatus = integer(nt * nrnodes), bestvar = integer(nt * | |
| nrnodes), treemap = integer(nt * 2 * nrnodes), | |
| nodepred = integer(nt * nrnodes), xbestsplit = double(nt * | |
| nrnodes), errtr = double((nclass + 1) * ntree), | |
| testdat = as.integer(testdat), xts = as.double(xtest), | |
| clts = as.integer(ytest), nts = as.integer(ntest), | |
| countts = double(nclass * ntest), outclts = as.integer(numeric(ntest)), | |
| labelts = as.integer(labelts), proxts = proxts, errts = error.test, | |
| inbag = if (keep.inbag) matrix(integer(n * ntree), | |
| n) else integer(n), DUP = FALSE, PACKAGE = "randomForest")[-1] | |
| if (keep.forest) { | |
| max.nodes <- max(rfout$ndbigtree) | |
| treemap <- aperm(array(rfout$treemap, dim = c(2, | |
| nrnodes, ntree)), c(2, 1, 3))[1:max.nodes, , | |
| , drop = FALSE] | |
| } | |
| if (!addclass) { | |
| out.class <- factor(rfout$outcl, levels = 1:nclass, | |
| labels = levels(y)) | |
| names(out.class) <- x.row.names | |
| con <- table(observed = y, predicted = out.class)[levels(y), | |
| levels(y)] | |
| con <- cbind(con, class.error = 1 - diag(con)/rowSums(con)) | |
| } | |
| out.votes <- t(matrix(rfout$counttr, nclass, nsample))[1:n, | |
| ] | |
| oob.times <- rowSums(out.votes) | |
| if (norm.votes) | |
| out.votes <- t(apply(out.votes, 1, function(x) x/sum(x))) | |
| dimnames(out.votes) <- list(x.row.names, levels(y)) | |
| class(out.votes) <- c(class(out.votes), "votes") | |
| if (testdat) { | |
| out.class.ts <- factor(rfout$outclts, levels = 1:nclass, | |
| labels = levels(y)) | |
| names(out.class.ts) <- xts.row.names | |
| out.votes.ts <- t(matrix(rfout$countts, nclass, ntest)) | |
| dimnames(out.votes.ts) <- list(xts.row.names, levels(y)) | |
| if (norm.votes) | |
| out.votes.ts <- t(apply(out.votes.ts, 1, function(x) x/sum(x))) | |
| class(out.votes.ts) <- c(class(out.votes.ts), "votes") | |
| if (labelts) { | |
| testcon <- table(observed = ytest, predicted = out.class.ts)[levels(y), | |
| levels(y)] | |
| testcon <- cbind(testcon, class.error = 1 - diag(testcon)/rowSums(testcon)) | |
| } | |
| } | |
| cl <- match.call() | |
| cl[[1]] <- as.name("randomForest") | |
| out <- list(call = cl, type = if (addclass) "unsupervised" else "classification", | |
| predicted = if (addclass) NULL else out.class, err.rate = if (addclass) NULL else t(matrix(rfout$errtr, | |
| nclass + 1, ntree, dimnames = list(c("OOB", levels(y)), | |
| NULL))), confusion = if (addclass) NULL else con, | |
| votes = out.votes, oob.times = oob.times, classes = levels(y), | |
| importance = if (importance) matrix(rfout$impout, | |
| p, nclass + 2, dimnames = list(x.col.names, c(levels(y), | |
| "MeanDecreaseAccuracy", "MeanDecreaseGini"))) else matrix(rfout$impout, | |
| ncol = 1, dimnames = list(x.col.names, "MeanDecreaseGini")), | |
| importanceSD = if (importance) matrix(rfout$impSD, | |
| p, nclass + 1, dimnames = list(x.col.names, c(levels(y), | |
| "MeanDecreaseAccuracy"))) else NULL, localImportance = if (localImp) matrix(rfout$impmat, | |
| p, n, dimnames = list(x.col.names, x.row.names)) else NULL, | |
| proximity = if (proximity) matrix(rfout$prox, n, | |
| n, dimnames = list(x.row.names, x.row.names)) else NULL, | |
| ntree = ntree, mtry = mtry, forest = if (!keep.forest) NULL else { | |
| list(ndbigtree = rfout$ndbigtree, nodestatus = matrix(rfout$nodestatus, | |
| ncol = ntree)[1:max.nodes, , drop = FALSE], | |
| bestvar = matrix(rfout$bestvar, ncol = ntree)[1:max.nodes, | |
| , drop = FALSE], treemap = treemap, nodepred = matrix(rfout$nodepred, | |
| ncol = ntree)[1:max.nodes, , drop = FALSE], | |
| xbestsplit = matrix(rfout$xbestsplit, ncol = ntree)[1:max.nodes, | |
| , drop = FALSE], pid = rfout$classwt, cutoff = cutoff, | |
| ncat = ncat, maxcat = maxcat, nrnodes = max.nodes, | |
| ntree = ntree, nclass = nclass, xlevels = xlevels) | |
| }, y = if (addclass) NULL else y, test = if (!testdat) NULL else list(predicted = out.class.ts, | |
| err.rate = if (labelts) t(matrix(rfout$errts, | |
| nclass + 1, ntree, dimnames = list(c("Test", | |
| levels(y)), NULL))) else NULL, confusion = if (labelts) testcon else NULL, | |
| votes = out.votes.ts, proximity = if (proximity) matrix(rfout$proxts, | |
| nrow = ntest, dimnames = list(xts.row.names, | |
| c(xts.row.names, x.row.names))) else NULL), | |
| inbag = if (keep.inbag) rfout$inbag else NULL) | |
| } | |
| else { | |
| ymean <- mean(y) | |
| y <- y - ymean | |
| ytest <- ytest - ymean | |
| rfout <- .C("regRF", x, as.double(y), as.integer(c(n, | |
| p)), as.integer(sampsize), as.integer(nodesize), | |
| as.integer(nrnodes), as.integer(ntree), as.integer(mtry), | |
| as.integer(c(importance, localImp, nPerm)), as.integer(ncat), | |
| as.integer(maxcat), as.integer(do.trace), as.integer(proximity), | |
| as.integer(oob.prox), as.integer(corr.bias), ypred = double(n), | |
| impout = impout, impmat = impmat, impSD = impSD, | |
| prox = prox, ndbigtree = integer(ntree), nodestatus = matrix(integer(nrnodes * | |
| nt), ncol = nt), leftDaughter = matrix(integer(nrnodes * | |
| nt), ncol = nt), rightDaughter = matrix(integer(nrnodes * | |
| nt), ncol = nt), nodepred = matrix(double(nrnodes * | |
| nt), ncol = nt), bestvar = matrix(integer(nrnodes * | |
| nt), ncol = nt), xbestsplit = matrix(double(nrnodes * | |
| nt), ncol = nt), mse = double(ntree), keep = as.integer(c(keep.forest, | |
| keep.inbag)), replace = as.integer(replace), | |
| testdat = as.integer(testdat), xts = xtest, ntest = as.integer(ntest), | |
| yts = as.double(ytest), labelts = as.integer(labelts), | |
| ytestpred = double(ntest), proxts = proxts, msets = double(if (labelts) ntree else 1), | |
| coef = double(2), oob.times = integer(n), inbag = if (keep.inbag) matrix(integer(n * | |
| ntree), n) else integer(1), DUP = FALSE, PACKAGE = "randomForest")[c(16:28, | |
| 36:41)] | |
| if (keep.forest) { | |
| max.nodes <- max(rfout$ndbigtree) | |
| rfout$nodestatus <- rfout$nodestatus[1:max.nodes, | |
| , drop = FALSE] | |
| rfout$bestvar <- rfout$bestvar[1:max.nodes, , drop = FALSE] | |
| rfout$nodepred <- rfout$nodepred[1:max.nodes, , drop = FALSE] + | |
| ymean | |
| rfout$xbestsplit <- rfout$xbestsplit[1:max.nodes, | |
| , drop = FALSE] | |
| rfout$leftDaughter <- rfout$leftDaughter[1:max.nodes, | |
| , drop = FALSE] | |
| rfout$rightDaughter <- rfout$rightDaughter[1:max.nodes, | |
| , drop = FALSE] | |
| } | |
| cl <- match.call() | |
| cl[[1]] <- as.name("randomForest") | |
| ypred <- rfout$ypred | |
| if (any(rfout$oob.times < 1)) { | |
| ypred[rfout$oob.times == 0] <- NA | |
| } | |
| out <- list(call = cl, type = "regression", predicted = structure(ypred + | |
| ymean, names = x.row.names), mse = rfout$mse, rsq = 1 - | |
| rfout$mse/(var(y) * (n - 1)/n), oob.times = rfout$oob.times, | |
| importance = if (importance) matrix(rfout$impout, | |
| p, 2, dimnames = list(x.col.names, c("%IncMSE", | |
| "IncNodePurity"))) else matrix(rfout$impout, | |
| ncol = 1, dimnames = list(x.col.names, "IncNodePurity")), | |
| importanceSD = if (importance) rfout$impSD else NULL, | |
| localImportance = if (localImp) matrix(rfout$impmat, | |
| p, n, dimnames = list(x.col.names, x.row.names)) else NULL, | |
| proximity = if (proximity) matrix(rfout$prox, n, | |
| n, dimnames = list(x.row.names, x.row.names)) else NULL, | |
| ntree = ntree, mtry = mtry, forest = if (keep.forest) c(rfout[c("ndbigtree", | |
| "nodestatus", "leftDaughter", "rightDaughter", | |
| "nodepred", "bestvar", "xbestsplit")], list(ncat = ncat), | |
| list(nrnodes = max.nodes), list(ntree = ntree), | |
| list(xlevels = xlevels)) else NULL, coefs = if (corr.bias) rfout$coef else NULL, | |
| y = y + ymean, test = if (testdat) { | |
| list(predicted = structure(rfout$ytestpred + | |
| ymean, names = xts.row.names), mse = if (labelts) rfout$msets else NULL, | |
| rsq = if (labelts) 1 - rfout$msets/(var(ytest) * | |
| (n - 1)/n) else NULL, proximity = if (proximity) matrix(rfout$proxts/ntree, | |
| nrow = ntest, dimnames = list(xts.row.names, | |
| c(xts.row.names, x.row.names))) else NULL) | |
| } else NULL, inbag = if (keep.inbag) matrix(rfout$inbag, | |
| nrow(rfout$inbag), dimnames = list(x.row.names, | |
| NULL)) else NULL) | |
| } | |
| class(out) <- "randomForest" | |
| return(out) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment