Created
January 2, 2014 19:24
-
-
Save alienfluid/8224955 to your computer and use it in GitHub Desktop.
Convert a data frame with factors into a design matrix with dummy variables for the factor levels, and mean-impute missing values in a numeric design matrix.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Build a design matrix (dummy-coded predictors) aligned with a response.
##
## Arguments:
##   formula   either a character vector of column names to use as predictors,
##             or a standard one-sided model formula
##   response  vector of responses, one per row of `data`
##   data      data frame, matrix, or bare vector holding the predictors
##
## Returns a list with
##   X        data frame of model-matrix columns, intercept column removed
##   Y        the response entries for the rows kept in X (named by row name)
##   formula  the formula that was actually used
##
## Rows dropped by model.matrix() and rows whose response is NA are removed
## from both X and Y. Stops if the response length does not match nrow(data).
design <- function(formula, response, data) {
  if (is.character(formula)) {
    # drop = FALSE keeps a single selected column as a named column instead
    # of collapsing it to an unnamed vector.
    data <- data[, formula, drop = FALSE]
    formula <- ~ .
  }
  if (is.null(dim(data))) data <- as.matrix(data)
  # seq_len() is safe for zero rows, where 1:nrow(data) would yield c(1, 0).
  if (is.null(rownames(data))) rownames(data) <- seq_len(nrow(data))

  response <- c(response)
  if (nrow(data) != length(response)) {
    stop("response must have same number of observations as the data")
  }
  # Name the response by row so it can be re-aligned after model.matrix()
  # has removed rows.
  names(response) <- rownames(data)

  data <- as.data.frame(data)
  mm <- model.matrix(formula, data = data)
  # Remove the intercept by name rather than by position, so a no-intercept
  # formula does not lose a real predictor column; drop = FALSE keeps a
  # one-predictor design two-dimensional.
  mm <- mm[, colnames(mm) != "(Intercept)", drop = FALSE]
  x <- as.data.frame(mm)

  y <- response[rownames(x)]
  keep <- !is.na(y)
  list(X = x[keep, , drop = FALSE], Y = y[keep], formula = formula)
}
## Mean imputation for a numeric design matrix.
##
## Adds a column MISSING that is 1 for every row containing at least one NA
## and 0 otherwise, then replaces each NA with the mean of the non-missing
## values in its column. A column with no observed values is filled with NaN,
## because the mean of an empty set is NaN.
##
## Arguments:
##   x  numeric data frame (a matrix is converted to a data frame first)
##
## Returns a data frame with the same rows as `x`, all columns as doubles,
## plus the MISSING indicator column.
meanimpute <- function(x) {
  x <- as.data.frame(x)
  # Flag incomplete rows before any value is filled in.
  x$MISSING <- as.numeric(rowSums(is.na(x)) > 0)
  # Work column by column; apply() over columns collapses a one-row input
  # into a plain vector and would change the shape of the result.
  x[] <- lapply(x, function(col) {
    col <- as.numeric(col)
    col[is.na(col)] <- mean(col, na.rm = TRUE)
    col
  })
  x
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment