Created
February 19, 2011 02:01
-
-
Save stephenturner/834760 to your computer and use it in GitHub Desktop.
splitdf.randomize.r
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# splitdf splits a data frame into a training and a testing set.
#
# Arguments:
#   dataframe: the data frame to split by rows.
#   seed:      optional random seed; when not NULL, set.seed(seed) is called
#              before sampling so the split is reproducible.
#   trainfrac: fraction of rows assigned to the training set; must be a
#              single number strictly between 0 and 1.
#
# Returns a list of two data frames: trainset and testset. Every row of
# dataframe ends up in exactly one of the two.
splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
  if (!is.numeric(trainfrac) || length(trainfrac) != 1 || is.na(trainfrac) ||
      trainfrac <= 0 || trainfrac >= 1) {
    stop("Training fraction must be between 0 and 1, not inclusive")
  }
  if (!is.null(seed)) set.seed(seed)
  n <- nrow(dataframe)
  # Same size formula as the original so previously seeded splits are unchanged.
  ntrain <- trunc(n / (1 / trainfrac))
  # sample.int(n, k) draws the same indices as sample(1:n, k) but is also
  # correct for zero rows, where 1:n would be c(1, 0).
  trainindex <- sample.int(n, ntrain)
  # Select the test rows with a logical mask instead of a negative index:
  # dataframe[-integer(0), ] selects *no* rows, which would silently drop
  # every row whenever the training set happens to be empty.
  intrain <- seq_len(n) %in% trainindex
  # drop = FALSE keeps one-column data frames as data frames.
  trainset <- dataframe[trainindex, , drop = FALSE]
  testset <- dataframe[!intrain, , drop = FALSE]
  list(trainset=trainset, testset=testset)
}
# splitdf.randomize repeatedly splits a data frame with splitdf() until the
# chosen columns are evenly distributed between the training and testing sets.
#
# Arguments:
#   dataframe:     the data frame you want to randomize.
#   ttestcolnames: character vector with the names of the columns you want to
#                  be equally distributed among the two sets. These columns
#                  must be continuous variables. chi2 not yet implemented.
#   ...:           further arguments forwarded to splitdf() (seed, trainfrac).
#
# A split is accepted once the t-test p-value of every listed column is at
# least 0.5; the p-values of each attempt are printed. There is no cap on the
# number of attempts.
#
# Returns a list of two data frames: trainset and testset.
splitdf.randomize <- function(dataframe, ttestcolnames=c("cols","to","test"), ...) {
  d <- dataframe
  # Report only the names that are actually missing, in one readable message.
  missingcols <- setdiff(ttestcolnames, names(d))
  if (length(missingcols) > 0) {
    stop(paste(paste(missingcols, collapse=", "), "not in dataframe"))
  }
  # Match the forwarded arguments against splitdf's formals so that seed can
  # be found whether it was passed by name or by position.
  fwd <- as.list(match.call(splitdf, call("splitdf", quote(d), ...)))[-1]
  fwd[["dataframe"]] <- NULL
  seed <- fwd[["seed"]]
  fwd[["seed"]] <- NULL
  # Seed once, up front. Forwarding seed into every splitdf() call would
  # reproduce the identical split on each attempt and never terminate if the
  # first split is rejected.
  if (!is.null(seed)) set.seed(seed)
  repeat {
    sets <- do.call(splitdf, c(list(dataframe=d), fwd))
    trainset <- sets$trainset
    testset <- sets$testset
    # Iterate over the names in the caller's order so each p-value lines up
    # with the label it is printed next to.
    ps <- vapply(ttestcolnames, function(colname) {
      t.test(trainset[[colname]], testset[[colname]])$p.value
    }, numeric(1), USE.NAMES=FALSE)
    print(paste(ttestcolnames," t-test p-value =",ps))
    cat("\n")
    if (!any(ps < .5)) break
  }
  list(trainset=trainset,testset=testset)
}
# Example usage on the built-in iris data.
# sometimes you might have significant differences in variables of interest
# between training and testing sets.
data(iris)
s44 <- splitdf(iris, seed=44)
train <- s44$trainset
test <- s44$testset
t.test(train$Sepal.Length, test$Sepal.Length)
# first, specify which columns you want to ensure are "even" between the sets
cols <- c("Sepal.Length","Sepal.Width","Petal.Length","Petal.Width")
# Now, split up the dataset again, keeping even distribution of those variables.
set.seed(80842)
evensplit <- splitdf.randomize(iris,cols)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment