stephenturner · February 19, 2011 02:01
diff --git a/splitdf.randomize.r b/splitdf.randomize.r
 # splitdf splits a data frame into a training set and a testing set.
 #
 # Arguments:
 #   dataframe: the data frame whose rows are partitioned.
 #   seed:      optional random seed; when not NULL it is passed to set.seed()
 #              before sampling so the split is reproducible.
 #   trainfrac: fraction of rows placed in the training set; must lie
 #              strictly between 0 and 1.
 #
 # Returns a list of two data frames, trainset and testset. The training set
 # holds trunc(nrow / (1 / trainfrac)) randomly sampled rows; the testing set
 # holds every remaining row, in its original order.
 splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
   if (trainfrac <= 0 || trainfrac >= 1) stop("Training fraction must be between 0 and 1, not inclusive")
   if (!is.null(seed)) set.seed(seed)
   # seq_len() yields an empty index for a zero-row frame; 1:0 would be c(1, 0).
   index <- seq_len(nrow(dataframe))
   ntrain <- trunc(length(index) / (1 / trainfrac))
   trainindex <- sample.int(length(index), ntrain)
   # Pick the test rows with a logical mask instead of -trainindex: when no
   # rows are sampled, dataframe[-integer(0), ] would drop every row and the
   # test set would come back empty.
   istrain <- index %in% trainindex
   # drop = FALSE keeps one-column inputs as data frames.
   trainset <- dataframe[trainindex, , drop = FALSE]
   testset <- dataframe[!istrain, , drop = FALSE]
   list(trainset = trainset, testset = testset)
 }

 # splitdf.randomize repeatedly splits a data frame with splitdf() until the
 # chosen columns look evenly distributed between the training and testing
 # sets, judged by t.test() on each column.
 #
 # Arguments:
 #   dataframe:     the data frame to split.
 #   ttestcolnames: character vector of column names to compare between the
 #                  two sets. These columns must be continuous variables;
 #                  chi2 is not yet implemented.
 #   ...:           further arguments forwarded to splitdf() (e.g. trainfrac).
 #                  A named seed is applied once, before the first split,
 #                  rather than on every attempt.
 #   pthreshold:    a split is accepted once no column's p-value is below
 #                  this value (default 0.5).
 #
 # Prints the p-values of every attempted split and returns a list of two
 # data frames, trainset and testset, from the accepted split.
 splitdf.randomize <- function(dataframe, ttestcolnames=c("cols","to","test"), ..., pthreshold=0.5) {
   d <- dataframe
   # Report only the columns that are actually absent.
   missingcols <- setdiff(ttestcolnames, names(d))
   if (length(missingcols) > 0) {
     stop(paste(paste(missingcols, collapse = ", "), "not in dataframe"))
   }
   splitargs <- list(...)
   if ("seed" %in% names(splitargs)) {
     # Re-seeding inside the loop would reproduce the same split on every
     # attempt and never terminate if that split is rejected.
     seed <- splitargs[["seed"]]
     if (!is.null(seed)) set.seed(seed)
     splitargs <- splitargs[names(splitargs) != "seed"]
   }
   ps <- NULL
   while (is.null(ps) || any(ps < pthreshold)) {
     sets <- do.call(splitdf, c(list(d), splitargs))
     trainset <- sets$trainset
     testset <- sets$testset
     # Iterate in ttestcolnames order so each p-value lines up with the name
     # it is printed next to.
     ps <- vapply(ttestcolnames, function(col) {
       t.test(trainset[[col]], testset[[col]])$p.value
     }, numeric(1), USE.NAMES = FALSE)
     print(paste(ttestcolnames," t-test p-value =",ps))
     cat("\n")
   }
   list(trainset = trainset, testset = testset)
 }

 # A plain random split can leave the training and testing sets different
 # on a variable of interest; the t-test below checks Sepal.Length for such
 # a difference on a seeded split of iris.
 data(iris)
 s44 <- splitdf(dataframe = iris, seed = 44)
 train <- s44[["trainset"]]
 test <- s44[["testset"]]
 t.test(train$Sepal.Length, test$Sepal.Length)

 # Columns that should end up "even" between the two sets.
 cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")

 # Split the dataset again, this time keeping those variables evenly
 # distributed between the sets.
 set.seed(80842)
 evensplit <- splitdf.randomize(dataframe = iris, ttestcolnames = cols)
	# splitdf splits a data frame into a training set and a testing set.
	#
	# Arguments:
	#   dataframe: the data frame whose rows are partitioned.
	#   seed:      optional random seed; when not NULL it is passed to set.seed()
	#              before sampling so the split is reproducible.
	#   trainfrac: fraction of rows placed in the training set; must lie
	#              strictly between 0 and 1.
	#
	# Returns a list of two data frames, trainset and testset. The training set
	# holds trunc(nrow / (1 / trainfrac)) randomly sampled rows; the testing set
	# holds every remaining row, in its original order.
	splitdf <- function(dataframe, seed=NULL, trainfrac=0.5) {
	  if (trainfrac <= 0 || trainfrac >= 1) stop("Training fraction must be between 0 and 1, not inclusive")
	  if (!is.null(seed)) set.seed(seed)
	  # seq_len() yields an empty index for a zero-row frame; 1:0 would be c(1, 0).
	  index <- seq_len(nrow(dataframe))
	  ntrain <- trunc(length(index) / (1 / trainfrac))
	  trainindex <- sample.int(length(index), ntrain)
	  # Pick the test rows with a logical mask instead of -trainindex: when no
	  # rows are sampled, dataframe[-integer(0), ] would drop every row and the
	  # test set would come back empty.
	  istrain <- index %in% trainindex
	  # drop = FALSE keeps one-column inputs as data frames.
	  trainset <- dataframe[trainindex, , drop = FALSE]
	  testset <- dataframe[!istrain, , drop = FALSE]
	  list(trainset = trainset, testset = testset)
	}

	# splitdf.randomize repeatedly splits a data frame with splitdf() until the
	# chosen columns look evenly distributed between the training and testing
	# sets, judged by t.test() on each column.
	#
	# Arguments:
	#   dataframe:     the data frame to split.
	#   ttestcolnames: character vector of column names to compare between the
	#                  two sets. These columns must be continuous variables;
	#                  chi2 is not yet implemented.
	#   ...:           further arguments forwarded to splitdf() (e.g. trainfrac).
	#                  A named seed is applied once, before the first split,
	#                  rather than on every attempt.
	#   pthreshold:    a split is accepted once no column's p-value is below
	#                  this value (default 0.5).
	#
	# Prints the p-values of every attempted split and returns a list of two
	# data frames, trainset and testset, from the accepted split.
	splitdf.randomize <- function(dataframe, ttestcolnames=c("cols","to","test"), ..., pthreshold=0.5) {
	  d <- dataframe
	  # Report only the columns that are actually absent.
	  missingcols <- setdiff(ttestcolnames, names(d))
	  if (length(missingcols) > 0) {
	    stop(paste(paste(missingcols, collapse = ", "), "not in dataframe"))
	  }
	  splitargs <- list(...)
	  if ("seed" %in% names(splitargs)) {
	    # Re-seeding inside the loop would reproduce the same split on every
	    # attempt and never terminate if that split is rejected.
	    seed <- splitargs[["seed"]]
	    if (!is.null(seed)) set.seed(seed)
	    splitargs <- splitargs[names(splitargs) != "seed"]
	  }
	  ps <- NULL
	  while (is.null(ps) || any(ps < pthreshold)) {
	    sets <- do.call(splitdf, c(list(d), splitargs))
	    trainset <- sets$trainset
	    testset <- sets$testset
	    # Iterate in ttestcolnames order so each p-value lines up with the name
	    # it is printed next to.
	    ps <- vapply(ttestcolnames, function(col) {
	      t.test(trainset[[col]], testset[[col]])$p.value
	    }, numeric(1), USE.NAMES = FALSE)
	    print(paste(ttestcolnames," t-test p-value =",ps))
	    cat("\n")
	  }
	  list(trainset = trainset, testset = testset)
	}

	# A plain random split can leave the training and testing sets different
	# on a variable of interest; the t-test below checks Sepal.Length for such
	# a difference on a seeded split of iris.
	data(iris)
	s44 <- splitdf(dataframe = iris, seed = 44)
	train <- s44[["trainset"]]
	test <- s44[["testset"]]
	t.test(train$Sepal.Length, test$Sepal.Length)

	# Columns that should end up "even" between the two sets.
	cols <- c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")

	# Split the dataset again, this time keeping those variables evenly
	# distributed between the sets.
	set.seed(80842)
	evensplit <- splitdf.randomize(dataframe = iris, ttestcolnames = cols)