-
-
Save mrdwab/6424112 to your computer and use it in GitHub Desktop.
stratified <- function(df, group, size, select = NULL, | |
replace = FALSE, bothSets = FALSE) { | |
if (is.null(select)) { | |
df <- df | |
} else { | |
if (is.null(names(select))) stop("'select' must be a named list") | |
if (!all(names(select) %in% names(df))) | |
stop("Please verify your 'select' argument") | |
temp <- sapply(names(select), | |
function(x) df[[x]] %in% select[[x]]) | |
df <- df[rowSums(temp) == length(select), ] | |
} | |
df.interaction <- interaction(df[group], drop = TRUE) | |
df.table <- table(df.interaction) | |
df.split <- split(df, df.interaction) | |
if (length(size) > 1) { | |
if (length(size) != length(df.split)) | |
stop("Number of groups is ", length(df.split), | |
" but number of sizes supplied is ", length(size)) | |
if (is.null(names(size))) { | |
n <- setNames(size, names(df.split)) | |
message(sQuote("size"), " vector entered as:\n\nsize = structure(c(", | |
paste(n, collapse = ", "), "),\n.Names = c(", | |
paste(shQuote(names(n)), collapse = ", "), ")) \n\n") | |
} else { | |
ifelse(all(names(size) %in% names(df.split)), | |
n <- size[names(df.split)], | |
stop("Named vector supplied with names ", | |
paste(names(size), collapse = ", "), | |
"\n but the names for the group levels are ", | |
paste(names(df.split), collapse = ", "))) | |
} | |
} else if (size < 1) { | |
n <- round(df.table * size, digits = 0) | |
} else if (size >= 1) { | |
if (all(df.table >= size) || isTRUE(replace)) { | |
n <- setNames(rep(size, length.out = length(df.split)), | |
names(df.split)) | |
} else { | |
message( | |
"Some groups\n---", | |
paste(names(df.table[df.table < size]), collapse = ", "), | |
"---\ncontain fewer observations", | |
" than desired number of samples.\n", | |
"All observations have been returned from those groups.") | |
n <- c(sapply(df.table[df.table >= size], function(x) x = size), | |
df.table[df.table < size]) | |
} | |
} | |
temp <- lapply( | |
names(df.split), | |
function(x) df.split[[x]][sample(df.table[x], | |
n[x], replace = replace), ]) | |
set1 <- do.call("rbind", temp) | |
if (isTRUE(bothSets)) { | |
set2 <- df[!rownames(df) %in% rownames(set1), ] | |
list(SET1 = set1, SET2 = set2) | |
} else { | |
set1 | |
} | |
} |
thank you!
Great Function !!!!!......Thanks a lot !!
Great. Thank you very much for that.
awesome!
Hi, Thank you for the amazing code. But i have a query regarding using multiple columns to create strata.
Here you have shown one example "stratified(dat1, c("E", "D"), size = 0.15)" where both "E" and "D" are categorical columns. I was wondering if we can use multiple numerical columns. Please guide me for the same.
Basically your code : stratified(dat1, c("B", "C"), size = 0.15) should return some output.
Thanks in advance.
Hi, I tried to load the function using the following commands:
library(devtools)
source_gist("https://gist.github.com/mrdwab/6424112")
But, I got the following error:
Error in r_files[[which]] : invalid subscript type 'closure'
Really appreciate your help to fix this. This is exactly the function that I have been looking for and desperately need to use it.
Wow this is exactly what I need! Thank you so much!
By the way, is there a way to apply population weights for the sampling?
Thanks so much for this code, it works perfectly.
Hi there Ananda,
How do I make attribution to your article?
Such as citing the material. This is top stuff, indeed.
Great Function! @mrdwab Could you please provide a official citation guide to cite your function/package? Thank you!