-
-
Save mrdwab/6424112 to your computer and use it in GitHub Desktop.
stratified <- function(df, group, size, select = NULL, | |
replace = FALSE, bothSets = FALSE) { | |
if (is.null(select)) { | |
df <- df | |
} else { | |
if (is.null(names(select))) stop("'select' must be a named list") | |
if (!all(names(select) %in% names(df))) | |
stop("Please verify your 'select' argument") | |
temp <- sapply(names(select), | |
function(x) df[[x]] %in% select[[x]]) | |
df <- df[rowSums(temp) == length(select), ] | |
} | |
df.interaction <- interaction(df[group], drop = TRUE) | |
df.table <- table(df.interaction) | |
df.split <- split(df, df.interaction) | |
if (length(size) > 1) { | |
if (length(size) != length(df.split)) | |
stop("Number of groups is ", length(df.split), | |
" but number of sizes supplied is ", length(size)) | |
if (is.null(names(size))) { | |
n <- setNames(size, names(df.split)) | |
message(sQuote("size"), " vector entered as:\n\nsize = structure(c(", | |
paste(n, collapse = ", "), "),\n.Names = c(", | |
paste(shQuote(names(n)), collapse = ", "), ")) \n\n") | |
} else { | |
ifelse(all(names(size) %in% names(df.split)), | |
n <- size[names(df.split)], | |
stop("Named vector supplied with names ", | |
paste(names(size), collapse = ", "), | |
"\n but the names for the group levels are ", | |
paste(names(df.split), collapse = ", "))) | |
} | |
} else if (size < 1) { | |
n <- round(df.table * size, digits = 0) | |
} else if (size >= 1) { | |
if (all(df.table >= size) || isTRUE(replace)) { | |
n <- setNames(rep(size, length.out = length(df.split)), | |
names(df.split)) | |
} else { | |
message( | |
"Some groups\n---", | |
paste(names(df.table[df.table < size]), collapse = ", "), | |
"---\ncontain fewer observations", | |
" than desired number of samples.\n", | |
"All observations have been returned from those groups.") | |
n <- c(sapply(df.table[df.table >= size], function(x) x = size), | |
df.table[df.table < size]) | |
} | |
} | |
temp <- lapply( | |
names(df.split), | |
function(x) df.split[[x]][sample(df.table[x], | |
n[x], replace = replace), ]) | |
set1 <- do.call("rbind", temp) | |
if (isTRUE(bothSets)) { | |
set2 <- df[!rownames(df) %in% rownames(set1), ] | |
list(SET1 = set1, SET2 = set2) | |
} else { | |
set1 | |
} | |
} |
Here are some examples:
library(devtools)
source_gist("https://gist.github.com/mrdwab/6424112")
## [1] "https://raw.github.com/gist/6424112"
## SHA-1 hash of file is 0006d8548785ec8a5651c3dd599648cc88d153a4
# Generate a couple of sample data.frames to play with
set.seed(1)
dat1 <- data.frame(ID = 1:100, A = sample(c("AA", "BB", "CC", "DD", "EE"), 100,
replace = TRUE), B = rnorm(100), C = abs(round(rnorm(100), digits = 1)),
D = sample(c("CA", "NY", "TX"), 100, replace = TRUE), E = sample(c("M",
"F"), 100, replace = TRUE))
# What do the data look like in general?
summary(dat1)
## ID A B C D E
## Min. : 1.0 AA:13 Min. :-1.9144 Min. :0.000 CA:23 F:54
## 1st Qu.: 25.8 BB:25 1st Qu.:-0.6141 1st Qu.:0.300 NY:42 M:46
## Median : 50.5 CC:19 Median :-0.1176 Median :0.650 TX:35
## Mean : 50.5 DD:26 Mean :-0.0176 Mean :0.825
## 3rd Qu.: 75.2 EE:17 3rd Qu.: 0.5382 3rd Qu.:1.200
## Max. :100.0 Max. : 2.4016 Max. :2.900
# Let's take a 10% sample from all -A- groups in dat1
stratified(dat1, "A", 0.1)
## ID A B C D E
## 92 92 AA 1.1766 1.0 CA F
## 86 86 BB -1.5364 0.1 NY F
## 74 74 BB -0.1796 0.3 CA F
## 60 60 CC 1.6822 0.5 TX F
## 58 58 CC 0.9102 0.5 CA F
## 9 9 DD 0.5697 1.4 TX F
## 42 42 DD 1.2079 0.4 TX M
## 46 46 DD 0.5585 1.0 CA M
## 99 99 EE -1.2863 0.2 NY F
## 72 72 EE 1.3430 0.0 NY F
# Let's take a 10% sample from only 'AA' and 'BB' groups from -A- in dat1
stratified(dat1, "A", 0.1, select = list(A = c("AA", "BB")))
## ID A B C D E
## 69 69 AA 0.4942 0.6 CA M
## 54 54 BB 0.1580 0.3 TX M
## 5 5 BB 1.4330 1.5 NY F
# Let's take 5 samples from all -D- groups in dat1, specified by column
# number
stratified(dat1, group = 5, size = 5)
## ID A B C D E
## 73 73 BB -0.2146 0.6 CA F
## 45 45 CC 1.5868 1.2 CA F
## 87 87 DD -0.3010 0.6 CA M
## 13 13 DD 0.6897 1.1 CA F
## 24 24 AA -0.9341 0.1 CA M
## 40 40 CC 0.2671 0.9 NY F
## 83 83 BB 0.5315 0.6 NY M
## 1 1 BB 0.3981 0.5 NY F
## 56 56 AA 1.7673 2.5 NY M
## 88 88 AA -0.5283 1.2 NY M
## 66 66 BB -0.3928 1.5 TX F
## 22 22 BB -0.7099 0.1 TX M
## 79 79 DD -0.6817 0.2 TX M
## 27 27 AA -0.4433 0.8 TX M
## 51 51 CC -0.6204 0.4 TX F
# Let's take a sample from all -A- groups in dat1, where we specify the
# number wanted from each group
stratified(dat1, "A", size = c(3, 5, 4, 5, 2))
## 'size' vector entered as:
##
## size = structure(c(3, 5, 4, 5, 2), .Names = c("AA", "BB", "CC", "DD",
## "EE"))
## ID A B C D E
## 38 38 AA -0.3042 0.8 NY M
## 10 10 AA -0.1351 1.9 NY M
## 56 56 AA 1.7673 2.5 NY M
## 62 62 BB -0.4616 0.4 CA F
## 74 74 BB -0.1796 0.3 CA F
## 2 2 BB -0.6120 0.0 TX F
## 66 66 BB -0.3928 1.5 TX F
## 22 22 BB -0.7099 0.1 TX M
## 40 40 CC 0.2671 0.9 NY F
## 67 67 CC -0.3200 0.3 CA F
## 16 16 CC 0.1888 2.2 CA M
## 3 3 CC 0.3411 0.3 NY M
## 39 39 DD 0.3700 0.4 CA F
## 49 49 DD -1.2246 0.4 NY F
## 50 50 DD -0.4734 0.4 TX F
## 23 23 DD 0.6107 0.5 NY F
## 17 17 DD -1.8050 0.3 NY M
## 35 35 EE 0.5939 0.5 CA M
## 41 41 EE -0.5425 0.2 NY F
# Use a two-column strata: -E- and -D- -E- varies more slowly, so it is
# better to put that first
stratified(dat1, c("E", "D"), size = 0.15)
## ID A B C D E
## 84 84 BB -1.518394 0.6 CA F
## 34 34 AA -1.523567 1.5 CA F
## 53 53 CC -0.910922 1.6 CA M
## 36 36 DD 0.332950 0.2 CA M
## 97 97 CC 2.087167 0.5 NY F
## 49 49 DD -1.224613 0.4 NY F
## 5 5 BB 1.433024 1.5 NY F
## 37 37 DD 1.063100 1.5 NY M
## 17 17 DD -1.804959 0.3 NY M
## 33 33 CC 1.178087 0.2 NY M
## 14 14 BB 0.028002 0.9 TX F
## 90 90 AA -0.056897 0.0 TX F
## 28 28 BB 0.001105 2.1 TX F
## 80 80 EE -0.324270 0.3 TX M
## 22 22 BB -0.709946 0.1 TX M
# Use a two-column strata (-E- and -D-) but only interested in cases where
# -E- == 'M'
stratified(dat1, c("E", "D"), 0.15, select = list(E = "M"))
## ID A B C D E
## 69 69 AA 0.49419 0.6 CA M
## 32 32 CC -0.13518 1.0 CA M
## 12 12 AA -0.03924 0.2 NY M
## 100 100 DD -1.64061 1.0 NY M
## 56 56 AA 1.76729 2.5 NY M
## 79 79 DD -0.68166 0.2 TX M
## 80 80 EE -0.32427 0.3 TX M
## As above, but where -E- == 'M' and -D- == 'CA' or 'TX'
stratified(dat1, c("E", "D"), 0.15, select = list(E = "M", D = c("CA", "TX")))
## ID A B C D E
## 53 53 CC -0.9109 1.6 CA M
## 36 36 DD 0.3330 0.2 CA M
## 80 80 EE -0.3243 0.3 TX M
## 8 8 DD -1.0441 0.6 TX M
# Use a three-column strata: -E-, -D-, and -A-
s.out <- stratified(dat1, c("E", "D", "A"), size = 2)
## Some groups ---M.CA.BB, M.CA.EE--- contain fewer observations than desired
## number of samples. All observations have been returned from those groups.
list(head(s.out), tail(s.out))
## [[1]]
## ID A B C D E
## 92 92 AA 1.1766 1.0 CA F
## 34 34 AA -1.5236 1.5 CA F
## 24 24 AA -0.9341 0.1 CA M
## 69 69 AA 0.4942 0.6 CA M
## 56 56 AA 1.7673 2.5 NY M
## 10 10 AA -0.1351 1.9 NY M
##
## [[2]]
## ID A B C D E
## 61 61 EE -0.63574 0.2 NY M
## 18 18 EE 1.46555 1.4 NY M
## 21 21 EE 0.47551 2.3 TX F
## 70 70 EE -0.17733 0.0 TX F
## 80 80 EE -0.32427 0.3 TX M
## 77 77 EE -0.07356 0.3 TX M
# How many samples were taken from each strata?
table(interaction(s.out[c("E", "D", "A")]))
##
## F.CA.AA M.CA.AA F.NY.AA M.NY.AA F.TX.AA M.TX.AA F.CA.BB M.CA.BB F.NY.BB
## 2 2 0 2 2 2 2 1 2
## M.NY.BB F.TX.BB M.TX.BB F.CA.CC M.CA.CC F.NY.CC M.NY.CC F.TX.CC M.TX.CC
## 2 2 2 2 2 2 2 2 0
## F.CA.DD M.CA.DD F.NY.DD M.NY.DD F.TX.DD M.TX.DD F.CA.EE M.CA.EE F.NY.EE
## 2 2 2 2 2 2 0 1 2
## M.NY.EE F.TX.EE M.TX.EE
## 2 2 2
# Can we verify the message about group sizes?
names(which(table(interaction(dat1[c("E", "D", "A")])) < 2))
## [1] "F.NY.AA" "M.CA.BB" "M.TX.CC" "F.CA.EE" "M.CA.EE"
names(which(table(interaction(s.out[c("E", "D", "A")])) < 2))
## [1] "F.NY.AA" "M.CA.BB" "M.TX.CC" "F.CA.EE" "M.CA.EE"
This is exactly what I was looking for. Thanks so much!
@mattsigal, No problem--glad it helped. Also, check out the data.table
version if you need something speedier.
Thanks. This is great!
Dearr mrdwab
Hope this find you well,
really it great code,
and i want to ask you question please about
stacked regression as it mention in article of Brieman
http://statistics.berkeley.edu/sites/default/files/tech-reports/367.pdf
have you any code for that, please.
This is genius. Thanks!
Thank you for posting this! It's just what I needed! I am using this function to prepare some data for my research. Do you have a preferred citation so I can cite your function in my paper?
Hi there! Great function! I am using it to randomly select a subset of species observations so that we can verify the species identification of a randomly selected subset. So, I can easily stratify by species. However, if I want the observations to also be stratified by transect so that if possible, the species checked are from different transects, it becomes more complicated. I tried: size=c("TransectName"=1,"SpeciesName"=5)
to choose 5 observations of each species, each from a different transect, but this didn't work.
Error message: (Error in stratified(df, group = c("SpeciesName", "TransectName"), size = c(TransectName = 1, : Number of groups is 508 but number of sizes supplied is 2)
It gets more complicated, because if it isn't possible to get 5 observations from different transects, they can come from the same transect.
Any ideas on how I would accomplish this?
If not, I'll drop the stratification by transect.
Thanks!!
Great function! I have a case in which I am not sure it could be used.
Let's say I have a dataset x and a dataset y. Dataset x contains N observations and dataset y contains M observations, where N>M. Both datasets contain the same variables k. If I want to make from dataset x a representative sample of dataset y.
Is it possible to use the function and to specify the proportions of dataset y for the category vars of: 1) Size and 2) Sector?
stratified(Dataset x, c("Size", "Sector"), ...)
Thanks very much!
Great Function! @mrdwab Could you please provide a official citation guide to cite your function/package? Thank you!
thank you!
Great Function !!!!!......Thanks a lot !!
Great. Thank you very much for that.
awesome!
Hi, Thank you for the amazing code. But i have a query regarding using multiple columns to create strata.
Here you have shown one example "stratified(dat1, c("E", "D"), size = 0.15)" where both "E" and "D" are categorical columns. I was wondering if we can use multiple numerical columns. Please guide me for the same.
Basically your code : stratified(dat1, c("B", "C"), size = 0.15) should return some output.
Thanks in advance.
Hi, I tried to load the function using the following commands:
library(devtools)
source_gist("https://gist.github.com/mrdwab/6424112")
But, I got the following error:
Error in r_files[[which]] : invalid subscript type 'closure'
Really appreciate your help to fix this. This is exactly the function that I have been looking for and desperately need to use it.
Wow this is exactly what I need! Thank you so much!
By the way, is there a way to apply population weights for the sampling?
Thanks so much for this code, it works perfectly.
Hi there Ananda,
How do I make attribution to your article?
Such as citing the material. This is top stuff, indeed.
The arguments to
stratified
are:df
: The inputdata.frame
group
: A character vector of the column or columns that make up the "strata".size
: The desired sample size.size
is a value less than 1, a proportionate sample is taken from each stratum.size
is a single integer of 1 or more, that number of samples is taken from each stratum.size
is a vector of integers, the specified number of samples is taken for each stratum. It is recommended that you use a named vector. For example, if you have two strata, "A" and "B", and you wanted 5 samples from "A" and 10 from "B", you would entersize = c(A = 5, B = 10)
.select
: This allows you to subset the groups in the sampling process. This is alist
. For instance, if yourgroup
variable was "Group", and it contained three strata, "A", "B", and "C", but you only wanted to sample from "A" and "C", you can useselect = list(Group = c("A", "C"))
.replace
: For sampling with replacement.