leeper · October 25, 2016 10:58 · leeper · Oct 25, 2016
diff --git a/oversampling.R b/oversampling.R
 # uses dev version of 'waffle'
 # devtools::install_github("leeper/waffle@patch-1")
 library("waffle")
 library("extrafont")

 # population
 set.seed(1)
 N <- 900L
 p <- c("Small Group 1" = 30, "Big Group 1" = 420, "Small Group 2" = 30,"Big Group 2" = 420)
 glyph <- c("male", "female")[sample(1:2, N, TRUE)]
 w <- waffle(p, rows = 30, flip = TRUE, use_glyph = glyph, glyph_size = 4)

 # SRS
 w1 <- w + annotate("rect", xmin = 0.5, xmax = 30.4, ymin = 0.4, ymax = 30.4, col = "black", fill = NA, lwd = 1) +
 # sample
 set.seed(123)
 n <- 50L
 xs <- sample(1:30, n, TRUE)
 ys <- sample(1:30, n, TRUE)
 # plot sampled units
 for (i in seq_len(n)) {
    w1 <- w1 + annotate("rect", xmin = xs[i]-0.5, xmax = xs[i]+0.4, ymin = ys[i]-0.6, ymax = ys[i]+0.4, col = "black", alpha=0.5, lwd = 0.5)
 }
 # save
 w1 + ggtitle("Sampling n=50\nfrom Population of N=900") + 
     xlab("Respondents from 'Small' Groups = 2, each of whom hugely influences results\nProbability of Being Sampled = 50/900 = 0.56 (All respondents weighted equally)")
 ggsave("srs.png", width = 8, height = 8)


 # Stratified Sampling
 w2 <- w
 w2 <- w2 + annotate("rect", xmin = 0.5, xmax = 30.4, ymin = 0.4, ymax = 1.4, col = "black", fill = NA, lwd = 1) +
           annotate("rect", xmin = 0.5, xmax = 30.4, ymin = 0.4, ymax = 15.4, col = "black", fill = NA, lwd = 1) +
           annotate("rect", xmin = 0.5, xmax = 30.4, ymin = 15.4, ymax = 16.4, col = "black", fill = NA, lwd = 1) +
           annotate("rect", xmin = 0.5, xmax = 30.4, ymin = 0.4, ymax = 30.4, col = "black", fill = NA, lwd = 1)
 # sample
 set.seed(456)
 n1 <- 10L
 n2 <- 15L
 n3 <- 10L
 n4 <- 15L
 xs <- c(sample(1:30, n1, FALSE), sample(1:30, n2, TRUE), sample(1:30, n3, FALSE), sample(1:30, n4, TRUE))
 ys <- c(rep(1, n1), sample(2:15, n2, TRUE), rep(16, n3), sample(17:30, n4, TRUE))

 # plot sampled units
 for (i in seq_len(sum(n1,n2,n3,n4))) {
    w2 <- w2 + annotate("rect", xmin = xs[i]-0.5, xmax = xs[i]+0.4, ymin = ys[i]-0.6, ymax = ys[i]+0.4, col = "black", alpha=0.5, lwd = 0.5)
 }
 # save
 w2 + ggtitle("Sampling n=50, oversampling 'small' groups\nfrom Population of N=900") +
     xlab("Respondents from 'Small' Groups = 20\nProbability of Being Sampled (Big) = 30/840 = 0.036; so weighted 1.56\nProbability of Being Sampled (Small) = 20/60 = 0.17; so weighted 0.12")
 ggsave("stratified.png", width = 8, height = 8)

 # weight for "big" group for n equivalent to 50
 wb <- (840/900)/(30/50)
 # weight for "small" group for n equivalent to 50
 ws <- (60/900)/(20/50)
 (wb*30) + (ws*20)

 # ratio of big/small groups in sample matches ratio of big/small groups in population
 840/60
 (wb*30)/(ws*20)
	# uses dev version of 'waffle'
	# devtools::install_github("leeper/waffle@patch-1")
	library("waffle")
	library("extrafont")

	# population
	set.seed(1)
	N <- 900L
	p <- c("Small Group 1" = 30, "Big Group 1" = 420, "Small Group 2" = 30,"Big Group 2" = 420)
	glyph <- c("male", "female")[sample(1:2, N, TRUE)]
	w <- waffle(p, rows = 30, flip = TRUE, use_glyph = glyph, glyph_size = 4)

	# SRS
	w1 <- w + annotate("rect", xmin = 0.5, xmax = 30.4, ymin = 0.4, ymax = 30.4, col = "black", fill = NA, lwd = 1) +
	# sample
	set.seed(123)
	n <- 50L
	xs <- sample(1:30, n, TRUE)
	ys <- sample(1:30, n, TRUE)
	# plot sampled units
	for (i in seq_len(n)) {
	w1 <- w1 + annotate("rect", xmin = xs[i]-0.5, xmax = xs[i]+0.4, ymin = ys[i]-0.6, ymax = ys[i]+0.4, col = "black", alpha=0.5, lwd = 0.5)
	}
	# save
	w1 + ggtitle("Sampling n=50\nfrom Population of N=900") +
	xlab("Respondents from 'Small' Groups = 2, each of whom hugely influences results\nProbability of Being Sampled = 50/900 = 0.56 (All respondents weighted equally)")
	ggsave("srs.png", width = 8, height = 8)


	# Stratified Sampling
	w2 <- w
	w2 <- w2 + annotate("rect", xmin = 0.5, xmax = 30.4, ymin = 0.4, ymax = 1.4, col = "black", fill = NA, lwd = 1) +
	annotate("rect", xmin = 0.5, xmax = 30.4, ymin = 0.4, ymax = 15.4, col = "black", fill = NA, lwd = 1) +
	annotate("rect", xmin = 0.5, xmax = 30.4, ymin = 15.4, ymax = 16.4, col = "black", fill = NA, lwd = 1) +
	annotate("rect", xmin = 0.5, xmax = 30.4, ymin = 0.4, ymax = 30.4, col = "black", fill = NA, lwd = 1)
	# sample
	set.seed(456)
	n1 <- 10L
	n2 <- 15L
	n3 <- 10L
	n4 <- 15L
	xs <- c(sample(1:30, n1, FALSE), sample(1:30, n2, TRUE), sample(1:30, n3, FALSE), sample(1:30, n4, TRUE))
	ys <- c(rep(1, n1), sample(2:15, n2, TRUE), rep(16, n3), sample(17:30, n4, TRUE))

	# plot sampled units
	for (i in seq_len(sum(n1,n2,n3,n4))) {
	w2 <- w2 + annotate("rect", xmin = xs[i]-0.5, xmax = xs[i]+0.4, ymin = ys[i]-0.6, ymax = ys[i]+0.4, col = "black", alpha=0.5, lwd = 0.5)
	}
	# save
	w2 + ggtitle("Sampling n=50, oversampling 'small' groups\nfrom Population of N=900") +
	xlab("Respondents from 'Small' Groups = 20\nProbability of Being Sampled (Big) = 30/840 = 0.036; so weighted 1.56\nProbability of Being Sampled (Small) = 20/60 = 0.17; so weighted 0.12")
	ggsave("stratified.png", width = 8, height = 8)

	# weight for "big" group for n equivalent to 50
	wb <- (840/900)/(30/50)
	# weight for "small" group for n equivalent to 50
	ws <- (60/900)/(20/50)
	(wb30) + (ws20)

	# ratio of big/small groups in sample matches ratio of big/small groups in population
	840/60
	(wb30)/(ws20)