bschneidr · February 25, 2021 20:26
diff --git a/visualizing-ratio-greg-estimator.r b/visualizing-ratio-greg-estimator.r
 library(magrittr)
 library(tidyverse)
 library(readr)
 library(readxl)
 library(schneidr)

 theme_set(theme_schneidr(base_font_family = 'sans',
                         titles_font_family = 'sans'))

 # Generate population

 x <- rnorm(n = 50, mean = 5, sd = 1)
 y <- 2*x + rnorm(n = 50, mean = 0, sd = 1)

 r_squared <- summary(lm(y ~ -1 + x))$adj.r.squared

 Y_total <- sum(y)
 X_total <- sum(x)
 Y_mean <- mean(y)
 X_mean <- mean(x)

 # Generate samples

 S <- combn(x = 1:50, m = 3, simplify = FALSE)

 sample_estimates <- lapply(S, function(s) {
  c('xbar' = mean(x[s]),
    'ybar' = mean(y[s]))
 })

 sample_estimates <- do.call(rbind, sample_estimates)
 sample_estimates <- as.data.frame(sample_estimates)

 sample_estimates <- sample_estimates %>%
  mutate(b = ybar/xbar,
         y_ratio = X_total * b,
         ybar_ratio = X_mean * b)

 # Plot the results

 plot_data <- sample_estimates %>%
  sample_n(size = 30) %>%
  mutate(sample_index = row_number()) %>%
  select(sample_index, xbar, ybar, ybar_ratio) %>%
  mutate(which_better = case_when(abs(ybar - Y_mean) > abs(ybar_ratio - Y_mean) ~ 'ratio',
                                  abs(ybar - Y_mean) < abs(ybar_ratio - Y_mean) ~ 'mean')) %>%
  tidyr::gather(key = 'estimator', value = 'estimate',
                matches('ybar'))

 plot_data %>%
  mutate(xbar = case_when(estimator == "ybar_ratio" ~ X_mean,
                          TRUE ~ xbar)) %>%
  arrange(estimator) %>%
  ggplot(aes(x = xbar, y = estimate)) +
  geom_hline(yintercept = Y_mean) +
  annotate('text', y = 10, x = Y_mean, vjust = -0.5,
           label = "True mean of Y") +
  geom_vline(xintercept = X_mean) +
  annotate('text', y = 20, x = X_mean, hjust = 1,
           label = "True mean of X ") +
  geom_point(data = plot_data %>%
               filter(estimator == "ybar"),
             aes(x = xbar, y = estimate, group = sample_index)) +
  geom_path(aes(group = sample_index,
                color = which_better),
            arrow = arrow(length = unit(0.075, 'inches'))) +
  geom_abline(intercept = 0, slope = Y_mean/X_mean) +
  annotate('text', x = 10, y = 10 * (Y_mean/X_mean), hjust = -0.25,
           label = "True ratio of means") +
  scale_x_continuous(limits = c(0, 15)) +
  scale_y_continuous(limits = c(0, 25)) +
  labs(
    y = 'estimate of\nY-bar',
    x = 'x-bar',
    title = "Simple means versus ratio (GREG) estimates",
    subtitle = "True means (vertical and horizontal lines)\nSample means (points)\ncorresponding GREG estimate given by arrow"
  )
	library(magrittr)
	library(tidyverse)
	library(readr)
	library(readxl)
	library(schneidr)

	theme_set(theme_schneidr(base_font_family = 'sans',
	titles_font_family = 'sans'))

	# Generate population

	x <- rnorm(n = 50, mean = 5, sd = 1)
	y <- 2*x + rnorm(n = 50, mean = 0, sd = 1)

	r_squared <- summary(lm(y ~ -1 + x))$adj.r.squared

	Y_total <- sum(y)
	X_total <- sum(x)
	Y_mean <- mean(y)
	X_mean <- mean(x)

	# Generate samples

	S <- combn(x = 1:50, m = 3, simplify = FALSE)

	sample_estimates <- lapply(S, function(s) {
	c('xbar' = mean(x[s]),
	'ybar' = mean(y[s]))
	})

	sample_estimates <- do.call(rbind, sample_estimates)
	sample_estimates <- as.data.frame(sample_estimates)

	sample_estimates <- sample_estimates %>%
	mutate(b = ybar/xbar,
	y_ratio = X_total * b,
	ybar_ratio = X_mean * b)

	# Plot the results

	plot_data <- sample_estimates %>%
	sample_n(size = 30) %>%
	mutate(sample_index = row_number()) %>%
	select(sample_index, xbar, ybar, ybar_ratio) %>%
	mutate(which_better = case_when(abs(ybar - Y_mean) > abs(ybar_ratio - Y_mean) ~ 'ratio',
	abs(ybar - Y_mean) < abs(ybar_ratio - Y_mean) ~ 'mean')) %>%
	tidyr::gather(key = 'estimator', value = 'estimate',
	matches('ybar'))

	plot_data %>%
	mutate(xbar = case_when(estimator == "ybar_ratio" ~ X_mean,
	TRUE ~ xbar)) %>%
	arrange(estimator) %>%
	ggplot(aes(x = xbar, y = estimate)) +
	geom_hline(yintercept = Y_mean) +
	annotate('text', y = 10, x = Y_mean, vjust = -0.5,
	label = "True mean of Y") +
	geom_vline(xintercept = X_mean) +
	annotate('text', y = 20, x = X_mean, hjust = 1,
	label = "True mean of X ") +
	geom_point(data = plot_data %>%
	filter(estimator == "ybar"),
	aes(x = xbar, y = estimate, group = sample_index)) +
	geom_path(aes(group = sample_index,
	color = which_better),
	arrow = arrow(length = unit(0.075, 'inches'))) +
	geom_abline(intercept = 0, slope = Y_mean/X_mean) +
	annotate('text', x = 10, y = 10 * (Y_mean/X_mean), hjust = -0.25,
	label = "True ratio of means") +
	scale_x_continuous(limits = c(0, 15)) +
	scale_y_continuous(limits = c(0, 25)) +
	labs(
	y = 'estimate of\nY-bar',
	x = 'x-bar',
	title = "Simple means versus ratio (GREG) estimates",
	subtitle = "True means (vertical and horizontal lines)\nSample means (points)\ncorresponding GREG estimate given by arrow"
	)