bestdan · February 12, 2021 15:28
diff --git a/medians_vs_logmeans.R b/medians_vs_logmeans.R
 #' @title Log-means versus Medians
 #' @author Daniel Egan
 #' @description When data has a power law or extremely skewed distribution,
 #' using a log-mean usually results in more stable and useful central estimates
 #' compared to a mean or a median.
 #' https://towardsdatascience.com/on-average-youre-using-the-wrong-average-geometric-harmonic-means-in-data-analysis-2a703e21ea0

 library(dplyr)
 library(tidyr)
 library(ggplot2)

 # Simulated heavy-tailed population: the base-10 exponents are drawn from a
 # normal distribution (mean 3, sd 3) and then 10 is raised to them.
 rdata <- 10^rnorm(n = 30000, mean = 3, sd = 3)

 # Number of resamples to draw from the population.
 n_samples <- 200

 # One row per resample; the three statistic columns start out as NA.
 means_df <- data.frame(
   sample_id = seq_len(n_samples),
   sample_median = rep(NA_real_, n_samples),
   sample_mean = rep(NA_real_, n_samples),
   sample_log_mean = rep(NA_real_, n_samples)
 )

 #' Base-10 log-mean: average the base-10 logarithms of `x`, then raise 10 to
 #' that average.
 #' @param x Numeric vector passed to `log10()`.
 #' @return A single numeric value, `10^mean(log10(x))`.
 log10mean <- function(x) {
   mean_exponent <- mean(log10(x))
   10^mean_exponent
 }
 # log10mean(rdata[1:10])


 # Draw `n_samples` resamples of 200 observations each from `rdata` and record
 # the median, arithmetic mean and base-10 log-mean of every resample.
 # `seq_len()` replaces `1:n_samples` so that a count of 0 skips the loop
 # instead of iterating over c(1, 0).
 for (i in seq_len(n_samples)) {
   this_sample <- sample(rdata, size = 200)
   means_df$sample_median[i] <- median(this_sample)
   means_df$sample_mean[i] <- mean(this_sample)
   means_df$sample_log_mean[i] <- log10mean(this_sample)
 }


 # Density of each per-sample statistic, one panel per statistic with
 # independent ("free") axis scales.
 means_df %>%
   pivot_longer(
     cols = c(sample_median, sample_mean, sample_log_mean),
     names_to = "type"
   ) %>%
   ggplot(mapping = aes(x = value, group = type)) +
   geom_density() +
   facet_wrap(vars(type), scales = "free") +
   labs(
     title = "Sample means using different methods",
     subtitle = "Note fewer outliers/ more normal distribution for median & log-mean."
   )


 #' Suppose we used each method and looked at the out-of-sample error,
 #' i.e. the RMSE of each estimate compared to another.
 #'
 #' Root-mean-square of a vector of errors.
 #' @param x Numeric vector of errors (e.g. estimate minus target).
 #' @return A single numeric value, `sqrt(mean(x^2))`.
 rmse <- function(x) {
   # Square before averaging: the square root of the plain mean is not an RMSE
   # and is NaN whenever the mean error is negative.
   sqrt(mean(x^2))
 }

 # Absolute difference between every sample mean and every per-sample estimate
 # of each type (all pairings, via `outer()`). The column is called `ratio`,
 # but it holds absolute differences, not ratios.
 median_ratio <- data.frame(
   type = "median",
   ratio = abs(as.vector(
     outer(X = means_df$sample_mean, Y = means_df$sample_median, FUN = "-")
   ))
 )
 mean_ratio <- data.frame(
   type = "mean",
   ratio = abs(as.vector(
     outer(X = means_df$sample_mean, Y = means_df$sample_mean, FUN = "-")
   ))
 )
 logmean_ratio <- data.frame(
   type = "logmean",
   ratio = abs(as.vector(
     outer(X = means_df$sample_mean, Y = means_df$sample_log_mean, FUN = "-")
   ))
 )

 # Normal QQ plot of the absolute differences, one panel per estimate type.
 ratios_df <- rbind(median_ratio, mean_ratio, logmean_ratio)
 ratios_df %>%
   ggplot(aes(sample = ratio, group = type)) +
   # Draw the sample quantiles themselves; the reference line alone shows no data.
   geom_qq() +
   geom_qq_line() +
   # `scales` was previously left as an empty argument (falling back to fixed
   # scales); use free scales, matching the density plot of the same statistics.
   facet_wrap(. ~ type, scales = "free") +
   labs(
     title = "Distribution of absolute errors",
     subtitle = "Note fewer outliers/ more normal distribution for median & log-mean."
   )

 #' Log-means vs medians?
 # Scatter of per-sample log-mean against per-sample median with a y = x
 # reference line; the visible region is limited to [0, 5000] on both axes.
 means_df %>%
   ggplot(aes(x = sample_median, y = sample_log_mean)) +
   geom_point() +
   geom_abline(slope = 1, intercept = 0) +
   # The trailing `+` is required: without it `labs()` is evaluated as a
   # separate statement and the labels never reach the plot.
   coord_cartesian(xlim = c(0, 5000), ylim = c(0, 5000)) +
   # Labels describe this scatter plot (they were previously a copy of the
   # QQ-plot labels).
   labs(
     title = "Sample log-mean versus sample median",
     subtitle = "Points on the diagonal have equal median and log-mean."
   )



 #' What about with data that aren't perfect base-10 generated dude?
	#' @title Log-means versus Medians
	#' @author Daniel Egan
	#' @description When data has a power law or extremely skewed distribution,
	#' using a log-mean usually results in more stable and useful central estimates
	#' compared to a mean or a median.
	#' https://towardsdatascience.com/on-average-youre-using-the-wrong-average-geometric-harmonic-means-in-data-analysis-2a703e21ea0

	library(dplyr)
	library(tidyr)
	library(ggplot2)

	# Simulated heavy-tailed population: the base-10 exponents are drawn from a
	# normal distribution (mean 3, sd 3) and then 10 is raised to them.
	rdata <- 10^rnorm(n = 30000, mean = 3, sd = 3)

	# Number of resamples to draw from the population.
	n_samples <- 200

	# One row per resample; the three statistic columns start out as NA.
	means_df <- data.frame(
	  sample_id = seq_len(n_samples),
	  sample_median = rep(NA_real_, n_samples),
	  sample_mean = rep(NA_real_, n_samples),
	  sample_log_mean = rep(NA_real_, n_samples)
	)

	#' Base-10 log-mean: average the base-10 logarithms of `x`, then raise 10 to
	#' that average.
	#' @param x Numeric vector passed to `log10()`.
	#' @return A single numeric value, `10^mean(log10(x))`.
	log10mean <- function(x) {
	  mean_exponent <- mean(log10(x))
	  10^mean_exponent
	}
	# log10mean(rdata[1:10])


	# Draw `n_samples` resamples of 200 observations each from `rdata` and record
	# the median, arithmetic mean and base-10 log-mean of every resample.
	# `seq_len()` replaces `1:n_samples` so that a count of 0 skips the loop
	# instead of iterating over c(1, 0).
	for (i in seq_len(n_samples)) {
	  this_sample <- sample(rdata, size = 200)
	  means_df$sample_median[i] <- median(this_sample)
	  means_df$sample_mean[i] <- mean(this_sample)
	  means_df$sample_log_mean[i] <- log10mean(this_sample)
	}


	# Density of each per-sample statistic, one panel per statistic with
	# independent ("free") axis scales.
	means_df %>%
	  pivot_longer(
	    cols = c(sample_median, sample_mean, sample_log_mean),
	    names_to = "type"
	  ) %>%
	  ggplot(mapping = aes(x = value, group = type)) +
	  geom_density() +
	  facet_wrap(vars(type), scales = "free") +
	  labs(
	    title = "Sample means using different methods",
	    subtitle = "Note fewer outliers/ more normal distribution for median & log-mean."
	  )


	#' Suppose we used each method and looked at the out-of-sample error,
	#' i.e. the RMSE of each estimate compared to another.
	#'
	#' Root-mean-square of a vector of errors.
	#' @param x Numeric vector of errors (e.g. estimate minus target).
	#' @return A single numeric value, `sqrt(mean(x^2))`.
	rmse <- function(x) {
	  # Square before averaging: the square root of the plain mean is not an RMSE
	  # and is NaN whenever the mean error is negative.
	  sqrt(mean(x^2))
	}

	# Absolute difference between every sample mean and every per-sample estimate
	# of each type (all pairings, via `outer()`). The column is called `ratio`,
	# but it holds absolute differences, not ratios.
	median_ratio <- data.frame(
	  type = "median",
	  ratio = abs(as.vector(
	    outer(X = means_df$sample_mean, Y = means_df$sample_median, FUN = "-")
	  ))
	)
	mean_ratio <- data.frame(
	  type = "mean",
	  ratio = abs(as.vector(
	    outer(X = means_df$sample_mean, Y = means_df$sample_mean, FUN = "-")
	  ))
	)
	logmean_ratio <- data.frame(
	  type = "logmean",
	  ratio = abs(as.vector(
	    outer(X = means_df$sample_mean, Y = means_df$sample_log_mean, FUN = "-")
	  ))
	)

	# Normal QQ plot of the absolute differences, one panel per estimate type.
	ratios_df <- rbind(median_ratio, mean_ratio, logmean_ratio)
	ratios_df %>%
	  ggplot(aes(sample = ratio, group = type)) +
	  # Draw the sample quantiles themselves; the reference line alone shows no data.
	  geom_qq() +
	  geom_qq_line() +
	  # `scales` was previously left as an empty argument (falling back to fixed
	  # scales); use free scales, matching the density plot of the same statistics.
	  facet_wrap(. ~ type, scales = "free") +
	  labs(
	    title = "Distribution of absolute errors",
	    subtitle = "Note fewer outliers/ more normal distribution for median & log-mean."
	  )

	#' Log-means vs medians?
	# Scatter of per-sample log-mean against per-sample median with a y = x
	# reference line; the visible region is limited to [0, 5000] on both axes.
	means_df %>%
	  ggplot(aes(x = sample_median, y = sample_log_mean)) +
	  geom_point() +
	  geom_abline(slope = 1, intercept = 0) +
	  # The trailing `+` is required: without it `labs()` is evaluated as a
	  # separate statement and the labels never reach the plot.
	  coord_cartesian(xlim = c(0, 5000), ylim = c(0, 5000)) +
	  # Labels describe this scatter plot (they were previously a copy of the
	  # QQ-plot labels).
	  labs(
	    title = "Sample log-mean versus sample median",
	    subtitle = "Points on the diagonal have equal median and log-mean."
	  )



	#' What about with data that aren't perfect base-10 generated dude?