FlukeAndFeather · August 3, 2021 17:43
diff --git a/datasaurus_example.R b/datasaurus_example.R
 # Install datasauRus from GitHub only when it is not already available, so
 # re-running this script does not reinstall the package (or need network
 # access) every time.
 # If you don't have devtools, install that first. If you have Windows and
 # R version 4.0 (or greater), see:
 # https://cran.r-project.org/bin/windows/Rtools/
 if (!requireNamespace("datasauRus", quietly = TRUE)) {
   devtools::install_github("lockedata/datasauRus")
 }

 # Load the datasaurus dozen and our favorite suite of data processing tools
 library(datasauRus)
 library(tidyverse)

 # Quick look at the datasaurus_dozen data frame: a summary of its columns
 # first, then the distinct values found in its `dataset` column
 datasaurus_dozen %>% summary()
 unique(datasaurus_dozen[["dataset"]])

 # Scatter plots of the "slant up" and "high lines" datasets: keep only the
 # rows belonging to one dataset, then draw y against x as points
 datasaurus_dozen %>%
   filter(dataset == "slant_up") %>%
   ggplot(mapping = aes(x = x, y = y)) +
   geom_point() +
   theme_classic()
 datasaurus_dozen %>%
   filter(dataset == "high_lines") %>%
   ggplot(mapping = aes(x = x, y = y)) +
   geom_point() +
   theme_classic()

 # Scatter plot of the legend, the myth, the datasaurus (the "dino" dataset)
 datasaurus_dozen %>%
   filter(dataset == "dino") %>%
   ggplot(mapping = aes(x = x, y = y)) +
   geom_point() +
   theme_classic()

 # Scatter plots of the full collection: one facet per dataset, points colored
 # by dataset, with the legend hidden
 ggplot(
   data = datasaurus_dozen,
   mapping = aes(x = x, y = y, color = dataset)
 ) +
   geom_point() +
   facet_wrap(~ dataset) +
   theme_classic() +
   theme(legend.position = "none")

 # This is what makes the datasaurus dozen so special: despite *looking* so
 # distinctive to human eyes, they all have essentially the same summary
 # statistics (mean, standard deviation, and correlation coefficient). A
 # reminder that we need to be discerning when we look at summary statistics,
 # whether in a research article or in training a machine learning algorithm.
 # Looking at the raw data, or using algorithms that look at the raw data, can
 # be the difference between useful inference and misleading nonsense.
 #
 # The summary columns are named explicitly so the printed table has readable
 # headers instead of backticked expressions such as `mean(x)`.
 datasaurus_dozen %>%
   group_by(dataset) %>%
   summarize(
     mean_x = mean(x),
     mean_y = mean(y),
     sd_x = sd(x),
     sd_y = sd(y),
     cor_xy = cor(x, y)
   )
	# Install datasauRus from GitHub only when it is not already available, so
	# re-running this script does not reinstall the package (or need network
	# access) every time.
	# If you don't have devtools, install that first. If you have Windows and
	# R version 4.0 (or greater), see:
	# https://cran.r-project.org/bin/windows/Rtools/
	if (!requireNamespace("datasauRus", quietly = TRUE)) {
	  devtools::install_github("lockedata/datasauRus")
	}

	# Load the datasaurus dozen and our favorite suite of data processing tools
	library(datasauRus)
	library(tidyverse)

	# Quick look at the datasaurus_dozen data frame: a summary of its columns
	# first, then the distinct values found in its `dataset` column
	datasaurus_dozen %>% summary()
	unique(datasaurus_dozen[["dataset"]])

	# Scatter plots of the "slant up" and "high lines" datasets: keep only the
	# rows belonging to one dataset, then draw y against x as points
	datasaurus_dozen %>%
	  filter(dataset == "slant_up") %>%
	  ggplot(mapping = aes(x = x, y = y)) +
	  geom_point() +
	  theme_classic()
	datasaurus_dozen %>%
	  filter(dataset == "high_lines") %>%
	  ggplot(mapping = aes(x = x, y = y)) +
	  geom_point() +
	  theme_classic()

	# Scatter plot of the legend, the myth, the datasaurus (the "dino" dataset)
	datasaurus_dozen %>%
	  filter(dataset == "dino") %>%
	  ggplot(mapping = aes(x = x, y = y)) +
	  geom_point() +
	  theme_classic()

	# Scatter plots of the full collection: one facet per dataset, points colored
	# by dataset, with the legend hidden
	ggplot(
	  data = datasaurus_dozen,
	  mapping = aes(x = x, y = y, color = dataset)
	) +
	  geom_point() +
	  facet_wrap(~ dataset) +
	  theme_classic() +
	  theme(legend.position = "none")

	# This is what makes the datasaurus dozen so special: despite looking so
	# distinctive to human eyes, they all have essentially the same summary
	# statistics (mean, standard deviation, and correlation coefficient). A
	# reminder that we need to be discerning when we look at summary statistics,
	# whether in a research article or in training a machine learning algorithm.
	# Looking at the raw data, or using algorithms that look at the raw data, can
	# be the difference between useful inference and misleading nonsense.
	#
	# The summary columns are named explicitly so the printed table has readable
	# headers instead of backticked expressions such as `mean(x)`.
	datasaurus_dozen %>%
	  group_by(dataset) %>%
	  summarize(
	    mean_x = mean(x),
	    mean_y = mean(y),
	    sd_x = sd(x),
	    sd_y = sd(y),
	    cor_xy = cor(x, y)
	  )