echasnovski · October 29, 2019 14:18
diff --git a/geom_parallel_slopes.md b/geom_parallel_slopes.md
diff --git a/geom_parallel_slopes.R b/geom_parallel_slopes.R
 library(tidyverse)


 # Geom creation -----------------------------------------------------------
 StatParallelSlopes <- ggproto(
  "StatParallelSlopes", Stat,

  required_aes = c("x", "y"),

  compute_panel = function(data, scales, se = TRUE, formula = y ~ x, n = 100) {
    if (nrow(data) == 0) {
      return(data[integer(0), ])
    }

    # Compute model data
    model_info <- compute_model_info(data, formula)
    formula <- model_info$formula
    data <- model_info$data

    # Fit model
    model <- lm(formula = formula, data = data)

    # Compute prediction from model based on sequence of x-values defined for
    # every group separately
    stats <- lapply(
      X = split(data, data$group), FUN = predict_per_group,
      model = model, se = se, n = n
    )

    # Combine predictions into one data frame
    dplyr::bind_rows(stats)
  }
 )

 geom_parallel_slopes <- function(mapping = NULL, data = NULL,
                                 position = "identity", ...,
                                 se = TRUE, formula = y ~ x, n = 100,
                                 na.rm = FALSE, show.legend = NA,
                                 inherit.aes = TRUE) {
  layer(
    geom = GeomSmooth, stat = StatParallelSlopes, data = data,
    mapping = mapping, position = position,
    params = list(na.rm = na.rm, se = se, formula = formula, n = n, ...),
    inherit.aes = inherit.aes, show.legend = show.legend
  )
 }

 compute_model_info <- function(data, formula) {
  if (has_unique_value(data$group)) {
    # Apparently, in {ggplot2} 'group' column is always present at the stage of
    # computing panel data. It is filled with `-1` if it wasn't
    # supplied as aesthetic and with `1` if it has only one unique value.
    warning(
      "`geom_parallel_slopes()` didn't recieve a grouping variable with more ",
      "than one unique value. Make sure you supply one. Basic model is fitted.",
      call. = FALSE
    )
  } else {
    data$group <- as.factor(data$group)
    # Actually make model to have parallel slopes
    formula <- as.formula(paste0(deparse(formula), " + group"))
  }

  list(formula = formula, data = data)
 }

 predict_per_group <- function(group_df, model, se, n) {
  # This code is a modified version of `ggplot2:::predictdf.default`

  # Create a data on which to perform prediction
  x_seq <- seq(min(group_df$x), max(group_df$x), length.out = n)
  group_seq <- rep(group_df$group[1], n)
  new_data <- data.frame(x = x_seq, group = group_seq)

  # Perform prediction
  pred <- stats::predict(
    model, newdata = new_data, se.fit = se,
    interval = if (se) "confidence" else "none"
  )

  # Convert prediction to "ggplot2 format"
  if (isTRUE(se)) {
    fit <- as.data.frame(pred$fit)
    names(fit) <- c("y", "ymin", "ymax")

    res <- data.frame(x = x_seq, fit, se = pred$se.fit)
  } else {
    res <- data.frame(x = x_seq, y = as.vector(pred))
  }

  # Restore columns that describe group with unique value (like color, etc.)
  # so that they can be used in output plot
  restore_unique_cols(res, group_df)
 }

 # Combination of source code from `ggplot2::StatSmooth$compute_panel` and
 # `ggplot2:::uniquecols`
 restore_unique_cols <- function(new, old) {
  is_unique <- sapply(old, has_unique_value)
  unique_df <- old[1, is_unique, drop = FALSE]
  rownames(unique_df) <- seq_len(nrow(unique_df))

  missing <- !(names(unique_df) %in% names(new))

  cbind(new, unique_df[rep(1, nrow(new)), missing, drop = FALSE])
 }

 has_unique_value <- function(x) {
  length(unique(x)) <= 1
 }


 # Demonstration -----------------------------------------------------------
 set.seed(202)
 dummy_df <- bind_rows(
  tibble(a = runif(10, 0, 0.1), gr = 1),
  tibble(a = runif(15, 0.1, 0.3), gr = 2),
  tibble(a = runif(20, 0.4, 1), gr = 3)
 ) %>%
  mutate(
    pan = factor(sample(1:3, n(), replace = TRUE)),
    gr = factor(gr),
    # Slope depends on panel, intercept on group
    b = as.integer(pan)*a + as.integer(gr) + runif(n(), max = 0.1),
    # Quadratic curves
    c = as.integer(pan)*a^2 + as.integer(gr) + runif(n(), max = 0.1),
  )

 # Basic usage
 viz <- ggplot(dummy_df, aes(a, b, colour = gr)) + geom_point()
 viz + geom_smooth(method = "lm") + labs(title = "geom_smooth()")
  # Roughly same width of error boundaries is a nice demonstration of the fact
  # that model was fitted to the whole data and not per group (in which error's
  # relative amplitude actually differ).
 viz + geom_parallel_slopes() + labs(title = "geom_parallel_slopes()")

  # Demonstration that models are different
 viz +
  geom_smooth(aes(group = gr), method = "lm", color = "blue", se = FALSE) +
  geom_parallel_slopes(aes(group = gr), color = "green", se = FALSE) +
  labs(
    title =
      "geom_smooth() (blue) and geom_parallel_slopes() (green) are different"
  )

 # Extra aesthetics
 viz +
  geom_parallel_slopes(mapping = aes(group = gr), color = "red", size = 3) +
  labs(title = "geom_parallel_slopes() with modified aesthetics")

 # Different formula
 viz_quad <- ggplot(dummy_df, aes(a, c, colour = gr)) + geom_point()
 viz_quad +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
  labs(title = "Quadratic geom_smooth()")
 viz_quad +
  geom_parallel_slopes(formula = y ~ poly(x, 2)) +
  labs(title = "Quadratic geom_parallel_slopes()")

  # Demonstration that linear and quadratic models are actually different
 ggplot(dummy_df, aes(a, c, colour = gr, group = gr)) +
  geom_point(show.legend = FALSE) +
  geom_parallel_slopes(color = "blue", se = FALSE) +
  geom_parallel_slopes(color = "green", se = FALSE, formula = y ~ poly(x, 2)) +
  labs(
    title =
      "Linear (blue) and quadratic (green) parallel slopes models are different"
  )

 # Model is fitted per facet panel
 viz_faceted <- ggplot(dummy_df, aes(a, b, colour = gr)) +
  geom_point() +
  facet_wrap(~ pan)
 viz_faceted + geom_smooth(method = "lm") + labs(title = "Faceted geom_smooth()")
 viz_faceted +
  geom_parallel_slopes() +
  labs(title = "Faceted geom_parallel_slopes()")

 # Edge cases
  # Works with not factor grouping variable (in which case it should be
  # explicitly set in `group` aesthetics, as in `geom_smooth()`)
 dummy_df %>%
  mutate(gr = as.integer(gr)) %>%
  ggplot(aes(a, b, colour = gr, group = gr)) +
    geom_point() +
    geom_smooth(method = "lm") +
    geom_parallel_slopes() +
    labs(title = "Works with non-factor grouping")

  # Works without grouping variable or with one level in it (just as
  # `geom_smooth()`), but with a relevant warning
 ggplot(dummy_df, aes(a, b)) +
  geom_point() +
  geom_smooth(method = "lm") +
  # It is plotted exactly over `geom_smooth()`
  geom_parallel_slopes() +
  labs(title = "Works with no grouping variable")
 ggplot(dummy_df %>% mutate(gr = 1), aes(a, b, group = gr)) +
  geom_point() +
  geom_smooth(method = "lm") +
  # It is plotted exactly over `geom_smooth()`
  geom_parallel_slopes() +
  labs(title = "Works with grouping variable with one value")
	library(tidyverse)


	# Geom creation -----------------------------------------------------------
	StatParallelSlopes <- ggproto(
	"StatParallelSlopes", Stat,

	required_aes = c("x", "y"),

	compute_panel = function(data, scales, se = TRUE, formula = y ~ x, n = 100) {
	if (nrow(data) == 0) {
	return(data[integer(0), ])
	}

	# Compute model data
	model_info <- compute_model_info(data, formula)
	formula <- model_info$formula
	data <- model_info$data

	# Fit model
	model <- lm(formula = formula, data = data)

	# Compute prediction from model based on sequence of x-values defined for
	# every group separately
	stats <- lapply(
	X = split(data, data$group), FUN = predict_per_group,
	model = model, se = se, n = n
	)

	# Combine predictions into one data frame
	dplyr::bind_rows(stats)
	}
	)

	geom_parallel_slopes <- function(mapping = NULL, data = NULL,
	position = "identity", ...,
	se = TRUE, formula = y ~ x, n = 100,
	na.rm = FALSE, show.legend = NA,
	inherit.aes = TRUE) {
	layer(
	geom = GeomSmooth, stat = StatParallelSlopes, data = data,
	mapping = mapping, position = position,
	params = list(na.rm = na.rm, se = se, formula = formula, n = n, ...),
	inherit.aes = inherit.aes, show.legend = show.legend
	)
	}

	compute_model_info <- function(data, formula) {
	if (has_unique_value(data$group)) {
	# Apparently, in {ggplot2} 'group' column is always present at the stage of
	# computing panel data. It is filled with `-1` if it wasn't
	# supplied as aesthetic and with `1` if it has only one unique value.
	warning(
	"`geom_parallel_slopes()` didn't recieve a grouping variable with more ",
	"than one unique value. Make sure you supply one. Basic model is fitted.",
	call. = FALSE
	)
	} else {
	data$group <- as.factor(data$group)
	# Actually make model to have parallel slopes
	formula <- as.formula(paste0(deparse(formula), " + group"))
	}

	list(formula = formula, data = data)
	}

	predict_per_group <- function(group_df, model, se, n) {
	# This code is a modified version of `ggplot2:::predictdf.default`

	# Create a data on which to perform prediction
	x_seq <- seq(min(group_df$x), max(group_df$x), length.out = n)
	group_seq <- rep(group_df$group[1], n)
	new_data <- data.frame(x = x_seq, group = group_seq)

	# Perform prediction
	pred <- stats::predict(
	model, newdata = new_data, se.fit = se,
	interval = if (se) "confidence" else "none"
	)

	# Convert prediction to "ggplot2 format"
	if (isTRUE(se)) {
	fit <- as.data.frame(pred$fit)
	names(fit) <- c("y", "ymin", "ymax")

	res <- data.frame(x = x_seq, fit, se = pred$se.fit)
	} else {
	res <- data.frame(x = x_seq, y = as.vector(pred))
	}

	# Restore columns that describe group with unique value (like color, etc.)
	# so that they can be used in output plot
	restore_unique_cols(res, group_df)
	}

	# Combination of source code from `ggplot2::StatSmooth$compute_panel` and
	# `ggplot2:::uniquecols`
	restore_unique_cols <- function(new, old) {
	is_unique <- sapply(old, has_unique_value)
	unique_df <- old[1, is_unique, drop = FALSE]
	rownames(unique_df) <- seq_len(nrow(unique_df))

	missing <- !(names(unique_df) %in% names(new))

	cbind(new, unique_df[rep(1, nrow(new)), missing, drop = FALSE])
	}

	has_unique_value <- function(x) {
	length(unique(x)) <= 1
	}


	# Demonstration -----------------------------------------------------------
	set.seed(202)
	dummy_df <- bind_rows(
	tibble(a = runif(10, 0, 0.1), gr = 1),
	tibble(a = runif(15, 0.1, 0.3), gr = 2),
	tibble(a = runif(20, 0.4, 1), gr = 3)
	) %>%
	mutate(
	pan = factor(sample(1:3, n(), replace = TRUE)),
	gr = factor(gr),
	# Slope depends on panel, intercept on group
	b = as.integer(pan)*a + as.integer(gr) + runif(n(), max = 0.1),
	# Quadratic curves
	c = as.integer(pan)*a^2 + as.integer(gr) + runif(n(), max = 0.1),
	)

	# Basic usage
	viz <- ggplot(dummy_df, aes(a, b, colour = gr)) + geom_point()
	viz + geom_smooth(method = "lm") + labs(title = "geom_smooth()")
	# Roughly same width of error boundaries is a nice demonstration of the fact
	# that model was fitted to the whole data and not per group (in which error's
	# relative amplitude actually differ).
	viz + geom_parallel_slopes() + labs(title = "geom_parallel_slopes()")

	# Demonstration that models are different
	viz +
	geom_smooth(aes(group = gr), method = "lm", color = "blue", se = FALSE) +
	geom_parallel_slopes(aes(group = gr), color = "green", se = FALSE) +
	labs(
	title =
	"geom_smooth() (blue) and geom_parallel_slopes() (green) are different"
	)

	# Extra aesthetics
	viz +
	geom_parallel_slopes(mapping = aes(group = gr), color = "red", size = 3) +
	labs(title = "geom_parallel_slopes() with modified aesthetics")

	# Different formula
	viz_quad <- ggplot(dummy_df, aes(a, c, colour = gr)) + geom_point()
	viz_quad +
	geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
	labs(title = "Quadratic geom_smooth()")
	viz_quad +
	geom_parallel_slopes(formula = y ~ poly(x, 2)) +
	labs(title = "Quadratic geom_parallel_slopes()")

	# Demonstration that linear and quadratic models are actually different
	ggplot(dummy_df, aes(a, c, colour = gr, group = gr)) +
	geom_point(show.legend = FALSE) +
	geom_parallel_slopes(color = "blue", se = FALSE) +
	geom_parallel_slopes(color = "green", se = FALSE, formula = y ~ poly(x, 2)) +
	labs(
	title =
	"Linear (blue) and quadratic (green) parallel slopes models are different"
	)

	# Model is fitted per facet panel
	viz_faceted <- ggplot(dummy_df, aes(a, b, colour = gr)) +
	geom_point() +
	facet_wrap(~ pan)
	viz_faceted + geom_smooth(method = "lm") + labs(title = "Faceted geom_smooth()")
	viz_faceted +
	geom_parallel_slopes() +
	labs(title = "Faceted geom_parallel_slopes()")

	# Edge cases
	# Works with not factor grouping variable (in which case it should be
	# explicitly set in `group` aesthetics, as in `geom_smooth()`)
	dummy_df %>%
	mutate(gr = as.integer(gr)) %>%
	ggplot(aes(a, b, colour = gr, group = gr)) +
	geom_point() +
	geom_smooth(method = "lm") +
	geom_parallel_slopes() +
	labs(title = "Works with non-factor grouping")

	# Works without grouping variable or with one level in it (just as
	# `geom_smooth()`), but with a relevant warning
	ggplot(dummy_df, aes(a, b)) +
	geom_point() +
	geom_smooth(method = "lm") +
	# It is plotted exactly over `geom_smooth()`
	geom_parallel_slopes() +
	labs(title = "Works with no grouping variable")
	ggplot(dummy_df %>% mutate(gr = 1), aes(a, b, group = gr)) +
	geom_point() +
	geom_smooth(method = "lm") +
	# It is plotted exactly over `geom_smooth()`
	geom_parallel_slopes() +
	labs(title = "Works with grouping variable with one value")