Skip to content

Instantly share code, notes, and snippets.

@echasnovski
Last active October 29, 2019 14:18
Show Gist options
  • Save echasnovski/468a326108b25c3731369b75e6bcf2b9 to your computer and use it in GitHub Desktop.
Save echasnovski/468a326108b25c3731369b75e6bcf2b9 to your computer and use it in GitHub Desktop.
Draft of `geom_parallel_slopes()` for {moderndive} package
library(tidyverse)


# Geom creation -----------------------------------------------------------
StatParallelSlopes <- ggproto(
  "StatParallelSlopes", Stat,

  required_aes = c("x", "y"),

  compute_panel = function(data, scales, se = TRUE, formula = y ~ x, n = 100) {
    if (nrow(data) == 0) {
      return(data[integer(0), ])
    }

    # Compute model data
    model_info <- compute_model_info(data, formula)
    formula <- model_info$formula
    data <- model_info$data

    # Fit model
    model <- lm(formula = formula, data = data)

    # Compute prediction from model based on sequence of x-values defined for
    # every group separately
    stats <- lapply(
      X = split(data, data$group), FUN = predict_per_group,
      model = model, se = se, n = n
    )

    # Combine predictions into one data frame
    dplyr::bind_rows(stats)
  }
)

geom_parallel_slopes <- function(mapping = NULL, data = NULL,
                                 position = "identity", ...,
                                 se = TRUE, formula = y ~ x, n = 100,
                                 na.rm = FALSE, show.legend = NA,
                                 inherit.aes = TRUE) {
  layer(
    geom = GeomSmooth, stat = StatParallelSlopes, data = data,
    mapping = mapping, position = position,
    params = list(na.rm = na.rm, se = se, formula = formula, n = n, ...),
    inherit.aes = inherit.aes, show.legend = show.legend
  )
}

compute_model_info <- function(data, formula) {
  if (has_unique_value(data$group)) {
    # Apparently, in {ggplot2} 'group' column is always present at the stage of
    # computing panel data. It is filled with `-1` if it wasn't
    # supplied as aesthetic and with `1` if it has only one unique value.
    warning(
      "`geom_parallel_slopes()` didn't recieve a grouping variable with more ",
      "than one unique value. Make sure you supply one. Basic model is fitted.",
      call. = FALSE
    )
  } else {
    data$group <- as.factor(data$group)
    # Actually make model to have parallel slopes
    formula <- as.formula(paste0(deparse(formula), " + group"))
  }

  list(formula = formula, data = data)
}

predict_per_group <- function(group_df, model, se, n) {
  # This code is a modified version of `ggplot2:::predictdf.default`

  # Create a data on which to perform prediction
  x_seq <- seq(min(group_df$x), max(group_df$x), length.out = n)
  group_seq <- rep(group_df$group[1], n)
  new_data <- data.frame(x = x_seq, group = group_seq)

  # Perform prediction
  pred <- stats::predict(
    model, newdata = new_data, se.fit = se,
    interval = if (se) "confidence" else "none"
  )

  # Convert prediction to "ggplot2 format"
  if (isTRUE(se)) {
    fit <- as.data.frame(pred$fit)
    names(fit) <- c("y", "ymin", "ymax")

    res <- data.frame(x = x_seq, fit, se = pred$se.fit)
  } else {
    res <- data.frame(x = x_seq, y = as.vector(pred))
  }

  # Restore columns that describe group with unique value (like color, etc.)
  # so that they can be used in output plot
  restore_unique_cols(res, group_df)
}

# Combination of source code from `ggplot2::StatSmooth$compute_panel` and
# `ggplot2:::uniquecols`
restore_unique_cols <- function(new, old) {
  is_unique <- sapply(old, has_unique_value)
  unique_df <- old[1, is_unique, drop = FALSE]
  rownames(unique_df) <- seq_len(nrow(unique_df))

  missing <- !(names(unique_df) %in% names(new))

  cbind(new, unique_df[rep(1, nrow(new)), missing, drop = FALSE])
}

has_unique_value <- function(x) {
  length(unique(x)) <= 1
}


# Demonstration -----------------------------------------------------------
set.seed(202)
dummy_df <- bind_rows(
  tibble(a = runif(10, 0, 0.1), gr = 1),
  tibble(a = runif(15, 0.1, 0.3), gr = 2),
  tibble(a = runif(20, 0.4, 1), gr = 3)
) %>%
  mutate(
    pan = factor(sample(1:3, n(), replace = TRUE)),
    gr = factor(gr),
    # Slope depends on panel, intercept on group
    b = as.integer(pan)*a + as.integer(gr) + runif(n(), max = 0.1),
    # Quadratic curves
    c = as.integer(pan)*a^2 + as.integer(gr) + runif(n(), max = 0.1),
  )

# Basic usage
viz <- ggplot(dummy_df, aes(a, b, colour = gr)) + geom_point()
viz + geom_smooth(method = "lm") + labs(title = "geom_smooth()")

  # Roughly same width of error boundaries is a nice demonstration of the fact
  # that model was fitted to the whole data and not per group (in which error's
  # relative amplitude actually differ).
viz + geom_parallel_slopes() + labs(title = "geom_parallel_slopes()")

  # Demonstration that models are different
viz +
  geom_smooth(aes(group = gr), method = "lm", color = "blue", se = FALSE) +
  geom_parallel_slopes(aes(group = gr), color = "green", se = FALSE) +
  labs(
    title =
      "geom_smooth() (blue) and geom_parallel_slopes() (green) are different"
  )

# Extra aesthetics
viz +
  geom_parallel_slopes(mapping = aes(group = gr), color = "red", size = 3) +
  labs(title = "geom_parallel_slopes() with modified aesthetics")

# Different formula
viz_quad <- ggplot(dummy_df, aes(a, c, colour = gr)) + geom_point()
viz_quad +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
  labs(title = "Quadratic geom_smooth()")

viz_quad +
  geom_parallel_slopes(formula = y ~ poly(x, 2)) +
  labs(title = "Quadratic geom_parallel_slopes()")

  # Demonstration that linear and quadratic models are actually different
ggplot(dummy_df, aes(a, c, colour = gr, group = gr)) +
  geom_point(show.legend = FALSE) +
  geom_parallel_slopes(color = "blue", se = FALSE) +
  geom_parallel_slopes(color = "green", se = FALSE, formula = y ~ poly(x, 2)) +
  labs(
    title =
      "Linear (blue) and quadratic (green) parallel slopes models are different"
  )

# Model is fitted per facet panel
viz_faceted <- ggplot(dummy_df, aes(a, b, colour = gr)) +
  geom_point() +
  facet_wrap(~ pan)
viz_faceted + geom_smooth(method = "lm") + labs(title = "Faceted geom_smooth()")
#> Warning in qt((1 - level)/2, df): созданы NaN

viz_faceted +
  geom_parallel_slopes() +
  labs(title = "Faceted geom_parallel_slopes()")

# Edge cases
  # Works with not factor grouping variable (in which case it should be
  # explicitly set in `group` aesthetics, as in `geom_smooth()`)
dummy_df %>%
  mutate(gr = as.integer(gr)) %>%
  ggplot(aes(a, b, colour = gr, group = gr)) +
    geom_point() +
    geom_smooth(method = "lm") +
    geom_parallel_slopes() +
    labs(title = "Works with non-factor grouping")

  # Works without grouping variable or with one level in it (just as
  # `geom_smooth()`), but with a relevant warning
ggplot(dummy_df, aes(a, b)) +
  geom_point() +
  geom_smooth(method = "lm") +
  # It is plotted exactly over `geom_smooth()`
  geom_parallel_slopes() +
  labs(title = "Works with no grouping variable")
#> Warning: `geom_parallel_slopes()` didn't recieve a grouping variable
#> with more than one unique value. Make sure you supply one. Basic model is
#> fitted.

ggplot(dummy_df %>% mutate(gr = 1), aes(a, b, group = gr)) +
  geom_point() +
  geom_smooth(method = "lm") +
  # It is plotted exactly over `geom_smooth()`
  geom_parallel_slopes() +
  labs(title = "Works with grouping variable with one value")
#> Warning: `geom_parallel_slopes()` didn't recieve a grouping variable
#> with more than one unique value. Make sure you supply one. Basic model is
#> fitted.

Created on 2019-10-29 by the reprex package (v0.3.0)

library(tidyverse)
# Geom creation -----------------------------------------------------------
StatParallelSlopes <- ggproto(
"StatParallelSlopes", Stat,
required_aes = c("x", "y"),
compute_panel = function(data, scales, se = TRUE, formula = y ~ x, n = 100) {
if (nrow(data) == 0) {
return(data[integer(0), ])
}
# Compute model data
model_info <- compute_model_info(data, formula)
formula <- model_info$formula
data <- model_info$data
# Fit model
model <- lm(formula = formula, data = data)
# Compute prediction from model based on sequence of x-values defined for
# every group separately
stats <- lapply(
X = split(data, data$group), FUN = predict_per_group,
model = model, se = se, n = n
)
# Combine predictions into one data frame
dplyr::bind_rows(stats)
}
)
geom_parallel_slopes <- function(mapping = NULL, data = NULL,
position = "identity", ...,
se = TRUE, formula = y ~ x, n = 100,
na.rm = FALSE, show.legend = NA,
inherit.aes = TRUE) {
layer(
geom = GeomSmooth, stat = StatParallelSlopes, data = data,
mapping = mapping, position = position,
params = list(na.rm = na.rm, se = se, formula = formula, n = n, ...),
inherit.aes = inherit.aes, show.legend = show.legend
)
}
compute_model_info <- function(data, formula) {
if (has_unique_value(data$group)) {
# Apparently, in {ggplot2} 'group' column is always present at the stage of
# computing panel data. It is filled with `-1` if it wasn't
# supplied as aesthetic and with `1` if it has only one unique value.
warning(
"`geom_parallel_slopes()` didn't recieve a grouping variable with more ",
"than one unique value. Make sure you supply one. Basic model is fitted.",
call. = FALSE
)
} else {
data$group <- as.factor(data$group)
# Actually make model to have parallel slopes
formula <- as.formula(paste0(deparse(formula), " + group"))
}
list(formula = formula, data = data)
}
predict_per_group <- function(group_df, model, se, n) {
# This code is a modified version of `ggplot2:::predictdf.default`
# Create a data on which to perform prediction
x_seq <- seq(min(group_df$x), max(group_df$x), length.out = n)
group_seq <- rep(group_df$group[1], n)
new_data <- data.frame(x = x_seq, group = group_seq)
# Perform prediction
pred <- stats::predict(
model, newdata = new_data, se.fit = se,
interval = if (se) "confidence" else "none"
)
# Convert prediction to "ggplot2 format"
if (isTRUE(se)) {
fit <- as.data.frame(pred$fit)
names(fit) <- c("y", "ymin", "ymax")
res <- data.frame(x = x_seq, fit, se = pred$se.fit)
} else {
res <- data.frame(x = x_seq, y = as.vector(pred))
}
# Restore columns that describe group with unique value (like color, etc.)
# so that they can be used in output plot
restore_unique_cols(res, group_df)
}
# Combination of source code from `ggplot2::StatSmooth$compute_panel` and
# `ggplot2:::uniquecols`
restore_unique_cols <- function(new, old) {
is_unique <- sapply(old, has_unique_value)
unique_df <- old[1, is_unique, drop = FALSE]
rownames(unique_df) <- seq_len(nrow(unique_df))
missing <- !(names(unique_df) %in% names(new))
cbind(new, unique_df[rep(1, nrow(new)), missing, drop = FALSE])
}
has_unique_value <- function(x) {
length(unique(x)) <= 1
}
# Demonstration -----------------------------------------------------------
set.seed(202)
dummy_df <- bind_rows(
tibble(a = runif(10, 0, 0.1), gr = 1),
tibble(a = runif(15, 0.1, 0.3), gr = 2),
tibble(a = runif(20, 0.4, 1), gr = 3)
) %>%
mutate(
pan = factor(sample(1:3, n(), replace = TRUE)),
gr = factor(gr),
# Slope depends on panel, intercept on group
b = as.integer(pan)*a + as.integer(gr) + runif(n(), max = 0.1),
# Quadratic curves
c = as.integer(pan)*a^2 + as.integer(gr) + runif(n(), max = 0.1),
)
# Basic usage
viz <- ggplot(dummy_df, aes(a, b, colour = gr)) + geom_point()
viz + geom_smooth(method = "lm") + labs(title = "geom_smooth()")
# Roughly same width of error boundaries is a nice demonstration of the fact
# that model was fitted to the whole data and not per group (in which error's
# relative amplitude actually differ).
viz + geom_parallel_slopes() + labs(title = "geom_parallel_slopes()")
# Demonstration that models are different
viz +
geom_smooth(aes(group = gr), method = "lm", color = "blue", se = FALSE) +
geom_parallel_slopes(aes(group = gr), color = "green", se = FALSE) +
labs(
title =
"geom_smooth() (blue) and geom_parallel_slopes() (green) are different"
)
# Extra aesthetics
viz +
geom_parallel_slopes(mapping = aes(group = gr), color = "red", size = 3) +
labs(title = "geom_parallel_slopes() with modified aesthetics")
# Different formula
viz_quad <- ggplot(dummy_df, aes(a, c, colour = gr)) + geom_point()
viz_quad +
geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
labs(title = "Quadratic geom_smooth()")
viz_quad +
geom_parallel_slopes(formula = y ~ poly(x, 2)) +
labs(title = "Quadratic geom_parallel_slopes()")
# Demonstration that linear and quadratic models are actually different
ggplot(dummy_df, aes(a, c, colour = gr, group = gr)) +
geom_point(show.legend = FALSE) +
geom_parallel_slopes(color = "blue", se = FALSE) +
geom_parallel_slopes(color = "green", se = FALSE, formula = y ~ poly(x, 2)) +
labs(
title =
"Linear (blue) and quadratic (green) parallel slopes models are different"
)
# Model is fitted per facet panel
viz_faceted <- ggplot(dummy_df, aes(a, b, colour = gr)) +
geom_point() +
facet_wrap(~ pan)
viz_faceted + geom_smooth(method = "lm") + labs(title = "Faceted geom_smooth()")
viz_faceted +
geom_parallel_slopes() +
labs(title = "Faceted geom_parallel_slopes()")
# Edge cases
# Works with not factor grouping variable (in which case it should be
# explicitly set in `group` aesthetics, as in `geom_smooth()`)
dummy_df %>%
mutate(gr = as.integer(gr)) %>%
ggplot(aes(a, b, colour = gr, group = gr)) +
geom_point() +
geom_smooth(method = "lm") +
geom_parallel_slopes() +
labs(title = "Works with non-factor grouping")
# Works without grouping variable or with one level in it (just as
# `geom_smooth()`), but with a relevant warning
ggplot(dummy_df, aes(a, b)) +
geom_point() +
geom_smooth(method = "lm") +
# It is plotted exactly over `geom_smooth()`
geom_parallel_slopes() +
labs(title = "Works with no grouping variable")
ggplot(dummy_df %>% mutate(gr = 1), aes(a, b, group = gr)) +
geom_point() +
geom_smooth(method = "lm") +
# It is plotted exactly over `geom_smooth()`
geom_parallel_slopes() +
labs(title = "Works with grouping variable with one value")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment