DavisVaughan · October 14, 2021 18:39 · DavisVaughan · Oct 14, 2021
diff --git a/cases-of-case-when.R b/cases-of-case-when.R
 df <- tibble(
  g = c(1, 1, 1, 1, 2, 2, 2, 2),
  x = 1:8,
  y = 8:1
 )

 # Group to show that this is applied per groups nicely
 df <- group_by(df, g)

 x <- df$x
 x
 #> [1] 1 2 3 4 5 6 7 8

 df
 #> # A tibble: 8 × 3
 #> # Groups:   g [2]
 #>       g     x     y
 #>   <dbl> <int> <int>
 #> 1     1     1     8
 #> 2     1     2     7
 #> 3     1     3     6
 #> 4     1     4     5
 #> 5     2     5     4
 #> 6     2     6     3
 #> 7     2     7     2
 #> 8     2     8     1


 # - 1 new col
 # - 1 or N cases
 # - Default = user supplied

 case_when(
  x > 6, 6,
  x > 3, 3,
  default = 1
 )
 #> [1] 1 1 1 3 3 3 6 6

 # Can use vectors as replacements which get sliced to the size of sum(condition)
 df %>%
  mutate(
    z = case_when(
      x > 6, mean(y),
      x > 3, y,
      default = 1
    )
  )
 #> # A tibble: 8 × 4
 #> # Groups:   g [2]
 #>       g     x     y     z
 #>   <dbl> <int> <int> <dbl>
 #> 1     1     1     8   1  
 #> 2     1     2     7   1  
 #> 3     1     3     6   1  
 #> 4     1     4     5   5  
 #> 5     2     5     4   4  
 #> 6     2     6     3   3  
 #> 7     2     7     2   2.5
 #> 8     2     8     1   2.5

 # - N new cols
 # - 1 or N cases
 # - Default = user supplied

 case_when(
  x > 6, tibble(x = 1, y = 2),
  x > 3, tibble(x = 3, y = 6),
  default = tibble(x = NA, y = NA)
 )
 #> # A tibble: 8 × 2
 #>       x     y
 #>   <dbl> <dbl>
 #> 1    NA    NA
 #> 2    NA    NA
 #> 3    NA    NA
 #> 4     3     6
 #> 5     3     6
 #> 6     3     6
 #> 7     1     2
 #> 8     1     2

 # Data frame auto expansion for the win!
 # (CRAN dplyr can't do this)
 df %>%
  mutate(
    case_when(
      x > 6, tibble(a = 1, b = mean(y)),
      x > 3, tibble(a = 3, b = y),
      default = tibble(a = NA, b = NA)
    )
  )
 #> # A tibble: 8 × 5
 #> # Groups:   g [2]
 #>       g     x     y     a     b
 #>   <dbl> <int> <int> <dbl> <dbl>
 #> 1     1     1     8    NA  NA  
 #> 2     1     2     7    NA  NA  
 #> 3     1     3     6    NA  NA  
 #> 4     1     4     5     3   5  
 #> 5     2     5     4     3   4  
 #> 6     2     6     3     3   3  
 #> 7     2     7     2     1   2.5
 #> 8     2     8     1     1   2.5


 # - 1 existing col
 # - 1 or N cases
 # - Default = original col

 replace_when(
  x,
  x > 6, 6,
  x > 3, 3
 )
 #> [1] 1 2 3 3 3 3 6 6

 df %>%
  mutate(
    x = replace_when(
      x,
      x > 6, max(y),
      x > 3, y
    )
  )
 #> # A tibble: 8 × 3
 #> # Groups:   g [2]
 #>       g     x     y
 #>   <dbl> <int> <int>
 #> 1     1     1     8
 #> 2     1     2     7
 #> 3     1     3     6
 #> 4     1     5     5
 #> 5     2     4     4
 #> 6     2     3     3
 #> 7     2     4     2
 #> 8     2     4     1

 # Special consideration for replace()
 replace_when(x, x > 6, NA)
 #> [1]  1  2  3  4  5  6 NA NA
 replace(x, x > 6, NA)
 #> [1]  1  2  3  4  5  6 NA NA

 # Which fails with this common pattern
 y <- df$y
 replace_when(x, x > 6, y)
 #> [1] 1 2 3 4 5 6 2 1
 replace(x, x > 6, y)
 #> Warning in x[list] <- values: number of items to replace is not a multiple of replacement length
 #> [1] 1 2 3 4 5 6 8 7


 # - N existing cols
 # - 1 or N cases
 # - Default = original col

 replace_when(
  tibble(x = x, y = y),
  x > 6, tibble(x = NA, y = NA),
  x > 3, tibble(x = max(x), y = max(y))
 )
 #> # A tibble: 8 × 2
 #>       x     y
 #>   <int> <int>
 #> 1     1     8
 #> 2     2     7
 #> 3     3     6
 #> 4     8     8
 #> 5     8     8
 #> 6     8     8
 #> 7    NA    NA
 #> 8    NA    NA

 # While possible, this is pretty clunky and also is pretty rare
 df %>%
  mutate(
    replace_when(
      tibble(x = x, y = y),
      x > 6, tibble(x = NA, y = NA),
      x > 3, tibble(x = max(x), y = max(y))
    )
  )
 #> # A tibble: 8 × 3
 #> # Groups:   g [2]
 #>       g     x     y
 #>   <dbl> <int> <int>
 #> 1     1     1     8
 #> 2     1     2     7
 #> 3     1     3     6
 #> 4     1     4     8
 #> 5     2     8     4
 #> 6     2     8     4
 #> 7     2    NA    NA
 #> 8     2    NA    NA

 # More common would be to update multiple columns based on 1 condition
 # (Can't use if_else() because we want type stability of `x`)
 df %>%
  mutate(
    replace_when(
      tibble(x = x, y = y),
      x > 6, tibble(x = NA, y = NA)
    )
  )
 #> # A tibble: 8 × 3
 #> # Groups:   g [2]
 #>       g     x     y
 #>   <dbl> <int> <int>
 #> 1     1     1     8
 #> 2     1     2     7
 #> 3     1     3     6
 #> 4     1     4     5
 #> 5     2     5     4
 #> 6     2     6     3
 #> 7     2    NA    NA
 #> 8     2    NA    NA

 # But this case seems so common that we should provide a native dplyr helper
 # (this seems like the only way to avoid specifying the columns twice)
 revise(df, x > 6, x = NA, y = NA)
 #> # A tibble: 8 × 3
 #> # Groups:   g [2]
 #>       g     x     y
 #>   <dbl> <int> <int>
 #> 1     1     1     8
 #> 2     1     2     7
 #> 3     1     3     6
 #> 4     1     4     5
 #> 5     2     5     4
 #> 6     2     6     3
 #> 7     2    NA    NA
 #> 8     2    NA    NA


 # Note that revise() computes `...` on the filtered data, while replace_when()
 # uses the entire group's data.
 # I'm not sure if revise() is correct or not (it is what data table does but
 # feels kind of wrong for these computed column cases).
 revise(df, x > 6, y = max(y))
 #> # A tibble: 8 × 3
 #> # Groups:   g [2]
 #>       g     x     y
 #>   <dbl> <int> <int>
 #> 1     1     1     8
 #> 2     1     2     7
 #> 3     1     3     6
 #> 4     1     4     5
 #> 5     2     5     4
 #> 6     2     6     3
 #> 7     2     7     2
 #> 8     2     8     2

 mutate(df, y = replace_when(y, x > 6, max(y)))
 #> # A tibble: 8 × 3
 #> # Groups:   g [2]
 #>       g     x     y
 #>   <dbl> <int> <int>
 #> 1     1     1     8
 #> 2     1     2     7
 #> 3     1     3     6
 #> 4     1     4     5
 #> 5     2     5     4
 #> 6     2     6     3
 #> 7     2     7     4
 #> 8     2     8     4
	df <- tibble(
	g = c(1, 1, 1, 1, 2, 2, 2, 2),
	x = 1:8,
	y = 8:1
	)

	# Group to show that this is applied per groups nicely
	df <- group_by(df, g)

	x <- df$x
	x
	#> [1] 1 2 3 4 5 6 7 8

	df
	#> # A tibble: 8 × 3
	#> # Groups: g [2]
	#> g x y
	#> <dbl> <int> <int>
	#> 1 1 1 8
	#> 2 1 2 7
	#> 3 1 3 6
	#> 4 1 4 5
	#> 5 2 5 4
	#> 6 2 6 3
	#> 7 2 7 2
	#> 8 2 8 1


	# - 1 new col
	# - 1 or N cases
	# - Default = user supplied

	case_when(
	x > 6, 6,
	x > 3, 3,
	default = 1
	)
	#> [1] 1 1 1 3 3 3 6 6

	# Can use vectors as replacements which get sliced to the size of sum(condition)
	df %>%
	mutate(
	z = case_when(
	x > 6, mean(y),
	x > 3, y,
	default = 1
	)
	)
	#> # A tibble: 8 × 4
	#> # Groups: g [2]
	#> g x y z
	#> <dbl> <int> <int> <dbl>
	#> 1 1 1 8 1
	#> 2 1 2 7 1
	#> 3 1 3 6 1
	#> 4 1 4 5 5
	#> 5 2 5 4 4
	#> 6 2 6 3 3
	#> 7 2 7 2 2.5
	#> 8 2 8 1 2.5

	# - N new cols
	# - 1 or N cases
	# - Default = user supplied

	case_when(
	x > 6, tibble(x = 1, y = 2),
	x > 3, tibble(x = 3, y = 6),
	default = tibble(x = NA, y = NA)
	)
	#> # A tibble: 8 × 2
	#> x y
	#> <dbl> <dbl>
	#> 1 NA NA
	#> 2 NA NA
	#> 3 NA NA
	#> 4 3 6
	#> 5 3 6
	#> 6 3 6
	#> 7 1 2
	#> 8 1 2

	# Data frame auto expansion for the win!
	# (CRAN dplyr can't do this)
	df %>%
	mutate(
	case_when(
	x > 6, tibble(a = 1, b = mean(y)),
	x > 3, tibble(a = 3, b = y),
	default = tibble(a = NA, b = NA)
	)
	)
	#> # A tibble: 8 × 5
	#> # Groups: g [2]
	#> g x y a b
	#> <dbl> <int> <int> <dbl> <dbl>
	#> 1 1 1 8 NA NA
	#> 2 1 2 7 NA NA
	#> 3 1 3 6 NA NA
	#> 4 1 4 5 3 5
	#> 5 2 5 4 3 4
	#> 6 2 6 3 3 3
	#> 7 2 7 2 1 2.5
	#> 8 2 8 1 1 2.5


	# - 1 existing col
	# - 1 or N cases
	# - Default = original col

	replace_when(
	x,
	x > 6, 6,
	x > 3, 3
	)
	#> [1] 1 2 3 3 3 3 6 6

	df %>%
	mutate(
	x = replace_when(
	x,
	x > 6, max(y),
	x > 3, y
	)
	)
	#> # A tibble: 8 × 3
	#> # Groups: g [2]
	#> g x y
	#> <dbl> <int> <int>
	#> 1 1 1 8
	#> 2 1 2 7
	#> 3 1 3 6
	#> 4 1 5 5
	#> 5 2 4 4
	#> 6 2 3 3
	#> 7 2 4 2
	#> 8 2 4 1

	# Special consideration for replace()
	replace_when(x, x > 6, NA)
	#> [1] 1 2 3 4 5 6 NA NA
	replace(x, x > 6, NA)
	#> [1] 1 2 3 4 5 6 NA NA

	# Which fails with this common pattern
	y <- df$y
	replace_when(x, x > 6, y)
	#> [1] 1 2 3 4 5 6 2 1
	replace(x, x > 6, y)
	#> Warning in x[list] <- values: number of items to replace is not a multiple of replacement length
	#> [1] 1 2 3 4 5 6 8 7


	# - N existing cols
	# - 1 or N cases
	# - Default = original col

	replace_when(
	tibble(x = x, y = y),
	x > 6, tibble(x = NA, y = NA),
	x > 3, tibble(x = max(x), y = max(y))
	)
	#> # A tibble: 8 × 2
	#> x y
	#> <int> <int>
	#> 1 1 8
	#> 2 2 7
	#> 3 3 6
	#> 4 8 8
	#> 5 8 8
	#> 6 8 8
	#> 7 NA NA
	#> 8 NA NA

	# While possible, this is pretty clunky and also is pretty rare
	df %>%
	mutate(
	replace_when(
	tibble(x = x, y = y),
	x > 6, tibble(x = NA, y = NA),
	x > 3, tibble(x = max(x), y = max(y))
	)
	)
	#> # A tibble: 8 × 3
	#> # Groups: g [2]
	#> g x y
	#> <dbl> <int> <int>
	#> 1 1 1 8
	#> 2 1 2 7
	#> 3 1 3 6
	#> 4 1 4 8
	#> 5 2 8 4
	#> 6 2 8 4
	#> 7 2 NA NA
	#> 8 2 NA NA

	# More common would be to update multiple columns based on 1 condition
	# (Can't use if_else() because we want type stability of `x`)
	df %>%
	mutate(
	replace_when(
	tibble(x = x, y = y),
	x > 6, tibble(x = NA, y = NA)
	)
	)
	#> # A tibble: 8 × 3
	#> # Groups: g [2]
	#> g x y
	#> <dbl> <int> <int>
	#> 1 1 1 8
	#> 2 1 2 7
	#> 3 1 3 6
	#> 4 1 4 5
	#> 5 2 5 4
	#> 6 2 6 3
	#> 7 2 NA NA
	#> 8 2 NA NA

	# But this case seems so common that we should provide a native dplyr helper
	# (this seems like the only way to avoid specifying the columns twice)
	revise(df, x > 6, x = NA, y = NA)
	#> # A tibble: 8 × 3
	#> # Groups: g [2]
	#> g x y
	#> <dbl> <int> <int>
	#> 1 1 1 8
	#> 2 1 2 7
	#> 3 1 3 6
	#> 4 1 4 5
	#> 5 2 5 4
	#> 6 2 6 3
	#> 7 2 NA NA
	#> 8 2 NA NA


	# Note that revise() computes `...` on the filtered data, while replace_when()
	# uses the entire group's data.
	# I'm not sure if revise() is correct or not (it is what data table does but
	# feels kind of wrong for these computed column cases).
	revise(df, x > 6, y = max(y))
	#> # A tibble: 8 × 3
	#> # Groups: g [2]
	#> g x y
	#> <dbl> <int> <int>
	#> 1 1 1 8
	#> 2 1 2 7
	#> 3 1 3 6
	#> 4 1 4 5
	#> 5 2 5 4
	#> 6 2 6 3
	#> 7 2 7 2
	#> 8 2 8 2

	mutate(df, y = replace_when(y, x > 6, max(y)))
	#> # A tibble: 8 × 3
	#> # Groups: g [2]
	#> g x y
	#> <dbl> <int> <int>
	#> 1 1 1 8
	#> 2 1 2 7
	#> 3 1 3 6
	#> 4 1 4 5
	#> 5 2 5 4
	#> 6 2 6 3
	#> 7 2 7 4
	#> 8 2 8 4
No results found