Skip to content

Instantly share code, notes, and snippets.

@jonocarroll
Created March 7, 2018 06:03
Show Gist options
  • Save jonocarroll/7f35eefb35f77aaefce527f0b16b034f to your computer and use it in GitHub Desktop.
Save jonocarroll/7f35eefb35f77aaefce527f0b16b034f to your computer and use it in GitHub Desktop.
Functions keep_levels and discard_levels to filter with validation
#' Keep only certain groups/levels of a column
#'
#' @param .d a data.frame or tibble
#' @param .g column containing groups/levels to be kept or discarded
#' @param .l groups/levels to be kept or discarded as character vector
#'
#' @return a data.frame or tibble with the same or fewer rows after filtering
#' @export
#' @importFrom rlang enquo
#'
#' @seealso discard\_levels
#'
#' @examples
#' keep\_levels(iris, Species, "setosa")
keep_levels <- function(.d, .g, .l) {
g <- validate_level_inputs(.d, rlang::enquo(.g), .l)
.d[.d[[g]] %in% .l, ]
}
#' Discard certain groups/levels of a column
#'
#' @param .d a data.frame or tibble
#' @param .g column containing groups/levels to be kept or discarded
#' @param .l groups/levels to be kept or discarded as character vector
#'
#' @return a data.frame or tibble with the same or fewer rows after filtering
#' @export
#' @importFrom rlang enquo
#'
#' @seealso keep\_levels
#'
#' @examples
#' discard\_levels(iris, Species, "setosa")
discard_levels <- function(.d, .g, .l) {
g <- validate_level_inputs(.d, rlang::enquo(.g), .l)
.d[! .d[[g]] %in% .l, ]
}
#' Validate levels to be kept/discarded
#'
#' Performs the following validations:
#' - input .d is data.frame-alike
#' - input .g is a column of .d
#' - .l is/are present in .d$.g
#'
#' Failure of any of these validations results in an error.
#'
#' @inheritParams keep\_levels
#' @importFrom rlang quo\_name
#'
#' @return column to be filtered as a character string (name)
validate_level_inputs <- function(.d, .g, .l) {
stopifnot(inherits(.d, "data.frame"))
g <- rlang::quo_name(.g)
if(! g %in% names(.d)) {
stop(paste0("\"", g, "\" column not present in input data"),
call. = FALSE)
}
levels_present <- .l %in% unique(.d[[g]])
if (!all(levels_present)) {
stop(paste0("Level(s) not present in ", g, ": \"",
toString(.l[which(!levels_present)]),
"\""),
call. = FALSE)
}
g
}
## TESTING EXAMPLE DATA
## create a smaller version of iris
iris_test <- iris[c(1:3, 51:53, 101:103), ]
iris_test$Species[1] <- NA
iris_test
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 <NA>
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 51 7.0 3.2 4.7 1.4 versicolor
#> 52 6.4 3.2 4.5 1.5 versicolor
#> 53 6.9 3.1 4.9 1.5 versicolor
#> 101 6.3 3.3 6.0 2.5 virginica
#> 102 5.8 2.7 5.1 1.9 virginica
#> 103 7.1 3.0 5.9 2.1 virginica
## TESTING EXAMPLES - KEEP
## empty input (not present)
keep_levels(iris_test, Species, "")
#> Error: Level(s) not present in Species: ""
## NA input but present in data
keep_levels(iris_test, Species, NA)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 <NA>
## correct usage (keeps only setosa)
keep_levels(iris_test, Species, "setosa")
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
## misidentified single level (cannot filter)
keep_levels(iris_test, Species, c("setosa", "samosa"))
#> Error: Level(s) not present in Species: "samosa"
## misidentified multiple levels (cannot filter)
keep_levels(iris_test, Species, c("setosa", "samosa", "sangria"))
#> Error: Level(s) not present in Species: "samosa, sangria"
## misidentified column (cannot filter)
keep_levels(iris_test, Spatial, c("setosa", "samosa"))
#> Error: "Spatial" column not present in input data
## multiple levels kept
keep_levels(iris_test, Species, c("setosa", "virginica"))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 101 6.3 3.3 6.0 2.5 virginica
#> 102 5.8 2.7 5.1 1.9 virginica
#> 103 7.1 3.0 5.9 2.1 virginica
## column name as character
keep_levels(iris_test, "Species", "setosa")
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
## input data not data.frame-like
keep_levels(letters, Species, "setosa")
#> Error: inherits(.d, "data.frame") is not TRUE
## input data actually a tibble
keep_levels(tibble::as.tibble(iris_test), Species, "setosa")
#> # A tibble: 2 x 5
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 4.90 3.00 1.40 0.200 setosa
#> 2 4.70 3.20 1.30 0.200 setosa
## TESTING EXAMPLES - DISCARD
discard_levels(iris_test, Species, "")
#> Error: Level(s) not present in Species: ""
discard_levels(iris_test, Species, NA)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 2 4.9 3.0 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 51 7.0 3.2 4.7 1.4 versicolor
#> 52 6.4 3.2 4.5 1.5 versicolor
#> 53 6.9 3.1 4.9 1.5 versicolor
#> 101 6.3 3.3 6.0 2.5 virginica
#> 102 5.8 2.7 5.1 1.9 virginica
#> 103 7.1 3.0 5.9 2.1 virginica
discard_levels(iris_test, Species, "setosa")
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 <NA>
#> 51 7.0 3.2 4.7 1.4 versicolor
#> 52 6.4 3.2 4.5 1.5 versicolor
#> 53 6.9 3.1 4.9 1.5 versicolor
#> 101 6.3 3.3 6.0 2.5 virginica
#> 102 5.8 2.7 5.1 1.9 virginica
#> 103 7.1 3.0 5.9 2.1 virginica
discard_levels(iris_test, Species, c("setosa", "samosa"))
#> Error: Level(s) not present in Species: "samosa"
discard_levels(iris_test, Species, c("setosa", "samosa", "sangria"))
#> Error: Level(s) not present in Species: "samosa, sangria"
discard_levels(iris_test, Spatial, c("setosa", "samosa"))
#> Error: "Spatial" column not present in input data
discard_levels(iris_test, Species, c("setosa", "virginica"))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 <NA>
#> 51 7.0 3.2 4.7 1.4 versicolor
#> 52 6.4 3.2 4.5 1.5 versicolor
#> 53 6.9 3.1 4.9 1.5 versicolor
discard_levels(iris_test, "Species", "setosa")
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 <NA>
#> 51 7.0 3.2 4.7 1.4 versicolor
#> 52 6.4 3.2 4.5 1.5 versicolor
#> 53 6.9 3.1 4.9 1.5 versicolor
#> 101 6.3 3.3 6.0 2.5 virginica
#> 102 5.8 2.7 5.1 1.9 virginica
#> 103 7.1 3.0 5.9 2.1 virginica
discard_levels(letters, Species, "setosa")
#> Error: inherits(.d, "data.frame") is not TRUE
discard_levels(tibble::as.tibble(iris_test), Species, "setosa")
#> # A tibble: 7 x 5
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 5.10 3.50 1.40 0.200 <NA>
#> 2 7.00 3.20 4.70 1.40 versicolor
#> 3 6.40 3.20 4.50 1.50 versicolor
#> 4 6.90 3.10 4.90 1.50 versicolor
#> 5 6.30 3.30 6.00 2.50 virginica
#> 6 5.80 2.70 5.10 1.90 virginica
#> 7 7.10 3.00 5.90 2.10 virginica
discard_levels(iris_test, Species, c("setosa", "virginica", "versicolor"))
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1 5.1 3.5 1.4 0.2 <NA>
## INITIAL MOTIVATION EXAMPLE:
set.seed(1)
## with characters
testdf <- data.frame(
a = 1:10,
b = sample(c("W", "X_Y", "Z"), 10, replace = TRUE),
stringsAsFactors = FALSE
)
## with factors
testdfF <- data.frame(
a = 1:10,
b = sample(c("W", "X_Y", "Z"), 10, replace = TRUE)
)
testdf
#> a b
#> 1 1 W
#> 2 2 X_Y
#> 3 3 X_Y
#> 4 4 Z
#> 5 5 W
#> 6 6 Z
#> 7 7 Z
#> 8 8 X_Y
#> 9 9 X_Y
#> 10 10 W
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
## only captures "Z", silently ignores the typo
testdf %>% filter(b %in% c("X Y", "Z"))
#> a b
#> 1 4 Z
#> 2 6 Z
#> 3 7 Z
## identifies that the level is not present
testdf %>% keep_levels(b, c("X Y", "Z"))
#> Error: Level(s) not present in b: "X Y"
## correct filtering still works
testdf %>% keep_levels(b, c("X_Y", "Z"))
#> a b
#> 2 2 X_Y
#> 3 3 X_Y
#> 4 4 Z
#> 6 6 Z
#> 7 7 Z
#> 8 8 X_Y
#> 9 9 X_Y
## same results with factors (missing level)
testdfF %>% keep_levels(b, c("X Y", "Z"))
#> Error: Level(s) not present in b: "X Y"
## same results with factors (correct levels)
testdfF %>% keep_levels(b, c("X_Y", "Z"))
#> a b
#> 3 3 Z
#> 4 4 X_Y
#> 5 5 Z
#> 6 6 X_Y
#> 7 7 Z
#> 8 8 Z
#> 9 9 X_Y
#> 10 10 Z
#' Created on 2018-03-07 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment