Created
March 7, 2018 06:03
-
-
Save jonocarroll/7f35eefb35f77aaefce527f0b16b034f to your computer and use it in GitHub Desktop.
Functions keep_levels and discard_levels to filter with validation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #' Keep only certain groups/levels of a column | |
| #' | |
| #' @param .d a data.frame or tibble | |
| #' @param .g column containing groups/levels to be kept or discarded | |
| #' @param .l groups/levels to be kept or discarded as character vector | |
| #' | |
| #' @return a data.frame or tibble with the same or fewer rows after filtering | |
| #' @export | |
| #' @importFrom rlang enquo | |
| #' | |
| #' @seealso discard\_levels | |
| #' | |
| #' @examples | |
| #' keep\_levels(iris, Species, "setosa") | |
| keep_levels <- function(.d, .g, .l) { | |
| g <- validate_level_inputs(.d, rlang::enquo(.g), .l) | |
| .d[.d[[g]] %in% .l, ] | |
| } | |
| #' Discard certain groups/levels of a column | |
| #' | |
| #' @param .d a data.frame or tibble | |
| #' @param .g column containing groups/levels to be kept or discarded | |
| #' @param .l groups/levels to be kept or discarded as character vector | |
| #' | |
| #' @return a data.frame or tibble with the same or fewer rows after filtering | |
| #' @export | |
| #' @importFrom rlang enquo | |
| #' | |
| #' @seealso keep\_levels | |
| #' | |
| #' @examples | |
| #' discard\_levels(iris, Species, "setosa") | |
| discard_levels <- function(.d, .g, .l) { | |
| g <- validate_level_inputs(.d, rlang::enquo(.g), .l) | |
| .d[! .d[[g]] %in% .l, ] | |
| } | |
| #' Validate levels to be kept/discarded | |
| #' | |
| #' Performs the following validations: | |
| #' - input .d is data.frame-alike | |
| #' - input .g is a column of .d | |
| #' - .l is/are present in .d$.g | |
| #' | |
| #' Failure of any of these validations results in an error. | |
| #' | |
| #' @inheritParams keep\_levels | |
| #' @importFrom rlang quo\_name | |
| #' | |
| #' @return column to be filtered as a character string (name) | |
| validate_level_inputs <- function(.d, .g, .l) { | |
| stopifnot(inherits(.d, "data.frame")) | |
| g <- rlang::quo_name(.g) | |
| if(! g %in% names(.d)) { | |
| stop(paste0("\"", g, "\" column not present in input data"), | |
| call. = FALSE) | |
| } | |
| levels_present <- .l %in% unique(.d[[g]]) | |
| if (!all(levels_present)) { | |
| stop(paste0("Level(s) not present in ", g, ": \"", | |
| toString(.l[which(!levels_present)]), | |
| "\""), | |
| call. = FALSE) | |
| } | |
| g | |
| } | |
| ## TESTING EXAMPLE DATA | |
| ## create a smaller version of iris | |
| iris_test <- iris[c(1:3, 51:53, 101:103), ] | |
| iris_test$Species[1] <- NA | |
| iris_test | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> 1 5.1 3.5 1.4 0.2 <NA> | |
| #> 2 4.9 3.0 1.4 0.2 setosa | |
| #> 3 4.7 3.2 1.3 0.2 setosa | |
| #> 51 7.0 3.2 4.7 1.4 versicolor | |
| #> 52 6.4 3.2 4.5 1.5 versicolor | |
| #> 53 6.9 3.1 4.9 1.5 versicolor | |
| #> 101 6.3 3.3 6.0 2.5 virginica | |
| #> 102 5.8 2.7 5.1 1.9 virginica | |
| #> 103 7.1 3.0 5.9 2.1 virginica | |
| ## TESTING EXAMPLES - KEEP | |
| ## empty input (not present) | |
| keep_levels(iris_test, Species, "") | |
| #> Error: Level(s) not present in Species: "" | |
| ## NA input but present in data | |
| keep_levels(iris_test, Species, NA) | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> 1 5.1 3.5 1.4 0.2 <NA> | |
| ## correct usage (keeps only setosa) | |
| keep_levels(iris_test, Species, "setosa") | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> 2 4.9 3.0 1.4 0.2 setosa | |
| #> 3 4.7 3.2 1.3 0.2 setosa | |
| ## misidentified single level (cannot filter) | |
| keep_levels(iris_test, Species, c("setosa", "samosa")) | |
| #> Error: Level(s) not present in Species: "samosa" | |
| ## misidentified multiple levels (cannot filter) | |
| keep_levels(iris_test, Species, c("setosa", "samosa", "sangria")) | |
| #> Error: Level(s) not present in Species: "samosa, sangria" | |
| ## misidentified column (cannot filter) | |
| keep_levels(iris_test, Spatial, c("setosa", "samosa")) | |
| #> Error: "Spatial" column not present in input data | |
| ## multiple levels kept | |
| keep_levels(iris_test, Species, c("setosa", "virginica")) | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> 2 4.9 3.0 1.4 0.2 setosa | |
| #> 3 4.7 3.2 1.3 0.2 setosa | |
| #> 101 6.3 3.3 6.0 2.5 virginica | |
| #> 102 5.8 2.7 5.1 1.9 virginica | |
| #> 103 7.1 3.0 5.9 2.1 virginica | |
| ## column name as character | |
| keep_levels(iris_test, "Species", "setosa") | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> 2 4.9 3.0 1.4 0.2 setosa | |
| #> 3 4.7 3.2 1.3 0.2 setosa | |
| ## input data not data.frame-like | |
| keep_levels(letters, Species, "setosa") | |
| #> Error: inherits(.d, "data.frame") is not TRUE | |
| ## input data actually a tibble | |
| keep_levels(tibble::as.tibble(iris_test), Species, "setosa") | |
| #> # A tibble: 2 x 5 | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> <dbl> <dbl> <dbl> <dbl> <fct> | |
| #> 1 4.90 3.00 1.40 0.200 setosa | |
| #> 2 4.70 3.20 1.30 0.200 setosa | |
| ## TESTING EXAMPLES - DISCARD | |
| discard_levels(iris_test, Species, "") | |
| #> Error: Level(s) not present in Species: "" | |
| discard_levels(iris_test, Species, NA) | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> 2 4.9 3.0 1.4 0.2 setosa | |
| #> 3 4.7 3.2 1.3 0.2 setosa | |
| #> 51 7.0 3.2 4.7 1.4 versicolor | |
| #> 52 6.4 3.2 4.5 1.5 versicolor | |
| #> 53 6.9 3.1 4.9 1.5 versicolor | |
| #> 101 6.3 3.3 6.0 2.5 virginica | |
| #> 102 5.8 2.7 5.1 1.9 virginica | |
| #> 103 7.1 3.0 5.9 2.1 virginica | |
| discard_levels(iris_test, Species, "setosa") | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> 1 5.1 3.5 1.4 0.2 <NA> | |
| #> 51 7.0 3.2 4.7 1.4 versicolor | |
| #> 52 6.4 3.2 4.5 1.5 versicolor | |
| #> 53 6.9 3.1 4.9 1.5 versicolor | |
| #> 101 6.3 3.3 6.0 2.5 virginica | |
| #> 102 5.8 2.7 5.1 1.9 virginica | |
| #> 103 7.1 3.0 5.9 2.1 virginica | |
| discard_levels(iris_test, Species, c("setosa", "samosa")) | |
| #> Error: Level(s) not present in Species: "samosa" | |
| discard_levels(iris_test, Species, c("setosa", "samosa", "sangria")) | |
| #> Error: Level(s) not present in Species: "samosa, sangria" | |
| discard_levels(iris_test, Spatial, c("setosa", "samosa")) | |
| #> Error: "Spatial" column not present in input data | |
| discard_levels(iris_test, Species, c("setosa", "virginica")) | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> 1 5.1 3.5 1.4 0.2 <NA> | |
| #> 51 7.0 3.2 4.7 1.4 versicolor | |
| #> 52 6.4 3.2 4.5 1.5 versicolor | |
| #> 53 6.9 3.1 4.9 1.5 versicolor | |
| discard_levels(iris_test, "Species", "setosa") | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> 1 5.1 3.5 1.4 0.2 <NA> | |
| #> 51 7.0 3.2 4.7 1.4 versicolor | |
| #> 52 6.4 3.2 4.5 1.5 versicolor | |
| #> 53 6.9 3.1 4.9 1.5 versicolor | |
| #> 101 6.3 3.3 6.0 2.5 virginica | |
| #> 102 5.8 2.7 5.1 1.9 virginica | |
| #> 103 7.1 3.0 5.9 2.1 virginica | |
| discard_levels(letters, Species, "setosa") | |
| #> Error: inherits(.d, "data.frame") is not TRUE | |
| discard_levels(tibble::as.tibble(iris_test), Species, "setosa") | |
| #> # A tibble: 7 x 5 | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> <dbl> <dbl> <dbl> <dbl> <fct> | |
| #> 1 5.10 3.50 1.40 0.200 <NA> | |
| #> 2 7.00 3.20 4.70 1.40 versicolor | |
| #> 3 6.40 3.20 4.50 1.50 versicolor | |
| #> 4 6.90 3.10 4.90 1.50 versicolor | |
| #> 5 6.30 3.30 6.00 2.50 virginica | |
| #> 6 5.80 2.70 5.10 1.90 virginica | |
| #> 7 7.10 3.00 5.90 2.10 virginica | |
| discard_levels(iris_test, Species, c("setosa", "virginica", "versicolor")) | |
| #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species | |
| #> 1 5.1 3.5 1.4 0.2 <NA> | |
| ## INITIAL MOTIVATION EXAMPLE: | |
| set.seed(1) | |
| ## with characters | |
| testdf <- data.frame( | |
| a = 1:10, | |
| b = sample(c("W", "X_Y", "Z"), 10, replace = TRUE), | |
| stringsAsFactors = FALSE | |
| ) | |
| ## with factors | |
| testdfF <- data.frame( | |
| a = 1:10, | |
| b = sample(c("W", "X_Y", "Z"), 10, replace = TRUE) | |
| ) | |
| testdf | |
| #> a b | |
| #> 1 1 W | |
| #> 2 2 X_Y | |
| #> 3 3 X_Y | |
| #> 4 4 Z | |
| #> 5 5 W | |
| #> 6 6 Z | |
| #> 7 7 Z | |
| #> 8 8 X_Y | |
| #> 9 9 X_Y | |
| #> 10 10 W | |
| library(dplyr) | |
| #> | |
| #> Attaching package: 'dplyr' | |
| #> The following objects are masked from 'package:stats': | |
| #> | |
| #> filter, lag | |
| #> The following objects are masked from 'package:base': | |
| #> | |
| #> intersect, setdiff, setequal, union | |
| ## only captures "Z", silently ignores the typo | |
| testdf %>% filter(b %in% c("X Y", "Z")) | |
| #> a b | |
| #> 1 4 Z | |
| #> 2 6 Z | |
| #> 3 7 Z | |
| ## identifies that the level is not present | |
| testdf %>% keep_levels(b, c("X Y", "Z")) | |
| #> Error: Level(s) not present in b: "X Y" | |
| ## correct filtering still works | |
| testdf %>% keep_levels(b, c("X_Y", "Z")) | |
| #> a b | |
| #> 2 2 X_Y | |
| #> 3 3 X_Y | |
| #> 4 4 Z | |
| #> 6 6 Z | |
| #> 7 7 Z | |
| #> 8 8 X_Y | |
| #> 9 9 X_Y | |
| ## same results with factors (missing level) | |
| testdfF %>% keep_levels(b, c("X Y", "Z")) | |
| #> Error: Level(s) not present in b: "X Y" | |
| ## same results with factors (correct levels) | |
| testdfF %>% keep_levels(b, c("X_Y", "Z")) | |
| #> a b | |
| #> 3 3 Z | |
| #> 4 4 X_Y | |
| #> 5 5 Z | |
| #> 6 6 X_Y | |
| #> 7 7 Z | |
| #> 8 8 Z | |
| #> 9 9 X_Y | |
| #> 10 10 Z | |
| #' Created on 2018-03-07 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0). |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment