Last active
February 18, 2020 06:37
-
-
Save jthomasmock/77630744719b9fd671cb9900deee2185 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
# The actual dataset has ~3000 columns and 100ish rows per column | |
# Each column refers to a country and a state within that country | |
# There is also an "average" for the country column | |
# What is the best approach to grabbing ONLY the average column | |
bad_df <- tibble::tribble( | |
~category, ~usa_1, ~usa_2, ~usa_3, ~canada_1, ~canada_2, ~canada_13, | |
NA, "Alabama", "Texas", "Country Average", "Ontario", "Manitoba", "Country Average", | |
"Cow", "24", "16", "23", "20", "24", "15", | |
"Pig", "19", "26", "16", "19", "15", "20", | |
"Chicken", "21", NA, "17", "26", "25", "17", | |
"This data has sources from USDA and others", NA, NA, NA, NA, NA, NA | |
) | |
# Simply doing pivot_longer() loses the data relationship for states | |
bad_df %>% | |
pivot_longer(usa_1:canada_13, names_to = "country", values_to = "water_usage") | |
#> # A tibble: 30 x 3 | |
#> category country water_usage | |
#> <chr> <chr> <chr> | |
#> 1 <NA> usa_1 Alabama | |
#> 2 <NA> usa_2 Texas | |
#> 3 <NA> usa_3 Country Average | |
#> 4 <NA> canada_1 Ontario | |
#> 5 <NA> canada_2 Manitoba | |
#> 6 <NA> canada_13 Country Average | |
#> 7 Cow usa_1 24 | |
#> 8 Cow usa_2 16 | |
#> 9 Cow usa_3 23 | |
#> 10 Cow canada_1 20 | |
#> # … with 20 more rows | |
# My solution | |
# Pull the first row as a vector, which we will combine with the col name | |
state_vector <- as.character(as.vector(bad_df[1,])) | |
bad_df %>% | |
# combine column name with 1st row | |
set_names(nm = glue::glue("{names(bad_df)}_{state_vector}")) %>% | |
# Grab only the grouping variable and any col names that contain "Average" | |
select(category = category_NA, contains("Average")) %>% | |
filter(!is.na(category), str_length(category) <=20) %>% | |
pivot_longer(`usa_3_Country Average`:`canada_13_Country Average`, names_to = "country", values_to = "water_usage") %>% | |
mutate(country = str_remove(country, "_Country Average"), | |
country = str_remove(country, "_"), | |
country = str_remove(country, "[:digit:]+"), | |
water_usage = as.double(water_usage)) | |
#> # A tibble: 6 x 3 | |
#> category country water_usage | |
#> <chr> <chr> <dbl> | |
#> 1 Cow usa 23 | |
#> 2 Cow canada 15 | |
#> 3 Pig usa 16 | |
#> 4 Pig canada 20 | |
#> 5 Chicken usa 17 | |
#> 6 Chicken canada 17 | |
``` | |
<sup>Created on 2020-02-17 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Full Dataset for Sharla, CEO of Sharla.party.