jthomasmock · February 18, 2020 06:37 · jthomasmock · Feb 17, 2020
diff --git a/very_wide.R b/very_wide.R
 library(tidyverse)

 # The actual dataset has ~3000 columns and 100ish rows per column
 # Each column refers to a country and a state within that country
 # There is also an "average" for the country column
 # What is the best approach to grabbing ONLY the average column

 bad_df <- tibble::tribble(
  ~category,    ~usa_1,  ~usa_2,            ~usa_3, ~canada_1,  ~canada_2,         ~canada_13,
         NA, "Alabama", "Texas", "Country Average", "Ontario", "Manitoba", "Country Average",
      "Cow",      "24",    "16",              "23",      "20",       "24",              "15",
      "Pig",      "19",    "26",              "16",      "19",       "15",              "20",
  "Chicken",      "21",     NA,               "17",      "26",       "25",              "17",
  "This data has sources from USDA and others", NA, NA, NA, NA, NA, NA
  )

 # Simply doing pivot_longer() loses the data relationship for states
 bad_df %>% 
  pivot_longer(usa_1:canada_13, names_to = "country", values_to = "water_usage")
 #> # A tibble: 30 x 3
 #>    category country   water_usage    
 #>    <chr>    <chr>     <chr>          
 #>  1 <NA>     usa_1     Alabama        
 #>  2 <NA>     usa_2     Texas          
 #>  3 <NA>     usa_3     Country Average
 #>  4 <NA>     canada_1  Ontario        
 #>  5 <NA>     canada_2  Manitoba       
 #>  6 <NA>     canada_13 Country Average
 #>  7 Cow      usa_1     24             
 #>  8 Cow      usa_2     16             
 #>  9 Cow      usa_3     23             
 #> 10 Cow      canada_1  20             
 #> # … with 20 more rows

 # My solution
 # Pull the first row as a vector, which we will combine with the col name
 state_vector <- as.character(as.vector(bad_df[1,]))


 bad_df %>% 
  # combine column name with 1st row
  set_names(nm = glue::glue("{names(bad_df)}_{state_vector}")) %>% 
  # Grab only the grouping variable and any col names that contain "Average"
  select(category = category_NA, contains("Average")) %>% 
  filter(!is.na(category), str_length(category) <=20) %>% 
  pivot_longer(`usa_3_Country Average`:`canada_13_Country Average`, names_to = "country", values_to = "water_usage") %>% 
  mutate(country = str_remove(country, "_Country Average"),
         country = str_remove(country, "_"),
         country = str_remove(country, "[:digit:]+"),
         water_usage = as.double(water_usage))
 #> # A tibble: 6 x 3
 #>   category country water_usage
 #>   <chr>    <chr>   <dbl>      
 #> 1 Cow      usa     23         
 #> 2 Cow      canada  15         
 #> 3 Pig      usa     16         
 #> 4 Pig      canada  20         
 #> 5 Chicken  usa     17         
 #> 6 Chicken  canada  17
 ```

 <sup>Created on 2020-02-17 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>
	library(tidyverse)

	# The actual dataset has ~3000 columns and 100ish rows per column
	# Each column refers to a country and a state within that country
	# There is also an "average" for the country column
	# What is the best approach to grabbing ONLY the average column

	bad_df <- tibble::tribble(
	~category, ~usa_1, ~usa_2, ~usa_3, ~canada_1, ~canada_2, ~canada_13,
	NA, "Alabama", "Texas", "Country Average", "Ontario", "Manitoba", "Country Average",
	"Cow", "24", "16", "23", "20", "24", "15",
	"Pig", "19", "26", "16", "19", "15", "20",
	"Chicken", "21", NA, "17", "26", "25", "17",
	"This data has sources from USDA and others", NA, NA, NA, NA, NA, NA
	)

	# Simply doing pivot_longer() loses the data relationship for states
	bad_df %>%
	pivot_longer(usa_1:canada_13, names_to = "country", values_to = "water_usage")
	#> # A tibble: 30 x 3
	#> category country water_usage
	#> <chr> <chr> <chr>
	#> 1 <NA> usa_1 Alabama
	#> 2 <NA> usa_2 Texas
	#> 3 <NA> usa_3 Country Average
	#> 4 <NA> canada_1 Ontario
	#> 5 <NA> canada_2 Manitoba
	#> 6 <NA> canada_13 Country Average
	#> 7 Cow usa_1 24
	#> 8 Cow usa_2 16
	#> 9 Cow usa_3 23
	#> 10 Cow canada_1 20
	#> # … with 20 more rows

	# My solution
	# Pull the first row as a vector, which we will combine with the col name
	state_vector <- as.character(as.vector(bad_df[1,]))


	bad_df %>%
	# combine column name with 1st row
	set_names(nm = glue::glue("{names(bad_df)}_{state_vector}")) %>%
	# Grab only the grouping variable and any col names that contain "Average"
	select(category = category_NA, contains("Average")) %>%
	filter(!is.na(category), str_length(category) <=20) %>%
	pivot_longer(`usa_3_Country Average`:`canada_13_Country Average`, names_to = "country", values_to = "water_usage") %>%
	mutate(country = str_remove(country, "_Country Average"),
	country = str_remove(country, "_"),
	country = str_remove(country, "[:digit:]+"),
	water_usage = as.double(water_usage))
	#> # A tibble: 6 x 3
	#> category country water_usage
	#> <chr> <chr> <dbl>
	#> 1 Cow usa 23
	#> 2 Cow canada 15
	#> 3 Pig usa 16
	#> 4 Pig canada 20
	#> 5 Chicken usa 17
	#> 6 Chicken canada 17
	```

	<sup>Created on 2020-02-17 by the [reprex package](https://reprex.tidyverse.org) (v0.3.0)</sup>