cimentadaj · March 13, 2020 13:20
diff --git a/nuts_extract.R b/nuts_extract.R
 library(eurostat)
 library(dplyr)

 ## Fetch the Eurostat table with id "demo_r_pjangroup". The code below
 ## relies on its `sex`, `age`, `geo`, `time`, `unit` and `values` columns.
 ## NOTE(review): presumably this downloads from the Eurostat API and so
 ## needs network access -- confirm.
 res <- get_eurostat("demo_r_pjangroup")

 ## Per-country regular expressions selecting the NUTS codes to keep.
 ## They are OR-ed together into `final_match` below, so every alternative
 ## carries its own anchors: a missing `$` on one alternative would let
 ## longer (more detailed) codes slip through.

 ## Example for NUTS1: UKB
 uk_match <- "^UK[A-Z]$"
 ## Example for NUTS2: ITC1 (one letter followed by one or two digits)
 ## NOTE(review): there is no end anchor here, so codes with a trailing
 ## extra character are accepted as well; kept as in the original -- confirm.
 ita_match <- "^IT[A-Z][0-9]{1,2}"
 ## Example for NUTS1: either DE3 or DEA
 ## I don't know why NUTS1 Germany mixes numbers and letters
 de_match <- "^DE[A-Z0-9]$"
 ## Example for NUTS1: FR1 and FRB, FRC, etc..
 fr_match <- "^FR[1A-Z]$"
 ## Example for NUTS2: ES11
 es_match <- "^ES[0-9]{2}$"
 ## Example for NUTS2: IE04 to IE06
 ## All three codes are anchored; previously only IE06 was, so IE04/IE05
 ## also matched longer codes such as IE041.
 ie_match <- "^IE0[4-6]$"
 ## Example for NUTS2: NL12
 nl_match <- "^NL[0-9]{2}$"
 ## Example for NUTS2: BE12
 be_match <- "^BE[0-9]{2}$"
 ## Example for NUTS2: AT11
 at_match <- "^AT[0-9]{2}$"
 ## Example for NUTS2: LU00
 lu_match <- "^LU[0-9]{2}$"

 ## Wikipedia NUTS library for each country
 ## https://en.wikipedia.org/wiki/NUTS_statistical_regions_of_France
 ## Just change the country name at the end for the desired one

 ## Single pattern that matches a code accepted by any country above.
 final_match <- paste0(
   c(
     uk_match,
     ita_match,
     de_match,
     fr_match,
     es_match,
     ie_match,
     nl_match,
     be_match,
     at_match,
     lu_match
   ),
   collapse = "|"
 )

 ## Build the regional table `tst` from the raw download `res`:
 ##   1. keep, for every sex/age/geo combination, only the latest `time`;
 ##   2. keep only the NUTS codes accepted by `final_match`;
 ##   3. replace codes by labels (the code itself stays in `geo_code`);
 ##   4. tag every row with its country and tidy up the region labels.
 tst <-
   res %>%
   ## To quickly check the codes for each country -- delete after done
   ## filter(grepl("FR", geo)) %>%
   ## distinct(geo) %>%
   ## print(n = Inf)
   ## Select the most recent time point
   group_by(sex, age, geo) %>%
   filter(time == max(time)) %>%
   ungroup() %>%
   ## Drop unused columns
   select(-unit, -time) %>%
   ## Drop the "T" sex rows and the "TOTAL"/"UNK" age rows (presumably the
   ## aggregate and unknown categories -- confirm) and keep selected regions
   filter(
     sex != "T",
     !age %in% c("TOTAL", "UNK"),
     grepl(final_match, geo)
   ) %>%
   ## `geo` is a code up to here; afterwards it is a label and the code is
   ## available as `geo_code`
   label_eurostat(code = "geo", fix_duplicated = TRUE) %>%
   mutate(
     country = case_when(
       grepl(uk_match, geo_code) ~ "UK",
       grepl(ita_match, geo_code) ~ "ITA",
       grepl(de_match, geo_code) ~ "DE",
       grepl(fr_match, geo_code) ~ "FR",
       grepl(es_match, geo_code) ~ "ES",
       grepl(ie_match, geo_code) ~ "IE",
       grepl(nl_match, geo_code) ~ "NL",
       grepl(be_match, geo_code) ~ "BE",
       grepl(at_match, geo_code) ~ "AT",
       grepl(lu_match, geo_code) ~ "LU",
       TRUE ~ NA_character_
     )
   ) %>%
   select(country, sex, age, geo_code, geo, values) %>%
   ## Because some labels (e.g. in Germany) carry the code in front, strip
   ## that prefix; also strip the "Prov. " prefix. The dot is escaped so that
   ## only a literal "Prov. " is removed.
   mutate(
     geo = gsub(
       paste0(
         "DE[0-9]{1} |DE[A-Z]{1} |FR[A-Z]{1,2} |FR1 |",
         "ES[0-9]{2,2} |Prov\\. |BE[0-9]{2,3} "
       ),
       "",
       geo
     )
   ) %>%
   ## Remove rows whose label is not an actual region
   filter(!grepl("Départements|Not regionalised|RUP FR - Régions", geo))

 ## Scratch helper -- delete after done.
 ## Shows every distinct code/name pair of one country in full, then writes
 ## the names alone, one per line, for easy copy-pasting.
 local({
   lu_regions <- distinct(filter(tst, country == "LU"), geo_code, geo)
   print(lu_regions, n = Inf)
   cat(as.character(pull(lu_regions, geo)), sep = "\n")
 })
	library(eurostat)
	library(dplyr)

	## Fetch the Eurostat table with id "demo_r_pjangroup". The code below
	## relies on its `sex`, `age`, `geo`, `time`, `unit` and `values` columns.
	## NOTE(review): presumably this downloads from the Eurostat API and so
	## needs network access -- confirm.
	res <- get_eurostat("demo_r_pjangroup")

	## Per-country regular expressions selecting the NUTS codes to keep.
	## They are OR-ed together into `final_match` below, so every alternative
	## carries its own anchors: a missing `$` on one alternative would let
	## longer (more detailed) codes slip through.
	## The alternation operator is a plain `|`; writing it as a backslash
	## followed by `|` inside an R string literal is an invalid escape and
	## does not parse.

	## Example for NUTS1: UKB
	uk_match <- "^UK[A-Z]$"
	## Example for NUTS2: ITC1 (one letter followed by one or two digits)
	## NOTE(review): there is no end anchor here, so codes with a trailing
	## extra character are accepted as well; kept as in the original -- confirm.
	ita_match <- "^IT[A-Z][0-9]{1,2}"
	## Example for NUTS1: either DE3 or DEA
	## I don't know why NUTS1 Germany mixes numbers and letters
	de_match <- "^DE[A-Z0-9]$"
	## Example for NUTS1: FR1 and FRB, FRC, etc..
	fr_match <- "^FR[1A-Z]$"
	## Example for NUTS2: ES11
	es_match <- "^ES[0-9]{2}$"
	## Example for NUTS2: IE04 to IE06
	## All three codes are anchored; previously only IE06 was, so IE04/IE05
	## also matched longer codes such as IE041.
	ie_match <- "^IE0[4-6]$"
	## Example for NUTS2: NL12
	nl_match <- "^NL[0-9]{2}$"
	## Example for NUTS2: BE12
	be_match <- "^BE[0-9]{2}$"
	## Example for NUTS2: AT11
	at_match <- "^AT[0-9]{2}$"
	## Example for NUTS2: LU00
	lu_match <- "^LU[0-9]{2}$"

	## Wikipedia NUTS library for each country
	## https://en.wikipedia.org/wiki/NUTS_statistical_regions_of_France
	## Just change the country name at the end for the desired one

	## Single pattern that matches a code accepted by any country above.
	final_match <- paste0(
	  c(
	    uk_match,
	    ita_match,
	    de_match,
	    fr_match,
	    es_match,
	    ie_match,
	    nl_match,
	    be_match,
	    at_match,
	    lu_match
	  ),
	  collapse = "|"
	)

	## Build the regional table `tst` from the raw download `res`:
	##   1. keep, for every sex/age/geo combination, only the latest `time`;
	##   2. keep only the NUTS codes accepted by `final_match`;
	##   3. replace codes by labels (the code itself stays in `geo_code`);
	##   4. tag every row with its country and tidy up the region labels.
	tst <-
	  res %>%
	  ## To quickly check the codes for each country -- delete after done
	  ## filter(grepl("FR", geo)) %>%
	  ## distinct(geo) %>%
	  ## print(n = Inf)
	  ## Select the most recent time point
	  group_by(sex, age, geo) %>%
	  filter(time == max(time)) %>%
	  ungroup() %>%
	  ## Drop unused columns
	  select(-unit, -time) %>%
	  ## Drop the "T" sex rows and the "TOTAL"/"UNK" age rows (presumably the
	  ## aggregate and unknown categories -- confirm) and keep selected regions
	  filter(
	    sex != "T",
	    !age %in% c("TOTAL", "UNK"),
	    grepl(final_match, geo)
	  ) %>%
	  ## `geo` is a code up to here; afterwards it is a label and the code is
	  ## available as `geo_code`
	  label_eurostat(code = "geo", fix_duplicated = TRUE) %>%
	  mutate(
	    country = case_when(
	      grepl(uk_match, geo_code) ~ "UK",
	      grepl(ita_match, geo_code) ~ "ITA",
	      grepl(de_match, geo_code) ~ "DE",
	      grepl(fr_match, geo_code) ~ "FR",
	      grepl(es_match, geo_code) ~ "ES",
	      grepl(ie_match, geo_code) ~ "IE",
	      grepl(nl_match, geo_code) ~ "NL",
	      grepl(be_match, geo_code) ~ "BE",
	      grepl(at_match, geo_code) ~ "AT",
	      grepl(lu_match, geo_code) ~ "LU",
	      TRUE ~ NA_character_
	    )
	  ) %>%
	  select(country, sex, age, geo_code, geo, values) %>%
	  ## Because some labels (e.g. in Germany) carry the code in front, strip
	  ## that prefix; also strip the "Prov. " prefix. Alternatives are joined
	  ## with a plain `|`, and the dot is escaped so that only a literal
	  ## "Prov. " is removed.
	  mutate(
	    geo = gsub(
	      paste0(
	        "DE[0-9]{1} |DE[A-Z]{1} |FR[A-Z]{1,2} |FR1 |",
	        "ES[0-9]{2,2} |Prov\\. |BE[0-9]{2,3} "
	      ),
	      "",
	      geo
	    )
	  ) %>%
	  ## Remove rows whose label is not an actual region
	  filter(!grepl("Départements|Not regionalised|RUP FR - Régions", geo))

	## Scratch helper -- delete after done.
	## Shows every distinct code/name pair of one country in full, then writes
	## the names alone, one per line, for easy copy-pasting.
	local({
	  lu_regions <- distinct(filter(tst, country == "LU"), geo_code, geo)
	  print(lu_regions, n = Inf)
	  cat(as.character(pull(lu_regions, geo)), sep = "\n")
	})