Tadge-Analytics · October 17, 2019 01:45
diff --git a/Tableau Certification Directory -pdf to csv b/Tableau Certification Directory -pdf to csv
 library(tidyverse)
 setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

 countries <- c("AE", "AF", "AN", "AO", "AR", "AS", "AT", "AU", "BB", "BD", 
               "BE", "BF", "BH", "BM", "BN", "BR", "BY", "CA", "CH", "CL", "CN", 
               "CO", "CR", "CZ", "DE", "DK", "DZ", "EC", "EE", "EG", "ES", "ET", 
               "FI", "FO", "FR", "GA", "GB", "GE", "GH", "GI", "GR", "GT", "HK", 
               "HN", "HR", "HU", "ID", "IE", "IL", "IN", "IR", "IS", "IT", "JM", 
               "JO", "JP", "KE", "KH", "KR", "KW", "KZ", "LA", "LB", "LK", "LT", 
               "LU", "LV", "MA", "MN", "MO", "MT", "MU", "MX", "MY", "NG", "NI", 
               "NL", "NO", "NZ", "OM", "PA", "PE", "PH", "PK", "PL", "PR", "PS", 
               "PT", "PY", "QA", "RE", "RO", "RS", "RU", "SA", "SE", "SG", "SI", 
               "SK", "SN", "SV", "TH", "TN", "TR", "TT", "TW", "UA", "UG", "US", 
               "UY", "VE", "VG", "VI", "VN", "WS", "ZA", "ZM", "ZW")


 import <- read_csv("new raw data as text.txt", col_types = "c", col_names = F) %>% 
  filter(X1 != "Ab.",
         X1 != ".") %>% 
  mutate(index = row_number())


 # split into parts by spaces
 processed <- import %>% 
  mutate(value = str_split(X1, pattern = " ")) %>% 
  unnest(value) %>% # unpivot
  
  # set column name to rank of item
  # find the dates, by a dash
  mutate(dates = if_else(str_detect(value, "-") | value == "None", value, NULL)) %>% 
  separate(dates, "-", into = "start_number", extra = "drop", remove = F) %>% 
  mutate(start_number = as.numeric(start_number)) %>% 
  mutate(dates = case_when(value == "None" ~ value,
                           !is.na(start_number) ~ dates)) %>% 
  select(-start_number) %>% 
  
  # if the value is a complete number... it's the person_id
  mutate(person_id = if_else(!is.na(as.numeric(value)), value, NULL)) %>% 
  
  # if the value is one of the following two letter country abbreviations, we're good
  mutate(country = if_else(value %in% countries, value, NULL)) %>% 
  
  # if the word is "Tableau" it's the start of the accreditation title (we'll come back to this)
  mutate(is_tableau = case_when(value == "Tableau" ~ "certification", 
                                !is.na(dates)|!is.na(person_id)|!is.na(country) ~ "data")) %>% 
  fill(is_tableau) %>% 
  mutate(certification = if_else(is_tableau == "certification", value, NULL)) %>% 
  select(-is_tableau) %>% 
  mutate(person_name = if_else(is.na(dates) & is.na(person_id) & is.na(country) & is.na(certification), value, NULL))


 # take away dates, person names and accreditation names... 
 # these have multiple parts and need to be concatenated together

 # dates
 dates <- processed %>% 
  select(index, dates) %>% 
  filter(!is.na(dates)) %>% 
  group_by(index) %>% 
  mutate(row_number = row_number()) %>% 
  spread(row_number, dates) %>% 
  rename(achieved_date = 2,
         expiry_date = 3)


 # person names
 person_names <- processed %>% 
  select(index, person_name) %>% 
  filter(!is.na(person_name)) %>% 
  group_by(index) %>% 
  mutate(row_number = row_number()) %>% 
  spread(row_number, person_name) 

 person_names <- person_names %>% 
  select(index) %>% 
  bind_cols(
    person_names %>% 
      ungroup() %>% 
      select(-index) %>% 
      apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>% 
      as.data.frame() %>% 
      rename(Full_name = 1))


 # accreditations
 certificates <- processed %>% 
  select(index, certification) %>% 
  filter(!is.na(certification)) %>% 
  group_by(index) %>% 
  mutate(row_number = row_number()) %>% 
  spread(row_number, certification) 


 certificates <- certificates %>% 
  select(index) %>% 
  bind_cols(
    certificates %>% 
      ungroup() %>% 
      select(-index) %>% 
      apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>% 
      as.data.frame() %>% 
      rename(certificate_name = 1))


 person_id <- processed %>% 
  select(index, person_id) %>% 
  filter(!is.na(person_id))


 country <- processed %>% 
  select(index, country) %>% 
  filter(!is.na(country))



 # let's piece all the data together
 completed <- import %>%
  left_join(person_id , by = "index") %>%
  left_join(person_names, by = "index") %>% 
  left_join(country , by = "index") %>%
  left_join(certificates, by = "index") %>% 
  left_join(dates, by = "index") %>% 
  fill(person_id, Full_name, country) %>% 
  filter(!is.na(certificate_name)) %>% 
  select(-X1, -index)


 completed %>% 
  write_csv("Certifications in Excel.csv", na = "")
	library(tidyverse)
	setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

	countries <- c("AE", "AF", "AN", "AO", "AR", "AS", "AT", "AU", "BB", "BD",
	"BE", "BF", "BH", "BM", "BN", "BR", "BY", "CA", "CH", "CL", "CN",
	"CO", "CR", "CZ", "DE", "DK", "DZ", "EC", "EE", "EG", "ES", "ET",
	"FI", "FO", "FR", "GA", "GB", "GE", "GH", "GI", "GR", "GT", "HK",
	"HN", "HR", "HU", "ID", "IE", "IL", "IN", "IR", "IS", "IT", "JM",
	"JO", "JP", "KE", "KH", "KR", "KW", "KZ", "LA", "LB", "LK", "LT",
	"LU", "LV", "MA", "MN", "MO", "MT", "MU", "MX", "MY", "NG", "NI",
	"NL", "NO", "NZ", "OM", "PA", "PE", "PH", "PK", "PL", "PR", "PS",
	"PT", "PY", "QA", "RE", "RO", "RS", "RU", "SA", "SE", "SG", "SI",
	"SK", "SN", "SV", "TH", "TN", "TR", "TT", "TW", "UA", "UG", "US",
	"UY", "VE", "VG", "VI", "VN", "WS", "ZA", "ZM", "ZW")


	import <- read_csv("new raw data as text.txt", col_types = "c", col_names = F) %>%
	filter(X1 != "Ab.",
	X1 != ".") %>%
	mutate(index = row_number())


	# split into parts by spaces
	processed <- import %>%
	mutate(value = str_split(X1, pattern = " ")) %>%
	unnest(value) %>% # unpivot

	# set column name to rank of item
	# find the dates, by a dash
	mutate(dates = if_else(str_detect(value, "-") \| value == "None", value, NULL)) %>%
	separate(dates, "-", into = "start_number", extra = "drop", remove = F) %>%
	mutate(start_number = as.numeric(start_number)) %>%
	mutate(dates = case_when(value == "None" ~ value,
	!is.na(start_number) ~ dates)) %>%
	select(-start_number) %>%

	# if the value is a complete number... it's the person_id
	mutate(person_id = if_else(!is.na(as.numeric(value)), value, NULL)) %>%

	# if the value is one of the following two letter country abbreviations, we're good
	mutate(country = if_else(value %in% countries, value, NULL)) %>%

	# if the word is "Tableau" it's the start of the accreditation title (we'll come back to this)
	mutate(is_tableau = case_when(value == "Tableau" ~ "certification",
	!is.na(dates)\|!is.na(person_id)\|!is.na(country) ~ "data")) %>%
	fill(is_tableau) %>%
	mutate(certification = if_else(is_tableau == "certification", value, NULL)) %>%
	select(-is_tableau) %>%
	mutate(person_name = if_else(is.na(dates) & is.na(person_id) & is.na(country) & is.na(certification), value, NULL))


	# take away dates, person names and accreditation names...
	# these have multiple parts and need to be concatenated together

	# dates
	dates <- processed %>%
	select(index, dates) %>%
	filter(!is.na(dates)) %>%
	group_by(index) %>%
	mutate(row_number = row_number()) %>%
	spread(row_number, dates) %>%
	rename(achieved_date = 2,
	expiry_date = 3)


	# person names
	person_names <- processed %>%
	select(index, person_name) %>%
	filter(!is.na(person_name)) %>%
	group_by(index) %>%
	mutate(row_number = row_number()) %>%
	spread(row_number, person_name)

	person_names <- person_names %>%
	select(index) %>%
	bind_cols(
	person_names %>%
	ungroup() %>%
	select(-index) %>%
	apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>%
	as.data.frame() %>%
	rename(Full_name = 1))


	# accreditations
	certificates <- processed %>%
	select(index, certification) %>%
	filter(!is.na(certification)) %>%
	group_by(index) %>%
	mutate(row_number = row_number()) %>%
	spread(row_number, certification)


	certificates <- certificates %>%
	select(index) %>%
	bind_cols(
	certificates %>%
	ungroup() %>%
	select(-index) %>%
	apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>%
	as.data.frame() %>%
	rename(certificate_name = 1))


	person_id <- processed %>%
	select(index, person_id) %>%
	filter(!is.na(person_id))


	country <- processed %>%
	select(index, country) %>%
	filter(!is.na(country))



	# let's piece all the data together
	completed <- import %>%
	left_join(person_id , by = "index") %>%
	left_join(person_names, by = "index") %>%
	left_join(country , by = "index") %>%
	left_join(certificates, by = "index") %>%
	left_join(dates, by = "index") %>%
	fill(person_id, Full_name, country) %>%
	filter(!is.na(certificate_name)) %>%
	select(-X1, -index)


	completed %>%
	write_csv("Certifications in Excel.csv", na = "")