Created
October 17, 2019 01:45
-
-
Save Tadge-Analytics/445261cfe0fc9a21165525b3ecf3172f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
setwd(dirname(rstudioapi::getActiveDocumentContext()$path)) | |
countries <- c("AE", "AF", "AN", "AO", "AR", "AS", "AT", "AU", "BB", "BD", | |
"BE", "BF", "BH", "BM", "BN", "BR", "BY", "CA", "CH", "CL", "CN", | |
"CO", "CR", "CZ", "DE", "DK", "DZ", "EC", "EE", "EG", "ES", "ET", | |
"FI", "FO", "FR", "GA", "GB", "GE", "GH", "GI", "GR", "GT", "HK", | |
"HN", "HR", "HU", "ID", "IE", "IL", "IN", "IR", "IS", "IT", "JM", | |
"JO", "JP", "KE", "KH", "KR", "KW", "KZ", "LA", "LB", "LK", "LT", | |
"LU", "LV", "MA", "MN", "MO", "MT", "MU", "MX", "MY", "NG", "NI", | |
"NL", "NO", "NZ", "OM", "PA", "PE", "PH", "PK", "PL", "PR", "PS", | |
"PT", "PY", "QA", "RE", "RO", "RS", "RU", "SA", "SE", "SG", "SI", | |
"SK", "SN", "SV", "TH", "TN", "TR", "TT", "TW", "UA", "UG", "US", | |
"UY", "VE", "VG", "VI", "VN", "WS", "ZA", "ZM", "ZW") | |
import <- read_csv("new raw data as text.txt", col_types = "c", col_names = F) %>% | |
filter(X1 != "Ab.", | |
X1 != ".") %>% | |
mutate(index = row_number()) | |
# split into parts by spaces | |
processed <- import %>% | |
mutate(value = str_split(X1, pattern = " ")) %>% | |
unnest(value) %>% # unpivot | |
# set column name to rank of item | |
# find the dates, by a dash | |
mutate(dates = if_else(str_detect(value, "-") | value == "None", value, NULL)) %>% | |
separate(dates, "-", into = "start_number", extra = "drop", remove = F) %>% | |
mutate(start_number = as.numeric(start_number)) %>% | |
mutate(dates = case_when(value == "None" ~ value, | |
!is.na(start_number) ~ dates)) %>% | |
select(-start_number) %>% | |
# if the value is a complete number... it's the person_id | |
mutate(person_id = if_else(!is.na(as.numeric(value)), value, NULL)) %>% | |
# if the value is one of the following two letter country abbreviations, we're good | |
mutate(country = if_else(value %in% countries, value, NULL)) %>% | |
# if the word is "Tableau" it's the start of the accreditation title (we'll come back to this) | |
mutate(is_tableau = case_when(value == "Tableau" ~ "certification", | |
!is.na(dates)|!is.na(person_id)|!is.na(country) ~ "data")) %>% | |
fill(is_tableau) %>% | |
mutate(certification = if_else(is_tableau == "certification", value, NULL)) %>% | |
select(-is_tableau) %>% | |
mutate(person_name = if_else(is.na(dates) & is.na(person_id) & is.na(country) & is.na(certification), value, NULL)) | |
# take away dates, person names and accreditation names... | |
# these have multiple parts and need to be concatenated together | |
# dates | |
dates <- processed %>% | |
select(index, dates) %>% | |
filter(!is.na(dates)) %>% | |
group_by(index) %>% | |
mutate(row_number = row_number()) %>% | |
spread(row_number, dates) %>% | |
rename(achieved_date = 2, | |
expiry_date = 3) | |
# person names | |
person_names <- processed %>% | |
select(index, person_name) %>% | |
filter(!is.na(person_name)) %>% | |
group_by(index) %>% | |
mutate(row_number = row_number()) %>% | |
spread(row_number, person_name) | |
person_names <- person_names %>% | |
select(index) %>% | |
bind_cols( | |
person_names %>% | |
ungroup() %>% | |
select(-index) %>% | |
apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>% | |
as.data.frame() %>% | |
rename(Full_name = 1)) | |
# accreditations | |
certificates <- processed %>% | |
select(index, certification) %>% | |
filter(!is.na(certification)) %>% | |
group_by(index) %>% | |
mutate(row_number = row_number()) %>% | |
spread(row_number, certification) | |
certificates <- certificates %>% | |
select(index) %>% | |
bind_cols( | |
certificates %>% | |
ungroup() %>% | |
select(-index) %>% | |
apply(1, function(x) paste(x[!is.na(x)], collapse = " ")) %>% | |
as.data.frame() %>% | |
rename(certificate_name = 1)) | |
person_id <- processed %>% | |
select(index, person_id) %>% | |
filter(!is.na(person_id)) | |
country <- processed %>% | |
select(index, country) %>% | |
filter(!is.na(country)) | |
# let's piece all the data together | |
completed <- import %>% | |
left_join(person_id , by = "index") %>% | |
left_join(person_names, by = "index") %>% | |
left_join(country , by = "index") %>% | |
left_join(certificates, by = "index") %>% | |
left_join(dates, by = "index") %>% | |
fill(person_id, Full_name, country) %>% | |
filter(!is.na(certificate_name)) %>% | |
select(-X1, -index) | |
completed %>% | |
write_csv("Certifications in Excel.csv", na = "") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment