Skip to content

Instantly share code, notes, and snippets.

@Tadge-Analytics
Created October 17, 2019 01:47
Show Gist options
  • Save Tadge-Analytics/89cfb323b926052d2b484334de77bd8e to your computer and use it in GitHub Desktop.
Save Tadge-Analytics/89cfb323b926052d2b484334de77bd8e to your computer and use it in GitHub Desktop.
library(tidyverse)
library(gmailr)
# authorise gmailr
search_term <- "from:([email protected]) new jobs for tableau in Melbourne"
messageIDs <- messages(search = search_term, num_results = 5)
my_messages <- tibble(messageIDs) %>%
mutate(downloaded_data = map(messageIDs, ~.x$messages %>%
modify_depth(1, "id") %>%
as.vector() %>%
map(message))) %>%
unnest(downloaded_data)
write_rds(my_messages, "rds files/downloaded emails.rds")
##################################
my_messages <- readRDS("rds files/downloaded emails.rds")
key_info <- my_messages %>%
mutate(id = map_chr(downloaded_data, ~gmailr::id(.x)),
date = map_chr(downloaded_data, ~gmailr::date(.x)),
date = lubridate::dmy_hms(date) %>% as.Date(),
subject = map_chr(downloaded_data, ~gmailr::subject(.x)))
# from the downloaded data we also, most importantly, want to parse out the body of the email
# to see if we can possibly organise that into some kind of useful Excel table.
with_body_content <- key_info %>%
mutate(body_content = map(downloaded_data, ~.x$payload$parts[[1]]$parts[[1]]$body$data %>%
RCurl::base64Decode(txt = .) %>%
str_split("\r\n|\t") %>%
unlist() %>%
tibble %>%
rename(text = 1) %>%
filter(text != ""))) %>%
select(-downloaded_data)
header_cells <- c("SEEK Job Mail",
"Hi Julian",
"tableau in Melbourne",
"posted yesterday match your Saved Search:",
"new jobs. Update your SEEK Profile",
"Manage your Saved Searches and subscription preferences",
"https://www.seek.com.au/my-activity/saved-search",
"SEEK Profile Link",
"Update your SEEK Profile")
tidy_data_frame <- with_body_content %>%
mutate(body_content = map(body_content, ~filter(.x, !str_detect(text, paste(header_cells, collapse = "|"))))) %>%
unnest(body_content)
final_tidy <- tidy_data_frame %>%
separate_rows(text, sep = "Suburb: ") %>%
mutate(cell_content = case_when(str_detect(text, "Location:") ~ "Location",
str_detect(text, "Advertiser:") ~ "Advertiser",
str_detect(text, "Salary:") ~ "Salary",
str_detect(text, "View this job at:") ~ "Link",
str_detect(text, "Jobs you may have missed") ~ "Missed Jobs Start",
TRUE ~ "Other")) %>%
mutate(cell_content = case_when(lead(cell_content) == "Location" ~ "Position",
lead(cell_content) == "Advertiser" ~ "Location",
lead(cell_content, 2) == "Advertiser" ~ "Position",
TRUE ~ cell_content)) %>%
filter(cell_content != "Other") %>%
mutate(previous_email_jobs = if_else(str_detect(cell_content, "Missed Jobs Start"), 1L, NA_integer_)) %>%
group_by(id) %>%
fill(previous_email_jobs) %>%
filter(cell_content != "Missed Jobs Start") %>%
mutate(text = if_else(cell_content != "Position",
str_replace(text, paste0(cell_content, ": "), ""),
text)) %>%
mutate(position_row = if_else(cell_content == "Position", row_number(), NA_integer_),
rank_of_position = if_else(!is.na(position_row), rank(position_row), NA_real_)) %>%
fill(rank_of_position) %>%
select(-position_row) %>%
spread(cell_content, text)
final_tidy %>%
writexl::write_xlsx("output/data.xlsx")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment