Skip to content

Instantly share code, notes, and snippets.

@jsta
Created March 11, 2020 20:05
Show Gist options
  • Save jsta/65e4f014cafcc8aee03bdbc744c88d4e to your computer and use it in GitHub Desktop.
Save jsta/65e4f014cafcc8aee03bdbc744c88d4e to your computer and use it in GitHub Desktop.
Create wordcloud from mbox email archive [unfinished]
library(mboxr)
library(tm.plugin.webmining)
library(tm)
library(wordcloud)
# Build a word cloud from the message bodies of an mbox e-mail archive.

# Read the archive and flatten every message body into a single string.
fpath <- path.expand("~/Downloads/jobs.mbox")
data <- read_mbox(fpath)
content <- paste(unlist(data$content), collapse = " ")

# Strip HTML markup, then round-trip the text through a CSV file so the
# extracted text is also kept on disk. The text is read back from the
# column named "x".
test <- extractHTMLStrip(content)
write.csv(test, "~/Downloads/jobs.csv")
test3 <- read.csv("~/Downloads/jobs.csv", stringsAsFactors = FALSE)$x

# Job-ad boilerplate and markup residue that should not appear in the cloud.
# Entries are matched as whole words by removeWords(). Exact duplicates from
# the earlier list were dropped; "information" was moved here from a
# first-match-only str_remove() call; "announcement" was added next to its
# misspelled form, which is kept in case the typo occurs in the mail text.
job_stopwords <- c(
  "research", "please", "skills", "applicants",
  "successful", "students", "submit", "andor",
  "review", "open", "campus", "begin", "join",
  "resources", "salary", "develop", "diverse",
  "education", "publication", "current", "changes",
  "experience", "consideration", "five", "pdfs",
  "opportunity", "background", "available",
  "textindent25in", "home", "computer",
  "annoucement", "announcement", "document", "units", "hours",
  "this", "list", "located", "addresses", "letter",
  "project", "fontfamily", "send", "new",
  "pdf", "search", "august", "across",
  "msostylepriority99", "without", "school",
  "benefits", "one", "email", "strong", "includes",
  "67698693", "baylor", "description", "north", "nov",
  "action", "gleonall", "schools", "skype", "continue",
  "relevant", "existing", "1st", "text", "thank",
  "references", "three", "full", "dear", "350", "506",
  "looking", "msolevelnumberformatbullet", "2018",
  "will", "contact", "include", "including", "sciences",
  "candidates", "department", "application", "program",
  "work", "related", "applications", "position",
  "candidate", "state", "msolevelnumberposition",
  "must", "indent25in", "time", "university",
  "can", "apply", "msoleveltabstopnone", "also",
  "institute", "within", "date", "qualifications",
  "may", "cover", "msolevelnumberleft", "center",
  "working", "information",
  "undergraduate", "questions", "phd", "change"
)

# The stop list contains punctuation-free artefacts ("andor",
# "textindent25in", "msolevelnumberformatbullet"), so punctuation has to be
# stripped BEFORE matching; otherwise "and/or" or "text-indent:.25in" never
# match and only collapse into those tokens later, when wordcloud() cleans
# raw text itself.
test2 <- removePunctuation(tolower(test3))
test2 <- removeWords(test2, job_stopwords)

# Drop any remaining "msolevel..." token, every occurrence, one token at a
# time. The previous pattern "msolevel.*" was greedy and applied once, so it
# deleted all text from the first hit to the end of the line.
test2 <- stringr::str_remove_all(test2, "msolevel\\S*")

wordcloud(test2, max.words = 20)

# Copy the extracted (unfiltered) text to the system clipboard.
clipr::write_clip(test)
# Delete markup tags from text.
#
# htmlString: character vector that may contain "<...>" tags.
# Returns a character vector of the same length in which every span from a
# "<" to the nearest following ">" has been removed; text between tags is
# left untouched.
cleanFun <- function(htmlString) {
  tag_pattern <- "<.*?>"
  gsub(pattern = tag_pattern, replacement = "", x = htmlString)
}
# Per-message HTML-to-text extraction, followed by a regex-based strip of the
# combined text and a re-export of the archive to CSV.

n_msgs <- length(data$content)
res <- vector("list", n_msgs)

# The loop has always started at message 3 (presumably the first two messages
# are not HTML -- TODO confirm). setdiff() keeps that start but yields an
# empty sequence when there are fewer than 3 messages, where 3:length(...)
# would count backwards.
for (i in setdiff(seq_len(n_msgs), 1:2)) {
  print(i)
  # Parse message i; the earlier code re-parsed message 2 on every pass.
  # The functions are namespace-qualified because the file never attaches
  # rvest.
  res[[i]] <- rvest::html_text(rvest::read_html(data$content[[i]]))
}

# NOTE(review): this replaces the per-message list built above with the
# tag-stripped version of the combined text; keep only one of the two
# approaches once the script is finished.
res <- cleanFun(content)

# merge_mbox_all() is given relative paths, so the working directory is
# switched to the download folder first.
# NOTE(review): setwd() changes global state for the rest of the session.
setwd("~/Downloads")

# NOTE(review): the value read here is overwritten by the next statement.
test <- read.csv("~/Downloads/jobs.csv")
test <- merge_mbox_all(path = "jobs.mbox",
                       out = "jobs.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment