suppressPackageStartupMessages(library(dplyr))
Get word counts for each Character * Chapter combination in the LOTR trilogy.
# Fetch the LOTR word-count data once; later runs reuse the local copy.
if (!file.exists("lotr_clean.tsv")) {
  download.file(
    paste0("https://raw.githubusercontent.com/jennybc/",
           "lotr/master/lotr_clean.tsv"),
    destfile = "lotr_clean.tsv",
    # method = "curl" makes download.file() use the external curl tool.
    method = "curl"
  )
}
# Spell out stringsAsFactors in full instead of relying on partial argument
# matching; character columns are read as character, not factor.
lotr_dat <- read.delim("lotr_clean.tsv", stringsAsFactors = FALSE)
Create Gender factor. Turn Film and Race into factors.
# Characters coded as female; every other character is labelled "Male".
females <- c(
  "Galadriel", "Arwen", "Lobelia Sackville-Baggins", "Rosie",
  "Mrs. Bracegirdle", "Eowyn", "Freda", "Rohan Maiden"
)
# Film levels are listed explicitly in this order; Gender and Race use
# factor()'s default level ordering.
lotr_dat <- lotr_dat %>%
  mutate(
    Film = factor(
      Film,
      levels = c(
        "The Fellowship Of The Ring",
        "The Two Towers",
        "The Return Of The King"
      )
    ),
    Gender = factor(ifelse(Character %in% females, "Female", "Male")),
    Race = factor(Race)
  )
Keep only the Elf, Hobbit and Man races and get the word count for each Film * Gender * Race combination. Sadly, group_by() drops empty groups.
# Keep the Elf, Hobbit and Man rows, drop factor levels that are no longer
# used, then total Words within each Film x Gender x Race group.
lotr_words <-
  lotr_dat %>%
  filter(Race %in% c("Elf", "Hobbit", "Man")) %>%
  droplevels() %>%
  group_by(Film, Gender, Race) %>%
  summarise(Words = sum(Words))
Get all possible groups as preparation for post hoc rescue of empty groups.
# Cross every Film, Race and Gender level of lotr_words: one row per
# possible combination.
all_combns <- expand.grid(
  Film = levels(lotr_words$Film),
  Race = levels(lotr_words$Race),
  Gender = levels(lotr_words$Gender)
)
Film factor BEFORE the join: level order = order of the movies.
summary(lotr_words[["Film"]])
## The Fellowship Of The Ring The Two Towers
## 5 5
## The Return Of The King
## 6
summary(all_combns[["Film"]])
## The Fellowship Of The Ring The Two Towers
## 6 6
## The Return Of The King
## 6
Use left_join() to rescue empty groups
lotr_tidy <- all_combns %>%
  left_join(lotr_words)
## Joining by: c("Film", "Race", "Gender")
Film factor AFTER the join: level order = alphabetical.
summary(lotr_tidy[["Film"]])
## The Fellowship Of The Ring The Return Of The King
## 6 6
## The Two Towers
## 6
utils::sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
##
## locale:
## [1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dplyr_0.3.0.1
##
## loaded via a namespace (and not attached):
## [1] assertthat_0.1 DBI_0.3.1 digest_0.6.4 evaluate_0.5.5
## [5] formatR_1.0 htmltools_0.2.6 knitr_1.6.20 lazyeval_0.1.9
## [9] magrittr_1.0.1 parallel_3.1.1 Rcpp_0.11.3 rmarkdown_0.3.3
## [13] stringr_0.6.2 tools_3.1.1 yaml_2.1.13