Skip to content

Instantly share code, notes, and snippets.

@jennybc
Created November 5, 2014 18:44
Show Gist options
  • Save jennybc/bd1626735dd221c30f1a to your computer and use it in GitHub Desktop.
Save jennybc/bd1626735dd221c30f1a to your computer and use it in GitHub Desktop.
Demonstrate that dplyr::left_join() can mangle factor levels
suppressPackageStartupMessages(library(dplyr))

Get word counts for Character * Chapter in LOTR trilogy

# Fetch the cleaned LOTR word-count data once; later runs reuse the local copy.
if (!file.exists("lotr_clean.tsv")) {
  download.file(paste0("https://raw.githubusercontent.com/jennybc/",
                       "lotr/master/lotr_clean.tsv"),
                destfile = "lotr_clean.tsv", method = "curl")
}
# Read strings as plain character vectors. The argument name is written out in
# full (`stringsAsFactors`) instead of relying on partial argument matching.
lotr_dat <- read.delim("lotr_clean.tsv", stringsAsFactors = FALSE)

Create Gender factor. Turn Film and Race into factors.

# Characters counted as female; every other character is labelled "Male".
females <- c(
  "Galadriel", "Arwen", "Lobelia Sackville-Baggins", "Rosie",
  "Mrs. Bracegirdle", "Eowyn", "Freda", "Rohan Maiden"
)
lotr_dat <- lotr_dat %>%
  mutate(
    # Explicit levels fix the film order rather than leaving it alphabetical.
    Film = factor(
      Film,
      levels = c(
        "The Fellowship Of The Ring",
        "The Two Towers",
        "The Return Of The King"
      )
    ),
    Gender = factor(ifelse(Character %in% females, "Female", "Male")),
    Race = factor(Race)
  )

Drop many races and get word count for Film * Gender * Race. Sadly, group_by() drops empty groups.

# Keep three races, drop the factor levels that are no longer used, then total
# the words for each Film x Gender x Race group.
lotr_words <- lotr_dat %>%
  filter(Race %in% c("Elf", "Hobbit", "Man")) %>%
  droplevels() %>%
  group_by(Film, Gender, Race) %>%
  summarise(Words = sum(Words))

Get all possible groups as preparation for post hoc rescue of empty groups.

# Full Film x Race x Gender grid built from the factor levels of lotr_words,
# so every combination of levels gets a row.
all_combns <- expand.grid(
  Film = levels(lotr_words$Film),
  Race = levels(lotr_words$Race),
  Gender = levels(lotr_words$Gender)
)

Film factors BEFORE the join: level order = order of the movies.

# Per-level counts of Film in the summarised data, inspected before the join.
summary(lotr_words$Film)
## The Fellowship Of The Ring             The Two Towers 
##                          5                          5 
##     The Return Of The King 
##                          6
# Same inspection on the full grid of combinations.
summary(all_combns$Film)
## The Fellowship Of The Ring             The Two Towers 
##                          6                          6 
##     The Return Of The King 
##                          6

Use left_join() to rescue empty groups

# No `by` argument is given; the message below reports the columns that were
# used as join keys.
lotr_tidy <- left_join(all_combns, lotr_words)
## Joining by: c("Film", "Race", "Gender")

Film factor AFTER the join: level order = alphabetical.

# In the output below, "The Return Of The King" is listed before
# "The Two Towers": the level order set when Film was created was not kept.
summary(lotr_tidy$Film)
## The Fellowship Of The Ring     The Return Of The King 
##                          6                          6 
##             The Two Towers 
##                          6
# Record the R version, platform and package versions for this run
# (dplyr 0.3.0.1 in the output below).
sessionInfo()
## R version 3.1.1 (2014-07-10)
## Platform: x86_64-apple-darwin10.8.0 (64-bit)
## 
## locale:
## [1] en_CA.UTF-8/en_CA.UTF-8/en_CA.UTF-8/C/en_CA.UTF-8/en_CA.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] dplyr_0.3.0.1
## 
## loaded via a namespace (and not attached):
##  [1] assertthat_0.1  DBI_0.3.1       digest_0.6.4    evaluate_0.5.5 
##  [5] formatR_1.0     htmltools_0.2.6 knitr_1.6.20    lazyeval_0.1.9 
##  [9] magrittr_1.0.1  parallel_3.1.1  Rcpp_0.11.3     rmarkdown_0.3.3
## [13] stringr_0.6.2   tools_3.1.1     yaml_2.1.13
#' ---
#' output:
#'   md_document:
#'     variant: markdown_github
#' ---
suppressPackageStartupMessages(library(dplyr))
#' Get word counts for Character * Chapter in LOTR trilogy
# Fetch the cleaned LOTR word-count data once; later runs reuse the local copy.
if (!file.exists("lotr_clean.tsv")) {
  download.file(paste0("https://raw.githubusercontent.com/jennybc/",
                       "lotr/master/lotr_clean.tsv"),
                destfile = "lotr_clean.tsv", method = "curl")
}
# Read strings as plain character vectors. The argument name is written out in
# full (`stringsAsFactors`) instead of relying on partial argument matching.
lotr_dat <- read.delim("lotr_clean.tsv", stringsAsFactors = FALSE)
#' Create Gender factor. Turn Film and Race into factors.
# Characters counted as female; every other character is labelled "Male".
females <- c(
  "Galadriel", "Arwen", "Lobelia Sackville-Baggins", "Rosie",
  "Mrs. Bracegirdle", "Eowyn", "Freda", "Rohan Maiden"
)
lotr_dat <- lotr_dat %>%
  mutate(
    # Explicit levels fix the film order rather than leaving it alphabetical.
    Film = factor(
      Film,
      levels = c(
        "The Fellowship Of The Ring",
        "The Two Towers",
        "The Return Of The King"
      )
    ),
    Gender = factor(ifelse(Character %in% females, "Female", "Male")),
    Race = factor(Race)
  )
#' Drop many races and get word count for Film * Gender * Race. Sadly,
#' group_by() drops empty groups.
# Keep three races, drop the factor levels that are no longer used, then total
# the words for each Film x Gender x Race group.
lotr_words <- lotr_dat %>%
  filter(Race %in% c("Elf", "Hobbit", "Man")) %>%
  droplevels() %>%
  group_by(Film, Gender, Race) %>%
  summarise(Words = sum(Words))
#' Get all possible groups as preparation for post hoc rescue of empty groups.
# Full Film x Race x Gender grid built from the factor levels of lotr_words,
# so every combination of levels gets a row.
all_combns <- expand.grid(
  Film = levels(lotr_words$Film),
  Race = levels(lotr_words$Race),
  Gender = levels(lotr_words$Gender)
)
#' Film factors BEFORE the join: level order = order of the movies.
# Per-level counts of Film in the summarised data and in the full grid,
# inspected before the join.
summary(lotr_words$Film)
summary(all_combns$Film)
#' Use left_join() to rescue empty groups
# No `by` argument is given, so the join keys are not stated explicitly here.
lotr_tidy <- left_join(all_combns, lotr_words)
#' Film factor AFTER the join: level order = alphabetical.
# Same inspection on the joined result, for comparison with the two calls above.
summary(lotr_tidy$Film)
# Record the R version and package versions used for this run.
sessionInfo()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment