Last active
August 29, 2015 14:15
-
-
Save jalapic/5e05ab273e3388b5bf74 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Quick illustration of dplyr / ggplot2
library(dplyr)
library(ggplot2)
library(magrittr)

# Install the babynames package only when it is not already available, so
# that re-running this script does not reinstall it every time.
if (!requireNamespace("babynames", quietly = TRUE)) {
  install.packages("babynames")
}

# Load the library; this makes the `babynames` data set available.
library(babynames)

## Take a look at the data
str(babynames)   # compact overview of the columns
head(babynames)  # first rows
tail(babynames)  # last rows
?babynames       # open the help page for the data set (interactive use)
# Think of some questions you'd like me to answer using this data ???

### e.g. which name was the most popular ever - and in which year?

# Rank every (year, sex, name) row by its raw count `n` ...
babynames %>% arrange(desc(n))

# ... and by the `prop` column (see ?babynames for its exact definition).
babynames %>% arrange(desc(prop))

# Same ranking, girls only. Filtering first means fewer rows are sorted;
# the resulting row order is the same as sorting and then filtering.
babynames %>%
  filter(sex == "F") %>%
  arrange(desc(prop))

# Girls only, with "Mary" left out so the other names become visible.
babynames %>%
  filter(sex == "F", name != "Mary") %>%
  arrange(desc(prop))
# e.g. How has the use of James changed over time?

# Keep every row whose name is exactly "James".
james <- filter(babynames, name == "James")
head(james)

# Line graph of the yearly count, with the line colour mapped to sex.
ggplot(data = james, mapping = aes(x = year, y = n)) +
  geom_line(mapping = aes(color = sex), lwd = 1) +
  scale_color_manual(values = c("firebrick1", "dodgerblue")) +
  theme_bw()
# Can do it all in one: pipe the filtered rows straight into ggplot().
# The ordinary pipe `%>%` supplies the data as ggplot()'s first argument, so
# neither the exposition pipe `%$%` nor an explicit `.` is needed here.
babynames %>%
  filter(name == "Chris") %>%
  ggplot(aes(year, n)) +
  geom_line(aes(color = sex), lwd = 1) +
  scale_color_manual(values = c("firebrick1", "dodgerblue")) +
  theme_bw()
## Let's compare three names...

# Girls' records for Jennifer, Sarah or Mary; both conditions are given to a
# single filter() call, which keeps rows satisfying all of them.
threenames <- babynames %>%
  filter(
    sex == "F",
    name == "Jennifer" | name == "Sarah" | name == "Mary"
  )
head(threenames)

# One coloured line per name.
ggplot(data = threenames, mapping = aes(x = year, y = n)) +
  geom_line(mapping = aes(group = name, color = name), lwd = 1) +
  scale_color_manual(values = c("firebrick1", "dodgerblue", "darkorange")) +
  theme_bw()
# Another way of doing the filtering above: test membership in a vector of
# names with %in% instead of chaining `==` comparisons with `|`.
mynames <- c("Kim", "Khloe", "Kendall", "Kourtney", "Kylie")

# `name %in% mynames` is already a logical vector, so it is passed to
# filter() directly rather than being compared against TRUE.
kards <- babynames %>%
  filter(sex == "F") %>%
  filter(name %in% mynames)
head(kards)

# One coloured line per name; five colours for the five names in `mynames`.
ggplot(kards, aes(year, n)) +
  geom_line(aes(group = name, color = name), lwd = 1) +
  scale_color_manual(
    values = c("firebrick1", "dodgerblue", "darkorange", "purple", "green1")
  ) +
  theme_bw()
## e.g. 2. Names that don't get used anymore...

# Total count per name for the years before 1945.
pre <-
  babynames %>%
  filter(year < 1945) %>%
  group_by(name) %>%
  summarize(npre = sum(n))

# Total count per name from 1945 onwards.
post <-
  babynames %>%
  filter(year >= 1945) %>%
  group_by(name) %>%
  summarize(npost = sum(n))

head(pre)
head(post)

# A full join keeps names that appear in either period. Notice that some
# rows have NA: the name has no rows in that period (i.e. a count of zero).
ournames <- full_join(pre, post, by = "name")
ournames

# Names recorded before 1945 but never from 1945 onwards, most used first.
ournames %>%
  filter(is.na(npost)) %>%
  arrange(desc(npre))
## Let's look at the distribution of each of these in one graph....

# Extract the top 10 vanished names (no rows from 1945 onwards, ranked by
# their pre-1945 total) as a character vector.
oldnames <- ournames %>%
  filter(is.na(npost)) %>%
  arrange(desc(npre)) %>%
  head(10) %>%
  .$name
oldnames

# All yearly rows for those names (both sexes are kept).
oldnames1 <- babynames %>%
  filter(name %in% oldnames)
head(oldnames1)

# Group by name AND sex: the data is not filtered by sex, so a name recorded
# for both sexes in the same year would otherwise be drawn as a single
# zig-zagging line. Names that occur for one sex only are drawn as before.
ggplot(oldnames1, aes(year, n)) +
  geom_line(aes(group = interaction(name, sex), color = name), lwd = 1) +
  theme_bw()
### Names beginning with "Adelaid"

# The pattern is anchored with "^" so that only names that *start* with
# "Adelaid" match; an unanchored pattern also matches names that merely
# contain it somewhere.
x <- babynames[grepl("^Adelaid", babynames$name), ]
unique(x$name)

# Yearly total over all matching girls' names.
adelaids <-
  babynames %>%
  filter(grepl("^Adelaid", name)) %>%
  filter(sex == "F") %>%
  group_by(year) %>%
  summarize(total = sum(n))
adelaids

# Single black line showing the combined yearly total.
ggplot(adelaids, aes(year, total)) +
  geom_line(color = "black", lwd = 1) +
  theme_bw()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment