Last active
July 17, 2018 13:20
-
-
Save halhen/5816f6aa102e4bbf02871bdc5ddda2b3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse) | |
library(tidytext) | |
# Contains both 1st and 2nd level codes; collisions won't appear though since 1-99 is tier 1 and 100+ is two | |
df.names <- read_csv2('../data/atus/codes.csv') %>% | |
mutate(code = as.integer(code)) | |
df.resp <- read_csv('../data/atus/atusresp.csv') | |
df.act <- read_csv('../data/atus/atusact.csv', col_types=cols(tustarttim = col_character(), tustoptime = col_character())) | |
df.sum <- read_csv('../data/atus/atussum.csv') | |
df <- df.act %>% | |
select(tucaseid, tuactivity_n, trcodep, trtier1p, trtier2p, tuactdur24) %>% | |
left_join(df.names %>% rename(activity = name), by=c('trcodep' = 'code')) %>% | |
left_join(df.sum %>% select(tucaseid, tesex, teage, tudiaryday, tryhhchild, tuyear, trsppres), by='tucaseid') | |
# Summary stats | |
df %>% | |
group_by(activity) %>% | |
summarize(n=n()) %>% | |
arrange(-n) | |
# tf-idf | |
df %>% | |
filter(teage >= 25, teage <= 45) %>% | |
filter(trtier1p != 18, trtier1p != 50) %>% | |
mutate(gender = ifelse(tesex==1, 'Male', 'Female'), | |
kids = ifelse(tryhhchild >= 0, 'Kids', 'No kids'), | |
group = paste0(kids, ' - ', gender)) %>% | |
group_by(gender, group, activity) %>% | |
summarize(n=n()) %>% | |
filter(n >= 20) %>% | |
bind_tf_idf(activity, group, n) %>% | |
group_by(group) %>% | |
top_n(10, tf_idf) %>% | |
ggplot(aes(reorder(activity, tf_idf), tf_idf, fill=gender)) + | |
geom_bar(stat='identity', show.legend=FALSE) + | |
scale_y_continuous(breaks=NULL) + | |
coord_flip() + | |
labs(x="", y="", title="Particular activities by gender and parenthood, 25-45 year olds", caption="@hnrklndbrg | Source: American Time Use Survey") + | |
facet_wrap(~ group, scales='free', ncol=2) + | |
hrbrthemes::theme_ipsum(grid='') | |
ggsave('/tmp/gender.png', width=12, height=6) | |
df %>% | |
filter(trtier1p != 18, trtier1p != 50) %>% | |
filter(teage < 85) %>% | |
mutate(age.group = trunc((teage + 5) / 10) * 10, | |
group = paste0(age.group - 5, '-', age.group + 4)) %>% | |
group_by(trtier1p, group, activity) %>% | |
summarize(n=n()) %>% | |
ungroup() %>% | |
filter(n >= 20) %>% | |
bind_tf_idf(activity, group, n) %>% | |
group_by(group) %>% | |
top_n(10, tf_idf) %>% | |
left_join(df.names %>% mutate(category = name), by=c('trtier1p'='code')) %>% | |
ggplot(aes(reorder(activity, tf_idf), tf_idf, fill=category)) + | |
geom_bar(stat='identity') + | |
scale_y_continuous(breaks=NULL) + | |
coord_flip() + | |
labs(x="", y="", title="Particular activities by age", caption="@hnrklndbrg | Source: American Time Use Survey") + | |
scale_fill_discrete(name='') + | |
facet_wrap(~ group, scales='free', ncol=2) + | |
hrbrthemes::theme_ipsum(grid='') + | |
theme(legend.position=c(0.7, 0.1)) | |
ggsave('/tmp/age.png', width=12, height=12) | |
# LDA | |
df.tmp <- df %>% | |
# exclude travel | |
filter(trtier1p != 18, trtier1p != 50) %>% | |
filter(tuyear >= 2013) %>% | |
#thursday | |
filter(tudiaryday == 5) %>% | |
group_by(tucaseid, activity) %>% | |
summarize(n=sum(tuactdur24)) | |
set.seed(1234) | |
lda <- df.tmp %>% | |
cast_dtm(tucaseid, activity, n) %>% | |
topicmodels::LDA(k=5) | |
# Classify and join with summary file to get demographics | |
df.tmp2 <- tidy(lda, matrix='gamma') %>% | |
group_by(document) %>% | |
filter(gamma == max(gamma)) %>% | |
left_join(df.sum %>% mutate(document = as.character(tucaseid)), by='document') | |
(p.demo <- df.tmp2 %>% | |
filter(teage < 85) %>% | |
mutate(age.group = trunc((teage + 5) / 10) * 10, | |
age.group = paste0(age.group - 5, '-', age.group + 4)) %>% | |
transmute(topic, | |
gender = ifelse(tesex == 1, 'Male', 'Female'), | |
age = age.group, | |
kids = ifelse(tryhhchild >= 0, 'Kids', 'No kids')) %>% | |
gather(key, value, gender:kids) %>% | |
group_by(topic, key, value) %>% | |
summarize(n=n()) %>% | |
mutate(p=n/sum(n)) %>% | |
ggplot(aes(value, p, fill=factor(topic))) + | |
geom_bar(stat='identity', show.legend=FALSE) + | |
scale_y_continuous(labels=scales::percent) + | |
facet_grid(topic~key, scales='free_x', space='free_x') + | |
labs(x="", y="") + | |
hrbrthemes::theme_ipsum(grid='Y') + | |
theme(strip.text=element_blank())) | |
(p.dim <- lda %>% | |
tidy(matrix='beta') %>% | |
filter(term != 'Sleeping') %>% | |
group_by(topic) %>% | |
top_n(10, beta) %>% | |
ggplot(aes(reorder(term, beta), beta, fill=factor(topic))) + | |
geom_bar(stat='identity', show.legend=FALSE) + | |
coord_flip() + | |
scale_y_continuous(labels=function(x) {rep("", length(x))}) + | |
labs(x="", y="") + | |
facet_grid(topic~., scales='free') + | |
hrbrthemes::theme_ipsum(grid='') + | |
theme(strip.text=element_blank())) | |
(p <- gridExtra::grid.arrange(p.dim, p.demo, ncol=2)) | |
ggsave('/tmp/lda.png', plot=p, width=12, height=10) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment