Skip to content

Instantly share code, notes, and snippets.

@halhen
Last active July 17, 2018 13:20
Show Gist options
  • Save halhen/5816f6aa102e4bbf02871bdc5ddda2b3 to your computer and use it in GitHub Desktop.
Save halhen/5816f6aa102e4bbf02871bdc5ddda2b3 to your computer and use it in GitHub Desktop.
library(tidyverse)
library(tidytext)
# Contains both 1st and 2nd level codes; collisions won't appear though since 1-99 is tier 1 and 100+ is two
df.names <- read_csv2('../data/atus/codes.csv') %>%
mutate(code = as.integer(code))
df.resp <- read_csv('../data/atus/atusresp.csv')
df.act <- read_csv('../data/atus/atusact.csv', col_types=cols(tustarttim = col_character(), tustoptime = col_character()))
df.sum <- read_csv('../data/atus/atussum.csv')
df <- df.act %>%
select(tucaseid, tuactivity_n, trcodep, trtier1p, trtier2p, tuactdur24) %>%
left_join(df.names %>% rename(activity = name), by=c('trcodep' = 'code')) %>%
left_join(df.sum %>% select(tucaseid, tesex, teage, tudiaryday, tryhhchild, tuyear, trsppres), by='tucaseid')
# Summary stats
df %>%
group_by(activity) %>%
summarize(n=n()) %>%
arrange(-n)
# tf-idf
df %>%
filter(teage >= 25, teage <= 45) %>%
filter(trtier1p != 18, trtier1p != 50) %>%
mutate(gender = ifelse(tesex==1, 'Male', 'Female'),
kids = ifelse(tryhhchild >= 0, 'Kids', 'No kids'),
group = paste0(kids, ' - ', gender)) %>%
group_by(gender, group, activity) %>%
summarize(n=n()) %>%
filter(n >= 20) %>%
bind_tf_idf(activity, group, n) %>%
group_by(group) %>%
top_n(10, tf_idf) %>%
ggplot(aes(reorder(activity, tf_idf), tf_idf, fill=gender)) +
geom_bar(stat='identity', show.legend=FALSE) +
scale_y_continuous(breaks=NULL) +
coord_flip() +
labs(x="", y="", title="Particular activities by gender and parenthood, 25-45 year olds", caption="@hnrklndbrg | Source: American Time Use Survey") +
facet_wrap(~ group, scales='free', ncol=2) +
hrbrthemes::theme_ipsum(grid='')
ggsave('/tmp/gender.png', width=12, height=6)
df %>%
filter(trtier1p != 18, trtier1p != 50) %>%
filter(teage < 85) %>%
mutate(age.group = trunc((teage + 5) / 10) * 10,
group = paste0(age.group - 5, '-', age.group + 4)) %>%
group_by(trtier1p, group, activity) %>%
summarize(n=n()) %>%
ungroup() %>%
filter(n >= 20) %>%
bind_tf_idf(activity, group, n) %>%
group_by(group) %>%
top_n(10, tf_idf) %>%
left_join(df.names %>% mutate(category = name), by=c('trtier1p'='code')) %>%
ggplot(aes(reorder(activity, tf_idf), tf_idf, fill=category)) +
geom_bar(stat='identity') +
scale_y_continuous(breaks=NULL) +
coord_flip() +
labs(x="", y="", title="Particular activities by age", caption="@hnrklndbrg | Source: American Time Use Survey") +
scale_fill_discrete(name='') +
facet_wrap(~ group, scales='free', ncol=2) +
hrbrthemes::theme_ipsum(grid='') +
theme(legend.position=c(0.7, 0.1))
ggsave('/tmp/age.png', width=12, height=12)
# LDA
df.tmp <- df %>%
# exclude travel
filter(trtier1p != 18, trtier1p != 50) %>%
filter(tuyear >= 2013) %>%
#thursday
filter(tudiaryday == 5) %>%
group_by(tucaseid, activity) %>%
summarize(n=sum(tuactdur24))
set.seed(1234)
lda <- df.tmp %>%
cast_dtm(tucaseid, activity, n) %>%
topicmodels::LDA(k=5)
# Classify and join with summary file to get demographics
df.tmp2 <- tidy(lda, matrix='gamma') %>%
group_by(document) %>%
filter(gamma == max(gamma)) %>%
left_join(df.sum %>% mutate(document = as.character(tucaseid)), by='document')
(p.demo <- df.tmp2 %>%
filter(teage < 85) %>%
mutate(age.group = trunc((teage + 5) / 10) * 10,
age.group = paste0(age.group - 5, '-', age.group + 4)) %>%
transmute(topic,
gender = ifelse(tesex == 1, 'Male', 'Female'),
age = age.group,
kids = ifelse(tryhhchild >= 0, 'Kids', 'No kids')) %>%
gather(key, value, gender:kids) %>%
group_by(topic, key, value) %>%
summarize(n=n()) %>%
mutate(p=n/sum(n)) %>%
ggplot(aes(value, p, fill=factor(topic))) +
geom_bar(stat='identity', show.legend=FALSE) +
scale_y_continuous(labels=scales::percent) +
facet_grid(topic~key, scales='free_x', space='free_x') +
labs(x="", y="") +
hrbrthemes::theme_ipsum(grid='Y') +
theme(strip.text=element_blank()))
(p.dim <- lda %>%
tidy(matrix='beta') %>%
filter(term != 'Sleeping') %>%
group_by(topic) %>%
top_n(10, beta) %>%
ggplot(aes(reorder(term, beta), beta, fill=factor(topic))) +
geom_bar(stat='identity', show.legend=FALSE) +
coord_flip() +
scale_y_continuous(labels=function(x) {rep("", length(x))}) +
labs(x="", y="") +
facet_grid(topic~., scales='free') +
hrbrthemes::theme_ipsum(grid='') +
theme(strip.text=element_blank()))
(p <- gridExtra::grid.arrange(p.dim, p.demo, ncol=2))
ggsave('/tmp/lda.png', plot=p, width=12, height=10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment