Created
July 11, 2014 12:24
-
-
Save agoldst/d9782993efa8c977b2a6 to your computer and use it in GitHub Desktop.
R code used to produce the slides for this DH 2014 presentation: http://andrewgoldstone.com/blog/2014/07/02/dh2014/ . Generated by knitr::purl()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
opts_chunk$set(echo=F,warning=F,prompt=F,comment="", | |
autodep=T,cache=T,dev="tikz", | |
fig.width=4.5,fig.height=3,size ='footnotesize', | |
dev.args=list(pointsize=12)) | |
options(width=70) | |
options(tikzDefaultEngine="xetex") | |
options(tikzXelatexPackages=c( | |
"\\usepackage{tikz}\n", | |
"\\usepackage[active,tightpage,xetex]{preview}\n", | |
"\\usepackage{fontspec,xunicode}\n", | |
"\\setmainfont{Gill Sans}\n", | |
"\\PreviewEnvironment{pgfpicture}\n", | |
"\\setlength\\PreviewBorder{0pt}\n")) | |
library("xtable") | |
library("lubridate") | |
library("stringr") | |
library("dfrtopics") | |
dep_auto() | |
smoother <- function (...) { | |
geom_smooth(method="loess",span=0.5,fill="grey60",se=F,...) | |
} | |
mdir <- "/Users/agoldst/Documents/research/20c/hls/tmhls/dh2014/hls_v10K_k120" | |
meta <- read_metadata(file.path(mdir, "dfr-data", | |
c("elh_ci_all", | |
"mlr1905-1970", | |
"mlr1971-2013", | |
"modphil_all", | |
"nlh_all", | |
"pmla_all", | |
"res1925-1980", | |
"res1981-2012"), | |
"citations.CSV")) | |
m <- list() | |
m$keys <- read.csv(file.path(mdir,"keys.csv"),as.is=T) | |
m$doctops <- read.csv(file.path(mdir,"doc_topics.csv"),as.is=T) | |
m$n <- max(m$keys$topic) | |
m$vocab <- readLines(file.path(mdir,"vocab.txt")) | |
meta <- meta[meta$id %in% m$doctops$id,] | |
topic_classes <- read.csv(file.path(mdir,"keys_classed.csv"),as.is=T) | |
m$dtw <- doc_topics_wide(m$doctops,meta) | |
m$series <- topic_proportions_series_frame(topic_year_matrix(m$dtw)) | |
m$series$decade <- cut.Date(as.Date(m$series$year), | |
breaks=seq.Date(from=as.Date("1880-01-01"), | |
to=as.Date("2020-01-01"), | |
by="10 years")) | |
m$dtw$decade <- cut.Date(as.Date(m$dtw$pubdate), | |
breaks=seq.Date(from=as.Date("1880-01-01"), | |
to=as.Date("2020-01-01"), | |
by="10 years")) | |
js <- ddply(meta,"journaltitle",summarize, | |
name=str_trim(unique(journaltitle)),start=min(pubdate), | |
end=max(pubdate)) | |
js <- js[order(js$start),] | |
js$name <- str_c("*",js$name,"*") | |
cat(str_c(str_c(js$name," (",year(js$start),"--",year(js$end),")"), | |
collapse=" \n")) | |
top_words <- ddply(m$keys,"topic",transform,rank=order(weight,decreasing=T)) | |
top_words <- top_words[top_words$rank <= 3,] | |
top_words$x <- (top_words$topic - 1) %% 12 | |
top_words$y <- floor((top_words$topic - 1) / 12) | |
top_words$y <- top_words$y + rep(c(0,-0.25,0.25),times=nrow(top_words) / 3) | |
top_words$y <- -top_words$y | |
ggplot(top_words,aes(x=x,y=y,label=word,size=weight)) + geom_text() + | |
scale_size_continuous(range=c(2,3)) + | |
theme(legend.position="none", | |
line=element_blank(), | |
rect=element_blank(), | |
title=element_blank(), | |
axis.text=element_blank()) | |
slope_discrete <- function (decade, frac, cutoff, | |
earliest="1920-01-01", | |
latest="2010-01-01") { | |
earlier <- frac[as.Date(decade) >= as.Date(earliest) & | |
as.Date(decade) < as.Date(cutoff)] | |
later <- frac[as.Date(decade) > as.Date(cutoff) & | |
as.Date(decade) <= as.Date(latest)] | |
score <- sum(rep(earlier,times=length(later)) < | |
rep(later,each=length(earlier))) | |
score / (length(earlier) * length(later)) | |
} | |
recency_cutoff <- "1980-01-01" | |
topic_decades <- t(daply(m$dtw[,-1], "decade", function (d) colSums(d[,1:m$n]))) | |
series_dec <- topic_proportions_series_frame(topic_decades) | |
recent_topics <- function (s,cutoff) { | |
topic_slopes <- ddply(s,"topic", function (d) { | |
data.frame(topic=d$topic[1], | |
slope=slope_discrete(d$year,d$weight,cutoff)) | |
}) | |
topic_slopes$topic[topic_slopes$slope == 1] | |
} | |
recents <- recent_topics(series_dec,recency_cutoff) | |
recent_series <- m$series[m$series$topic %in% recents,] | |
topic_names <- daply(m$keys[m$keys$topic %in% recents,],"topic", | |
function (d) { | |
paste(paste(d$word[1:5],collapse=" "), | |
sep="") | |
}) | |
recent_series$topic <- factor(recent_series$topic,labels=topic_names) | |
m$series$recent <- m$series$topic %in% recents | |
ggplot(m$series,aes(year,weight, | |
color=recent)) + | |
smoother() + | |
facet_wrap(~ topic,nrow=12,scales="free_y") + | |
theme(legend.position="none", | |
rect=element_blank(), | |
title=element_blank(), | |
axis.text=element_blank(), | |
axis.ticks=element_blank(), | |
strip.text=element_blank(), | |
panel.grid=element_blank()) + | |
scale_color_manual(values=c("blue","orange")) | |
tnames <- str_c("• ",levels(recent_series$topic)) | |
cat(str_c(tnames,collapse="\\\\ ")) | |
topic_classes$recent <- ifelse(topic_classes$topic %in% recents, | |
"recent","not recent") | |
tally <- as.matrix(with(topic_classes,table(code,recent))) | |
print(xtable(tally,digits=0),comment=F) | |
decs <- seq.Date(from=as.Date("1950-01-01"), | |
to=as.Date("2000-01-01"), | |
by="10 years") | |
slps <- list() | |
rs <- list() | |
socs <- matrix(nrow=length(decs),ncol=3, | |
dimnames=list(as.character(decs), | |
c("total recent", | |
"S not recent", | |
"S recent"))) | |
for (d in seq_along(decs)) { | |
rs[[d]] <- recent_topics(series_dec,decs[d]) # series_dec calc'd above | |
slps[[d]] <- ddply(series_dec,.(topic),summarize, | |
cutoff=decs[d], | |
slope=slope_discrete(year,weight,decs[d])) | |
tab <- table(topic_classes$code,topic_classes$topic %in% rs[[d]]) | |
socs[d,1] <- length(rs[[d]]) | |
socs[d,2:3] <- tab["S",] | |
} | |
socs <- cbind(year(decs),socs) | |
colnames(socs)[1] <- "cutoff year" | |
print(xtable(socs,digits=0),include.rownames=F,comment=F) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment