Start with a zip file from JSTOR's Data for Research (DfR) and end with a summary of topics, historical trends in topics, topics that are becoming more prominent, and topics that have declined. Done entirely in R; topics are modelled using LDA.
############################################################
# History of Topics in American Archaeology
############################################################
#### prepare workspace
rm(list = ls(all.names = TRUE))
gc()
#### get data into the R session
# set R's working directory
setwd("C:/Users/Marwick/Downloads/JSTOR")
# Get zip file of CSVs from JSTOR and unzip
# this may take a few minutes...
unzip("2013.4.18.ynRq3AJu.zip")
# set working directory to newly created folder
# with lots of CSV files
setwd(paste0(getwd(), "/wordcounts"))
## get list of data, the CSV files of wordcounts
myfiles <- dir(pattern = "\\.(csv|CSV)$", full.names = TRUE)
# read CSV files into an R data object
library(plyr)
system.time(csvfiles <<- llply(myfiles, read.csv, .progress = "text", .inform = FALSE))
# assign file names to each dataframe in the list
names(csvfiles) <- myfiles
## reshape data
# `untable' each CSV file into a list of data frames, one data frame per file
system.time(csvfilesuntabled <<- sapply(1:length(csvfiles), function(x) {rep(csvfiles[[x]]$WORDCOUNTS, times = csvfiles[[x]]$WEIGHT)}))
names(csvfilesuntabled) <- myfiles
# go through each item of the list and randomise the order of the words
# within so they are not in alpha order (which distorts the topic modelling)
csvfilesuntabled <- lapply(csvfilesuntabled, function(i) sample(i, length(i)))
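# A toy illustration of the `untable' step above (not part of the original
# pipeline): each word in a DfR wordcounts CSV is simply repeated as many
# times as its weight.
untable.toy <- data.frame(WORDCOUNTS = c("site", "bone"), WEIGHT = c(3, 1),
                          stringsAsFactors = FALSE)
rep(untable.toy$WORDCOUNTS, times = untable.toy$WEIGHT) # "site" "site" "site" "bone"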
## bring in citations file with biblio data for each paper
cit <- read.csv("C:/Users/Marwick/Downloads/JSTOR/citations.CSV")
# replace forward slash with underscore to make it match the filenames
# and replace the odd \t that was added during import
library(stringr)
cit$id <- str_extract(chartr('/', '_', cit$id), ".*[^\t]")
# limit list of citations to full length articles only
# note that citation type is not in the correct column
# and that we need \t in there also
citfla <- cit[cit$publisher == 'fla\t',]
# subset from the wordcount data only the full length articles
# remove characters from the file names that are not in the citation list
# to enable matching with citation IDs
library(stringr)
names(csvfilesuntabled) <- str_extract(basename(names(csvfilesuntabled)), "[^wordcounts_].+[^.CSV]")
# subset items in the list of wordcount data whose names are in
# the list of fla citation IDs
csvfilesuntabledfla <- csvfilesuntabled[which(names(csvfilesuntabled) %in% citfla$id)]
# put citation IDs in order with wordcount data names
citfla1 <- (merge(names(csvfilesuntabledfla), citfla, by.x=1, by.y="id"))
# create a variable that holds the year of publication for
# each article
citfla1$year <- str_extract(citfla1$issue, "[[:digit:]]{4}")
# now we have a table of citations with a unique ID for each article
# that is linked to the year of publication. We can come back to this
#### Make a corpus for cleaning text
# create a corpus of all the articles
library(tm)
(mycorpus <- Corpus(VectorSource(csvfilesuntabledfla)))
# clean and simplify the text
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)
mycorpus.clean <- tm_map(mycorpus, FUN = tm_reduce, tmFuns = funcs)
#### do POS tagging and remove non-nouns
options(java.parameters = "-Xmx2g") # voodoo to give Java 2gb RAM
# install packages
list.of.packages <- c("tm", "openNLP", "openNLPmodels.en")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(list.of.packages, require, character.only = TRUE)
# apply POS tags, this can take a very long time...
mycorpus.clean.POStag <- vector("list", length(mycorpus.clean))
for(i in 1:length(mycorpus.clean)){
  mycorpus.clean.POStag[[i]] <- tmTagPOS(mycorpus.clean[[i]])
  invisible(gc(v=FALSE)) # force empty RAM between each iteration
  cat(paste0(i," of ", length(mycorpus.clean), " items\n"))
  flush.console()
}
# split the word/tag strings to make a dataframe with words in
# one col and tags in another col
mycorpus.POStag.split <- llply(1:length(mycorpus.clean.POStag), function(i) read.table(textConnection(gsub(" ", "\n", mycorpus.clean.POStag[[i]])),
                               sep="/", stringsAsFactors=FALSE), .progress = "text", .inform = FALSE)
# just keep NNs
mycorpus.nouns <- llply(1:length(mycorpus.POStag.split), function(i) mycorpus.POStag.split[[i]][mycorpus.POStag.split[[i]]$V2 == "NN" | mycorpus.POStag.split[[i]]$V2 == "NNS",], .progress = "text", .inform = FALSE)
# make character string of NNs for topic modelling
mycorpus.noun.strings <- llply(1:length(mycorpus.nouns), function(i) paste(mycorpus.nouns[[i]][,1], collapse = ", "), .progress = "text", .inform = FALSE)
# inspect...
str(mycorpus.noun.strings)
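# A minimal sketch (not from the original script) of how the word/tag
# strings are split: tmTagPOS() returns strings like the one below, and
# read.table() on a newline-separated version gives a word column (V1)
# and a tag column (V2).
eg.tagged <- "the/DT excavation/NN sites/NNS were/VBD deep/JJ"
eg.split <- read.table(textConnection(gsub(" ", "\n", eg.tagged)),
                       sep = "/", stringsAsFactors = FALSE)
eg.split[eg.split$V2 %in% c("NN", "NNS"), ] # keeps only the nouns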
# remove everything except the last object and garbage collect to free up memory
rm(list=setdiff(ls(), "mycorpus.noun.strings")); gc()
# split vector every n words for topic modelling
n <- 1000
splitwords <- vector("list", length(mycorpus.noun.strings))
splitwordslist <- vector("list", length(mycorpus.noun.strings))
for(i in 1:length(mycorpus.noun.strings)){
  splitwords[[i]] <- unlist(strsplit(mycorpus.noun.strings[[i]], " "))
  splitwordslist[[i]] <- (split(unlist(strsplit(splitwords[[i]], " ")), seq_along(unlist(strsplit(splitwords[[i]], " ")))%/%n))
}
# combine list of lists into one big list of n word pieces
listofnwords <- unlist(splitwordslist, recursive = FALSE)
# make a label so that we can put years on the batches of n words
lengths <- sapply(1:length(mycorpus.noun.strings), function(i) length(splitwordslist[[i]]))
names(listofnwords) <- unlist(lapply(1:length(lengths), function(x) rep(list.files()[x], lengths[x])))
names(listofnwords) <- paste0(names(listofnwords), "_", unlist(lapply(lengths, function(x) seq(1:x))))
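# An illustrative sketch (not in the original) of the chunking idiom used
# above: integer division of the position index splits a vector into
# pieces of roughly n elements each.
split(1:10, seq_along(1:10) %/% 4)
# gives three pieces: 1-3, 4-7 and 8-10 (the first piece is one element short)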
# remove odd characters
listofnwordsclean <- lapply(listofnwords, function(j) iconv(j, "latin1", "ASCII", sub=""))
#### clean some more (remove punctuation again, remove people's names, make compact dtm)
mycorpus <- Corpus(VectorSource(listofnwordsclean))
# remove punctuation
mycorpus <- llply(1:length(mycorpus), function(i) gsub("[[:punct:]]", "", mycorpus[[i]]))
# remove common human names http://www.fakenamegenerator.com/ http://www.census.gov/genealogy/www/data/2000surnames/index.html
babynames <- tolower(read.csv("https://raw.github.com/hadley/data-baby-names/master/baby-names.csv")[,2])
mycorpus <- lapply(mycorpus, function(j) j[!j %in% babynames])
# put back in corpus form again
mycorpus <- Corpus(VectorSource(mycorpus))
# create dtm from corpus
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)
mycorpusclean <- tm_map(mycorpus, FUN = tm_reduce, tmFuns = funcs)
mydtm <- DocumentTermMatrix(mycorpusclean, control = list(wordLengths=c(3,14)))
mydtm
mydtm.sparse <- removeSparseTerms(mydtm, 0.9)
# inspect and iterate to improve
inspect(mydtm.sparse[1:10,1:10]) # have a quick look at the term document matrix
tot <- sum(colSums(as.matrix(mydtm.sparse))) # total number of words in the matrix
findFreqTerms(mydtm.sparse, lowfreq = 0.005 * tot) # check the high-frequency terms and edit the stopword file here:
# C:\emacs\R\win-library\3.0\tm\stopwords (do not leave any spaces after the words)
mycorpus.stopwords <- tm_map(mycorpus, FUN = tm_reduce, tmFuns = funcs)
mydtm.stopwords <- DocumentTermMatrix(mycorpus.stopwords, control = list(minWordLength = 4, stopwords = TRUE))
mydtm.stopwords.sparse <- removeSparseTerms(mydtm.stopwords, 0.8)
str(mydtm.stopwords.sparse)
findFreqTerms(mydtm.stopwords.sparse, lowfreq = 0.005 * tot)
mydtm.stopwords.sparse$dimnames$Terms[mydtm.stopwords.sparse$dimnames$Terms == "david"]
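# A possibly less fragile alternative to editing tm's packaged stopword
# file (a sketch, not part of the original workflow): keep a project-specific
# stopword vector and strip those terms from the corpus with removeWords().
# The words listed here are hypothetical examples.
my.extra.stopwords <- c("can", "may", "upon", "also", "however")
mycorpus.customstop <- tm_map(mycorpus, removeWords, my.extra.stopwords)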
## on-the-fly visualisations
# hierarchical cluster plot
plot(hclust(dist(mydtm.stopwords.sparse)))
# visualisation of the correlations within a term-document matrix
plot((mydtm.stopwords.sparse), terms = findFreqTerms((mydtm.stopwords.sparse), lowfreq = 10)[1:20], corThreshold = 0.003)
#### Generate topic model
# clear the memory to make space
rm(list=setdiff(ls(), "mydtm")); gc()
# How many topics to use? Iterate with different numbers of topics to maximise the log-likelihood
library(topicmodels)
# create the sequence that stores the number of topics to
# iterate over
sequ <- seq(20, 300, by = 20)
# basic loop to iterate over different topic numbers
ldas <- llply(sequ, function(k) LDA(mydtm, k), .progress = "text")
# convert list output to dataframe
best.model.logLik <- data.frame(logLik = as.numeric(lapply(ldas, logLik)), ntopic = sequ)
# plot
with(best.model.logLik, plot(ntopic, logLik, type = 'l', xlab = "Number of topics", ylab = "Log likelihood"))
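# A sketch (not in the original) of inspecting the model once a number of
# topics has been chosen from the plot above; k.best and lda.best are
# names introduced here for illustration.
k.best <- best.model.logLik$ntopic[which.max(best.model.logLik$logLik)]
lda.best <- LDA(mydtm, k.best)
terms(lda.best, 10) # ten highest-probability terms in each topic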
#### topics over time
# top five topics for +ve correlation (see the sketch below)
# top five topics for -ve correlation
# is archy more scientific?
# is archy more gender inclusive?
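# One possible sketch of the planned topics-over-time analysis (not in the
# original). It assumes a vector `year.of.chunk', one publication year per
# row of mydtm, rebuilt from citfla1; that vector is not constructed in
# this script, so the lines are left commented out.
# topic.props <- posterior(lda.best)$topics # lda.best from the sketch above
# cors <- apply(topic.props, 2, function(p) cor(p, as.numeric(year.of.chunk)))
# head(sort(cors, decreasing = TRUE), 5) # five topics trending upwards
# head(sort(cors), 5)                    # five topics in decline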
############################################################
# Second script: choose the number of topics with LDAviz
# and MCMC convergence diagnostics
############################################################
library(LDAviz)
library(coda) # for mcmc(), mcmc.list() and gelman.diag() used below
# load all American Antiquity articles
load("~/teamviewer/kretzler_and_marwick.RData")
# extract words and docs
# https://github.com/cpsievert/xkcd-Topics/blob/master/preprocess.R
doc.id <- rep(nouns$i, nouns$v)
word.id <- rep(nouns$j, nouns$v)
n.iter <- 100
Ks <- seq(10, 200, by=20)
rez <- NULL
ctr <- 1
for (i in Ks) {
  rez[[ctr]] <- fitLDA(word.id, doc.id, k=i, n.chains=3, n.iter=n.iter, alpha=25/i, beta=0.1)
  ctr <- ctr + 1
}
# https://github.com/cpsievert/xkcd-Topics/blob/master/postprocess.R
# monitor convergence
for (i in seq_along(rez)){
  plotLoglik(rez[[i]][[2]])
}
Rhats <- data.frame(matrix(numeric(2*length(Ks)), nrow=length(Ks), ncol=2))
for (i in seq_along(rez)){
  l <- rez[[i]][[1]]
  l2 <- split(c(l), rep(1:dim(l)[2], each=dim(l)[1]))
  ml <- mcmc.list(lapply(l2, mcmc))
  Rhats[i, ] <- gelman.diag(ml)[[1]]
}
names(Rhats) <- c("Point est.", "Upper C.I.")
Rhats$num.topics <- Ks
Rhats
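# As a rough rule of thumb (not from the original script), chains are
# usually treated as converged when the Gelman-Rubin point estimate is
# below about 1.1.
Ks[Rhats[, "Point est."] < 1.1] # candidate numbers of topics that have converged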
W <- length(word.id)
V <- length(unique(word.id))
sample_post <- function(fit){ # grabs samples from posterior
  samples <- matrix(numeric(W*n.iter), nrow=W, ncol=n.iter)
  samples[,1] <- fit[[1]][,sample(1:3, 1)] # randomly choose one chain
  k <- max(samples[,1])
  for (i in seq_len(n.iter)[-1]){
    samples[,i] <- fitLDA(word.id, doc.id, k=k, n.chains=1, n.iter=1, topics.init=samples[,i-1],
                          alpha=25/k, beta=0.1)[[1]] # alpha = 25/k, as in the fits above
  }
  return(samples)
}
compute_ll <- function(samples, k){ # Compute log p(w|T)
  lik <- numeric(n.iter)
  for (i in seq_len(n.iter)){
    nj <- table(samples[,i])
    nwj <- table(word.id, samples[,i])
    lik[i] <- k*(lgamma(V*0.01) - V*lgamma(0.01)) + sum(colSums(lgamma(nwj + 0.01)) - lgamma(nj + V*0.01))
  }
  return(lik)
}
harmonics <- numeric(length(rez))
stderr <- numeric(length(rez))
for (i in seq_along(rez)){
  samplez <- sample_post(rez[[i]])
  ll <- compute_ll(samplez, k=Ks[i])
  harmonics[i] <- 1/mean(1/ll) # point estimate of p(w|T)
  stderr[i] <- mean((ll - harmonics[i])^2) # sample variance
}
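# Background, for reference (not in the original script): harmonics[i]
# follows the harmonic mean estimator of the marginal likelihood p(w|T)
# of Newton & Raftery (1994), as used by Griffiths & Steyvers (2004):
#   p(w|T) is approximated by ( (1/M) * sum over m of 1/p(w|z^(m), T) )^(-1),
# where the z^(m) are the M = n.iter posterior samples drawn by sample_post().
# compute_ll() returns log p(w|z^(m), T), and the harmonic mean above is
# taken directly over those log values.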
z <- qnorm(.975)
plot(Ks, harmonics, type="l", xlab="Number of Topics", ylab="Estimated log-likelihood")
# assess precision of the Monte Carlo approximation (see page 257 of Kaiser's notes)
lines(x=Ks, y=harmonics+z*sqrt((1/n.iter)*stderr), lty=2)
lines(x=Ks, y=harmonics-z*sqrt((1/n.iter)*stderr), lty=2, col=2)
points(Ks, harmonics)
names(harmonics) <- Ks
#save(harmonics, file="4-204topics-beta0.1-alpha25byk.rda")
library(lubridate)
library(ggplot2)
idx <- which.max(harmonics) # index pointing to optimal number of topics
opt <- Ks[idx]
topic.id <- rez[[idx]][[1]][,sample(1:3, 1)]
date.id <- dates$date[doc.id]
time.df <- data.frame(date.id, topic.id)
time.df$yr <- year(time.df$date.id)
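# A sketch of one way to finish the trend analysis (not in the original):
# the share of word tokens assigned to each topic, by year of publication.
# In practice one would subset to a handful of topics of interest first.
topic.share <- as.data.frame(prop.table(table(time.df$yr, time.df$topic.id), 1))
names(topic.share) <- c("yr", "topic", "share")
topic.share$yr <- as.numeric(as.character(topic.share$yr))
ggplot(topic.share, aes(yr, share, colour = topic)) +
  geom_line() +
  labs(x = "Year", y = "Proportion of tokens")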