@benmarwick
Last active December 16, 2015 11:09
Start with a zip file from JSTOR's Data for Research (DfR), end with a summary of topics, historical trends in topics, topics that are becoming more prominent, and topics that have declined. Entirely in R. Topics modeled using LDA.
############################################################
# History of Topics in American Archaeology
############################################################
#### prepare workspace
rm(list = ls(all.names = TRUE))
gc()
#### get data into the R session
# set R's working directory
setwd("C:/Users/Marwick/Downloads/JSTOR")
# Get zip file of CSVs from JSTOR and unzip
# this may take a few minutes...
unzip("2013.4.18.ynRq3AJu.zip")
# set working directory to newly created folder
# with lots of CSV files
setwd(paste0(getwd(), "/wordcounts"))
## get list of data, the CSV files of wordcounts
myfiles <- dir(pattern = "\\.(csv|CSV)$", full.names = TRUE)
# read CSV files into a R data object
library(plyr)
system.time(csvfiles <<- llply(myfiles, read.csv, .progress = "text", .inform = FALSE))
# assign file names to each dataframe in the list
names(csvfiles) <- myfiles
## reshape data
# `untable' each CSV file into a list of data frames, one data frame per file
system.time(csvfilesuntabled <<- sapply(1:length(csvfiles), function(x) {rep(csvfiles[[x]]$WORDCOUNTS, times = csvfiles[[x]]$WEIGHT)}))
names(csvfilesuntabled) <- myfiles
# go through each item of the list and randomise the order of the words
# within so they are not in alpha order (which distorts the topic modelling)
csvfilesuntabled <- lapply(csvfilesuntabled, function(i) sample(i, length(i)))
## bring in citations file with biblio data for each paper
cit <- read.csv("C:/Users/Marwick/Downloads/JSTOR/citations.CSV")
# replace for-slash with underscore to make it match the filenames
# and replace odd \t that was added during import
library(stringr)
cit$id <- str_extract(chartr('/', '_', cit$id), ".*[^\t]")
# limit list of citations to full length articles only
# note that citation type is not in the correct column
# and that we need \t in there also
citfla <- cit[cit$publisher == 'fla\t',]
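# (not in the original gist) a quick sanity check of what values that column
# actually holds, to confirm which label marks the full-length articles
table(cit$publisher)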
# subset from the wordcount data only the full length articles
# remove characters from the file names that are not in the citation list
# to enable matching with citation IDs
library(stringr)
names(csvfilesuntabled) <- str_replace_all(basename(names(csvfilesuntabled)), "^wordcounts_|\\.csv$|\\.CSV$", "")
# subset items in the list of wordcount data whose names are in
# the list of fla citation IDs
csvfilesuntabledfla <- csvfilesuntabled[which(names(csvfilesuntabled) %in% citfla$id)]
# put citation IDs in order with wordcount data names
citfla1 <- (merge(names(csvfilesuntabledfla), citfla, by.x=1, by.y="id"))
# create a variable that holds the year of publication for
# each article
citfla1$year <- str_extract(citfla1$issue, "[[:digit:]]{4}")
# now we have a table of citations with a unique ID for each article
# that is linked to the year of publication. We can come back to this
#### Make a corpus for cleaning text
# create a corpus of all the articles
library(tm)
(mycorpus <- Corpus(VectorSource(sapply(csvfilesuntabledfla, paste, collapse = " ")))) # one string per article
# clean and simplify the text
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)
mycorpus.clean <- tm_map(mycorpus, tm_reduce, tmFuns = funcs) # word-length limits are applied later, at the document-term matrix step
#### do POS tagging and remove non-nouns
options(java.parameters = "-Xmx2g" ) # voodoo to give Java 2gb RAM
# install packages
list.of.packages <- c("tm", "openNLP", "openNLPmodels.en")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(list.of.packages, require, character.only = TRUE)
# apply POS tags, this can take a very long time...
mycorpus.clean.POStag <- vector("list", length(mycorpus.clean))
for(i in 1:length(mycorpus.clean)){
mycorpus.clean.POStag[[i]] <- tmTagPOS(mycorpus.clean[[i]])
invisible(gc(v=FALSE)) # force empty RAM between each iteration
cat(paste0(i," of ", length(mycorpus.clean), " items\n"))
flush.console()
}
# split the word/tag strings to make a dataframe with words in
# one col and tags in another col
mycorpus.POStag.split <- llply(1:length(mycorpus.clean.POStag), function(i) read.table(textConnection(gsub(" ", "\n", mycorpus.clean.POStag[[i]])),
sep="/", stringsAsFactors=FALSE), .progress = "text", .inform = FALSE)
# just keep NNs
mycorpus.nouns <- llply(1:length(mycorpus.POStag.split), function(i) mycorpus.POStag.split[[i]][mycorpus.POStag.split[[i]]$V2 == "NN" | mycorpus.POStag.split[[i]]$V2 == "NNS",], .progress = "text", .inform = FALSE)
# make character string of NNs for topic modelling
mycorpus.noun.strings <- llply(1:length(mycorpus.nouns), function(i) paste(mycorpus.nouns[[i]][,1], collapse = ", "), .progress = "text", .inform = FALSE)
# inspect...
str(mycorpus.noun.strings)
# keep the document IDs for labelling the n-word chunks below, then remove
# everything else and garbage collect to free up memory
docids <- names(csvfilesuntabledfla)
rm(list = setdiff(ls(), c("mycorpus.noun.strings", "docids"))); gc()
# split vector every n words for topic modelling
n <- 1000
splitwords <- vector("list", length(mycorpus.noun.strings))
splitwordslist <- vector("list", length(mycorpus.noun.strings))
for(i in 1:length(mycorpus.noun.strings)){
splitwords[[i]] <- unlist(strsplit(mycorpus.noun.strings[[i]], " "))
splitwordslist[[i]] <- split(splitwords[[i]], seq_along(splitwords[[i]]) %/% n)
}
# combine list of lists into one big list of n word pieces
listofnwords <- unlist(splitwordslist, recursive = FALSE)
# make a label so that we can put years on the batches of n words
lengths <- sapply(1:length(mycorpus.noun.strings), function(i) length(splitwordslist[[i]]))
names(listofnwords) <- unlist(lapply(1:length(lengths), function(x) rep(docids[x], lengths[x])))
names(listofnwords) <- paste0(names(listofnwords), "_", unlist(lapply(lengths, function(x) seq_len(x))))
# remove odd characters
listofnwordsclean <- lapply(listofnwords, function(j) iconv(j, "latin1", "ASCII", sub=""))
#### clean some more (remove punctuation again, remove people's names, make compact dtm)
mycorpus <- Corpus(VectorSource(listofnwordsclean))
# remove punctuation
mycorpus <- llply(1:length(mycorpus), function(i) gsub("[[:punct:]]", "", mycorpus[[i]]))
# remove common human names http://www.fakenamegenerator.com/ http://www.census.gov/genealogy/www/data/2000surnames/index.html
babynames <- tolower(read.csv("https://raw.github.com/hadley/data-baby-names/master/baby-names.csv")[,2])
mycorpus <- lapply(mycorpus, function(j) j[!j %in% babynames])
# put back in corpus form again
mycorpus <- Corpus(VectorSource(mycorpus))
# create dtm from corpus
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)
mycorpusclean <- tm_map(mycorpus, FUN = tm_reduce, tmFuns = funcs)
mydtm <- DocumentTermMatrix(mycorpusclean, control = list(wordLengths=c(3,14)))
mydtm
mydtm.sparse <- removeSparseTerms(mydtm, 0.9)
# inspect and iterate to improve
inspect(mydtm.sparse[1:10,1:10]) # have a quick look at the document-term matrix
tot <- sum(colSums(as.matrix(mydtm.sparse))) # total number of words in the matrix
findFreqTerms(mydtm.sparse, lowfreq= 0.005 * tot) # edit stopword file here
# C:\emacs\R\win-library\3.0\tm\stopwords do not leave any spaces after the words
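# An alternative sketch, not part of the original workflow: rather than editing the
# installed stopword file, keep a project-specific stopword vector and strip those
# terms from the corpus directly; the words in `mystopwords` are only placeholders.
mystopwords <- c("fig", "press", "vol")
mycorpus.custom <- tm_map(mycorpus, removeWords, mystopwords)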
mycorpus.stopwords <- tm_map(mycorpus, FUN = tm_reduce, tmFuns = funcs)
mydtm.stopwords <- DocumentTermMatrix(mycorpus.stopwords, control = list(minWordLength = 4, stopwords = TRUE))
mydtm.stopwords.sparse <- removeSparseTerms(mydtm.stopwords, 0.8)
str(mydtm.stopwords.sparse )
findFreqTerms(mydtm.stopwords.sparse, lowfreq= 0.005 * tot)
mydtm.stopwords.sparse$dimnames$Terms[mydtm.stopwords.sparse$dimnames$Terms == "david"]
## on-the-fly visualisations
# hierarchical cluster plot
plot(hclust(dist(as.matrix(mydtm.stopwords.sparse))))
# visualise correlations between frequent terms in the document-term matrix (needs the Rgraphviz package)
plot(mydtm.stopwords.sparse, terms = findFreqTerms(mydtm.stopwords.sparse, lowfreq = 10)[1:20], corThreshold = 0.003)
#### Generate topic model
# clear the memory to make space
rm(list=setdiff(ls(), "mydtm")); gc()
# How many topics to use? Iterate over different numbers of topics to maximise the log likelihood
library(topicmodels)
# create the sequence that stores the number of topics to
# iterate over
sequ <- seq(20, 300, by = 20)
# basic loop to iterate over different topic numbers
ldas <- llply(sequ, function(k) logLik(LDA(mydtm, k)), .progress = "text")
# convert list output to dataframe
best.model.logLik <- data.frame(logLik = as.numeric(unlist(ldas)), ntopic = sequ)
# plot
with(best.model.logLik, plot(ntopic, logLik, type = 'l', xlab="Number of topics", ylab="Log likelihood"))
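# A sketch, not in the original gist, of fitting a single model once a number of topics
# has been chosen from the plot above, then inspecting it with the topicmodels accessors.
# `best.k`, `final.lda` and `topic.props` are illustrative names; nothing here is needed
# for the LDAviz section that follows.
best.k <- sequ[which.max(best.model.logLik$logLik)]
final.lda <- LDA(mydtm, k = best.k)
terms(final.lda, 10)                        # top ten terms in each topic
topic.props <- posterior(final.lda)$topics  # per-document topic proportions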
#### topics over time
# top five topics with a positive correlation over time
# top five topics with a negative correlation over time
# is archaeology becoming more scientific?
# is archaeology becoming more gender inclusive?
library(LDAviz)
# load all American Antiquity articles
load("~/teamviewer/kretzler_and_marwick.RData")
# extract word and document indices (the .RData is assumed to hold `nouns`, a document-term matrix in simple triplet form, and `dates`, the publication dates)
# https://github.com/cpsievert/xkcd-Topics/blob/master/preprocess.R
doc.id <- rep(nouns$i, nouns$v)
word.id <- rep(nouns$j, nouns$v)
n.iter <- 100
Ks <- seq(10, 200, by=20)
rez <- NULL
ctr <- 1
for (i in Ks) {
rez[[ctr]] <- fitLDA(word.id, doc.id, k=i, n.chains=3, n.iter=n.iter, alpha=25/i, beta=0.1)
ctr <- ctr + 1
}
# https://github.com/cpsievert/xkcd-Topics/blob/master/postprocess.R
# monitor convergence
for (i in seq_along(rez)){
plotLoglik(rez[[i]][[2]])
}
library(coda) # for mcmc(), mcmc.list() and gelman.diag() below
Rhats <- data.frame(matrix(numeric(2*length(Ks)), nrow=length(Ks), ncol=2))
for (i in seq_along(rez)){
l <- rez[[i]][[1]]
l2 <- split(c(l), rep(1:dim(l)[2], each=dim(l)[1]))
ml <- mcmc.list(lapply(l2, mcmc))
Rhats[i, ] <- gelman.diag(ml)[[1]]
}
names(Rhats) <- c("Point est.", "Upper C.I.")
Rhats$num.topics <- Ks
Rhats
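# rule of thumb: point estimates near 1 (with upper C.I.s below about 1.1) suggest
# the three chains have converged for that number of topics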
W <- length(word.id)
V <- length(unique(word.id))
sample_post <- function(fit){ #grabs samples from posterior
samples <- matrix(numeric(W*n.iter), nrow=W, ncol=n.iter)
samples[,1] <- fit[[1]][,sample(1:3, 1)] #randomly choose one chain
k <- max(samples[,1])
for (i in seq_len(n.iter)[-1]){
samples[,i] <- fitLDA(word.id, doc.id, k=k, n.chains=1, n.iter=1, topics.init=samples[,i-1],
alpha=25/k, beta=0.1)[[1]] # same alpha and beta as the fits above
}
return(samples)
}
compute_ll <- function(samples, k){ #Compute log p(w|T)
lik <- numeric(n.iter)
for (i in seq_len(n.iter)){
nj <- table(samples[,i])
nwj <- table(word.id, samples[,i])
lik[i] <- k*(lgamma(V*0.1) - V*lgamma(0.1)) + sum(colSums(lgamma(nwj + 0.1)) - lgamma(nj + V*0.1)) # beta = 0.1, matching the value used when fitting
}
return(lik)
}
harmonics <- numeric(length(rez))
stderr <- numeric(length(rez))
for (i in seq_along(rez)){
samplez <- sample_post(rez[[i]])
ll <- compute_ll(samplez, k=Ks[i])
harmonics[i] <- 1/mean(1/ll) #point estimate of p(w|T)
stderr[i] <- mean((ll - harmonics[i])^2) #sample variance
}
z <- qnorm(.975)
plot(Ks, harmonics, type="l", xlab="Number of Topics", ylab="Estimated log-likelihood")
#assess precision of the Monte Carlo approximation (see page 257 of Kaiser's notes)
lines(x=Ks, y=harmonics+z*sqrt((1/n.iter)*stderr), lty=2)
lines(x=Ks, y=harmonics-z*sqrt((1/n.iter)*stderr), lty=2, col=2)
points(Ks, harmonics)
names(harmonics) <- Ks
#save(harmonics, file="4-204topics-beta0.1-alpha25byk.rda")
library(lubridate)
library(ggplot2)
idx <- which.max(harmonics) #index pointing to optimal number of topics
opt <- Ks[idx]
topic.id <- rez[[idx]][[1]][,sample(1:3, 1)]
date.id <- dates$date[doc.id]
time.df <- data.frame(date.id, topic.id)
time.df$yr <- year(time.df$date.id)
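# A sketch, not in the original gist, of one way to get the trends promised in the
# description: the share of word tokens assigned to each topic per year, and the
# topics whose share correlates most strongly (positively or negatively) with time.
# `topic.year`, `trend` and `rising` are illustrative names.
topic.year <- as.data.frame(prop.table(table(time.df$yr, time.df$topic.id), margin = 1))
names(topic.year) <- c("yr", "topic.id", "share")
topic.year$yr <- as.numeric(as.character(topic.year$yr))
trend <- sapply(split(topic.year, topic.year$topic.id), function(d) cor(d$yr, d$share))
head(sort(trend, decreasing = TRUE), 5)  # five topics becoming more prominent
head(sort(trend), 5)                     # five topics in decline
# plot the yearly share of the five rising topics
rising <- names(head(sort(trend, decreasing = TRUE), 5))
ggplot(topic.year[topic.year$topic.id %in% rising, ],
       aes(yr, share, colour = topic.id)) + geom_line()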