@benmarwick
Last active December 16, 2015 11:09
Start with a zip file from JSTOR's Data for Research (DfR), end with a summary of topics, historical trends in topics, topics that are becoming more prominent, and topics that have declined. Entirely in R. Topics modeled using LDA.
############################################################
# History of Topics in American Archaeology
############################################################
#### prepare workspace
rm(list = ls(all.names = TRUE))
gc()
#### get data into the R session
# set R's working directory
setwd("C:/Users/Marwick/Downloads/JSTOR")
# Get zip file of CSVs from JSTOR and unzip
# this may take a few minutes...
unzip("2013.4.18.ynRq3AJu.zip")
# set working directory to newly created folder
# with lots of CSV files
setwd(paste0(getwd(), "/wordcounts"))
## get list of data, the CSV files of wordcounts
myfiles <- dir(pattern = "\\.(csv|CSV)$", full.names = TRUE)
# read CSV files into a R data object
library(plyr)
system.time(csvfiles <<- llply(myfiles, read.csv, .progress = "text", .inform = FALSE))
# assign file names to each dataframe in the list
names(csvfiles) <- myfiles
## reshape data
# `untable' each CSV file into a list of data frames, one data frame per file
system.time(csvfilesuntabled <<- sapply(1:length(csvfiles), function(x) {rep(csvfiles[[x]]$WORDCOUNTS, times = csvfiles[[x]]$WEIGHT)}))
names(csvfilesuntabled) <- myfiles
# go through each item of the list and randomise the order of the words
# within so they are not in alpha order (which distorts the topic modelling)
csvfilesuntabled <- lapply(csvfilesuntabled, function(i) sample(i, length(i)))
## bring in citations file with biblio data for each paper
cit <- read.csv("C:/Users/Marwick/Downloads/JSTOR/citations.CSV")
# replace for-slash with underscore to make it match the filenames
# and replace odd \t that was added during import
library(stringr)
cit$id <- str_extract(chartr('/', '_', cit$id), ".*[^\t]")
# limit list of citations to full length articles only
# note that citation type is not in the correct column
# and that we need \t in there also
citfla <- cit[cit$publisher == 'fla\t',]
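# (not in the original gist) a quick sanity check of what values that column
# actually holds, to confirm which label marks the full-length articles
table(cit$publisher)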
# subset from the wordcount data only the full length articles
# remove characters from the file names that are not in the citation list
# to enable matching with citation IDs
library(stringr)
names(csvfilesuntabled) <- str_replace_all(basename(names(csvfilesuntabled)), "^wordcounts_|\\.csv$|\\.CSV$", "")
# subset items in the list of wordcount data whose names are in
# the list of fla citation IDs
csvfilesuntabledfla <- csvfilesuntabled[which(names(csvfilesuntabled) %in% citfla$id)]
# put citation IDs in order with wordcount data names
citfla1 <- (merge(names(csvfilesuntabledfla), citfla, by.x=1, by.y="id"))
# create a variable that holds the year of publication for
# each article
citfla1$year <- str_extract(citfla1$issue, "[[:digit:]]{4}")
# now we have a table of citations with a unique ID for each article
# that is linked to the year of publication. We can come back to this
#### Make a corpus for cleaning text
# create a corpus of all the articles
library(tm)
(mycorpus <- Corpus(VectorSource(sapply(csvfilesuntabledfla, paste, collapse = " ")))) # one string per article
# clean and simplify the text
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)
mycorpus.clean <- tm_map(mycorpus, tm_reduce, tmFuns = funcs) # word-length limits are applied later, at the document-term matrix step
#### do POS tagging and remove non-nouns
options(java.parameters = "-Xmx2g" ) # voodoo to give Java 2gb RAM
# install packages
list.of.packages <- c("tm", "openNLP", "openNLPmodels.en")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
lapply(list.of.packages, require, character.only = TRUE)
# apply POS tags, this can take a very long time...
mycorpus.clean.POStag <- vector("list", length(mycorpus.clean))
for(i in 1:length(mycorpus.clean)){
mycorpus.clean.POStag[[i]] <- tmTagPOS(mycorpus.clean[[i]])
invisible(gc(v=FALSE)) # force empty RAM between each iteration
cat(paste0(i," of ", length(mycorpus.clean), " items\n"))
flush.console()
}
# split the word/tag strings to make a dataframe with words in
# one col and tags in another col
mycorpus.POStag.split <- llply(1:length(mycorpus.clean.POStag), function(i) read.table(textConnection(gsub(" ", "\n", mycorpus.clean.POStag[[i]])),
sep="/", stringsAsFactors=FALSE), .progress = "text", .inform = FALSE)
# just keep NNs
mycorpus.nouns <- llply(1:length(mycorpus.POStag.split), function(i) mycorpus.POStag.split[[i]][mycorpus.POStag.split[[i]]$V2 == "NN" | mycorpus.POStag.split[[i]]$V2 == "NNS",], .progress = "text", .inform = FALSE)
# make character string of NNs for topic modelling
mycorpus.noun.strings <- llply(1:length(mycorpus.nouns), function(i) paste(mycorpus.nouns[[i]][,1], collapse = ", "), .progress = "text", .inform = FALSE)
# inspect...
str(mycorpus.noun.strings)
# keep the document IDs for labelling the n-word chunks below, then remove
# everything else and garbage collect to free up memory
docids <- names(csvfilesuntabledfla)
rm(list = setdiff(ls(), c("mycorpus.noun.strings", "docids"))); gc()
# split vector every n words for topic modelling
n <- 1000
splitwords <- vector("list", length(mycorpus.noun.strings))
splitwordslist <- vector("list", length(mycorpus.noun.strings))
for(i in 1:length(mycorpus.noun.strings)){
splitwords[[i]] <- unlist(strsplit(mycorpus.noun.strings[[i]], " "))
splitwordslist[[i]] <- split(splitwords[[i]], seq_along(splitwords[[i]]) %/% n)
}
# combine list of lists into one big list of n word pieces
listofnwords <- unlist(splitwordslist, recursive = FALSE)
# make a label so that we can put years on the batches of n words
lengths <- sapply(1:length(mycorpus.noun.strings), function(i) length(splitwordslist[[i]]))
names(listofnwords) <- unlist(lapply(1:length(lengths), function(x) rep(docids[x], lengths[x])))
names(listofnwords) <- paste0(names(listofnwords), "_", unlist(lapply(lengths, function(x) seq_len(x))))
# remove odd characters
listofnwordsclean <- lapply(listofnwords, function(j) iconv(j, "latin1", "ASCII", sub=""))
#### clean some more (remove punctuation again, remove people's names, make compact dtm)
mycorpus <- Corpus(VectorSource(listofnwordsclean))
# remove punctuation
mycorpus <- llply(1:length(mycorpus), function(i) gsub("[[:punct:]]", "", mycorpus[[i]]))
# remove common human names http://www.fakenamegenerator.com/ http://www.census.gov/genealogy/www/data/2000surnames/index.html
babynames <- tolower(read.csv("https://raw.github.com/hadley/data-baby-names/master/baby-names.csv")[,2])
mycorpus <- lapply(mycorpus, function(j) j[!j %in% babynames])
# put back in corpus form again
mycorpus <- Corpus(VectorSource(mycorpus))
# create dtm from corpus
skipWords <- function(x) removeWords(x, stopwords("english"))
funcs <- list(tolower, removePunctuation, removeNumbers, stripWhitespace, skipWords)
mycorpusclean <- tm_map(mycorpus, FUN = tm_reduce, tmFuns = funcs)
mydtm <- DocumentTermMatrix(mycorpusclean, control = list(wordLengths=c(3,14)))
mydtm
mydtm.sparse <- removeSparseTerms(mydtm, 0.9)
# inspect and iterate to improve
inspect(mydtm.sparse[1:10,1:10]) # have a quick look at the document-term matrix
tot <- sum(colSums(as.matrix(mydtm.sparse))) # total number of words in the matrix
findFreqTerms(mydtm.sparse, lowfreq= 0.005 * tot) # edit stopword file here
# C:\emacs\R\win-library\3.0\tm\stopwords do not leave any spaces after the words
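# An alternative sketch, not part of the original workflow: rather than editing the
# installed stopword file, keep a project-specific stopword vector and strip those
# terms from the corpus directly; the words in `mystopwords` are only placeholders.
mystopwords <- c("fig", "press", "vol")
mycorpus.custom <- tm_map(mycorpus, removeWords, mystopwords)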
mycorpus.stopwords <- tm_map(mycorpus, FUN = tm_reduce, tmFuns = funcs)
mydtm.stopwords <- DocumentTermMatrix(mycorpus.stopwords, control = list(minWordLength = 4, stopwords = TRUE))
mydtm.stopwords.sparse <- removeSparseTerms(mydtm.stopwords, 0.8)
str(mydtm.stopwords.sparse )
findFreqTerms(mydtm.stopwords.sparse, lowfreq= 0.005 * tot)
mydtm.stopwords.sparse$dimnames$Terms[mydtm.stopwords.sparse$dimnames$Terms == "david"]
## on-the-fly visualisations
# hierarchical cluster plot
plot(hclust(dist(as.matrix(mydtm.stopwords.sparse))))
# visualise correlations between frequent terms in the document-term matrix (needs the Rgraphviz package)
plot(mydtm.stopwords.sparse, terms = findFreqTerms(mydtm.stopwords.sparse, lowfreq = 10)[1:20], corThreshold = 0.003)
#### Generate topic model
# clear the memory to make space
rm(list=setdiff(ls(), "mydtm")); gc()
# How many topics to use? Iterate over different numbers of topics to maximise the log likelihood
library(topicmodels)
# create the sequence that stores the number of topics to
# iterate over
sequ <- seq(20, 300, by = 20)
# basic loop to iterate over different topic numbers
ldas <- llply(sequ, function(k) logLik(LDA(mydtm, k)), .progress = "text")
# convert list output to dataframe
best.model.logLik <- data.frame(logLik = as.numeric(unlist(ldas)), ntopic = sequ)
# plot
with(best.model.logLik, plot(ntopic, logLik, type = 'l', xlab="Number of topics", ylab="Log likelihood"))
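# A sketch, not in the original gist, of fitting a single model once a number of topics
# has been chosen from the plot above, then inspecting it with the topicmodels accessors.
# `best.k`, `final.lda` and `topic.props` are illustrative names; nothing here is needed
# for the LDAviz section that follows.
best.k <- sequ[which.max(best.model.logLik$logLik)]
final.lda <- LDA(mydtm, k = best.k)
terms(final.lda, 10)                        # top ten terms in each topic
topic.props <- posterior(final.lda)$topics  # per-document topic proportions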
#### topics over time
# top five topics with a positive correlation over time
# top five topics with a negative correlation over time
# is archaeology becoming more scientific?
# is archaeology becoming more gender inclusive?
library(LDAviz)
# load all American Antiquity articles
load("~/teamviewer/kretzler_and_marwick.RData")
# extract word and document indices (the .RData is assumed to hold `nouns`, a document-term matrix in simple triplet form, and `dates`, the publication dates)
# https://github.com/cpsievert/xkcd-Topics/blob/master/preprocess.R
doc.id <- rep(nouns$i, nouns$v)
word.id <- rep(nouns$j, nouns$v)
n.iter <- 100
Ks <- seq(10, 200, by=20)
rez <- NULL
ctr <- 1
for (i in Ks) {
rez[[ctr]] <- fitLDA(word.id, doc.id, k=i, n.chains=3, n.iter=n.iter, alpha=25/i, beta=0.1)
ctr <- ctr + 1
}
# https://github.com/cpsievert/xkcd-Topics/blob/master/postprocess.R
# monitor convergence
for (i in seq_along(rez)){
plotLoglik(rez[[i]][[2]])
}
library(coda) # for mcmc(), mcmc.list() and gelman.diag() below
Rhats <- data.frame(matrix(numeric(2*length(Ks)), nrow=length(Ks), ncol=2))
for (i in seq_along(rez)){
l <- rez[[i]][[1]]
l2 <- split(c(l), rep(1:dim(l)[2], each=dim(l)[1]))
ml <- mcmc.list(lapply(l2, mcmc))
Rhats[i, ] <- gelman.diag(ml)[[1]]
}
names(Rhats) <- c("Point est.", "Upper C.I.")
Rhats$num.topics <- Ks
Rhats
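# rule of thumb: point estimates near 1 (with upper C.I.s below about 1.1) suggest
# the three chains have converged for that number of topics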
W <- length(word.id)
V <- length(unique(word.id))
sample_post <- function(fit){ #grabs samples from posterior
samples <- matrix(numeric(W*n.iter), nrow=W, ncol=n.iter)
samples[,1] <- fit[[1]][,sample(1:3, 1)] #randomly choose one chain
k <- max(samples[,1])
for (i in seq_len(n.iter)[-1]){
samples[,i] <- fitLDA(word.id, doc.id, k=k, n.chains=1, n.iter=1, topics.init=samples[,i-1],
alpha=25/k, beta=0.1)[[1]] # same alpha and beta as the fits above
}
return(samples)
}
compute_ll <- function(samples, k){ #Compute log p(w|T)
lik <- numeric(n.iter)
for (i in seq_len(n.iter)){
nj <- table(samples[,i])
nwj <- table(word.id, samples[,i])
lik[i] <- k*(lgamma(V*0.1) - V*lgamma(0.1)) + sum(colSums(lgamma(nwj + 0.1)) - lgamma(nj + V*0.1)) # beta = 0.1, matching the value used when fitting
}
return(lik)
}
harmonics <- numeric(length(rez))
stderr <- numeric(length(rez))
for (i in seq_along(rez)){
samplez <- sample_post(rez[[i]])
ll <- compute_ll(samplez, k=Ks[i])
harmonics[i] <- 1/mean(1/ll) #point estimate of p(w|T)
stderr[i] <- mean((ll - harmonics[i])^2) #sample variance
}
z <- qnorm(.975)
plot(Ks, harmonics, type="l", xlab="Number of Topics", ylab="Estimated log-likelihood")
#assess precision of the Monte Carlo approximation (see page 257 of Kaiser's notes)
lines(x=Ks, y=harmonics+z*sqrt((1/n.iter)*stderr), lty=2)
lines(x=Ks, y=harmonics-z*sqrt((1/n.iter)*stderr), lty=2, col=2)
points(Ks, harmonics)
names(harmonics) <- Ks
#save(harmonics, file="4-204topics-beta0.1-alpha25byk.rda")
library(lubridate)
library(ggplot2)
idx <- which.max(harmonics) #index pointing to optimal number of topics
opt <- Ks[idx]
topic.id <- rez[[idx]][[1]][,sample(1:3, 1)]
date.id <- dates$date[doc.id]
time.df <- data.frame(date.id, topic.id)
time.df$yr <- year(time.df$date.id)
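# A sketch, not in the original gist, of one way to get the trends promised in the
# description: the share of word tokens assigned to each topic per year, and the
# topics whose share correlates most strongly (positively or negatively) with time.
# `topic.year`, `trend` and `rising` are illustrative names.
topic.year <- as.data.frame(prop.table(table(time.df$yr, time.df$topic.id), margin = 1))
names(topic.year) <- c("yr", "topic.id", "share")
topic.year$yr <- as.numeric(as.character(topic.year$yr))
trend <- sapply(split(topic.year, topic.year$topic.id), function(d) cor(d$yr, d$share))
head(sort(trend, decreasing = TRUE), 5)  # five topics becoming more prominent
head(sort(trend), 5)                     # five topics in decline
# plot the yearly share of the five rising topics
rising <- names(head(sort(trend, decreasing = TRUE), 5))
ggplot(topic.year[topic.year$topic.id %in% rising, ],
       aes(yr, share, colour = topic.id)) + geom_line()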