Last active
December 11, 2015 06:28
-
-
Save benmarwick/4559589 to your computer and use it in GitHub Desktop.
Uses R to control MALLET and generate models with different numbers of topics. A Log Likelihood value is extracted from each model and the vector of Log Likelihood values from all models generated is plotted and inspected to see which model has the highest, and thus what number of topics best suits the corpus. For a Linux machine.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# R interface with MALLET to loop over different numbers of topics
# on a Linux machine
# first, download MALLET
# second, install Java
# configure variables and filenames for MALLET
## here using MALLET's built-in example data
# set list of topic numbers to iterate over
# NOTE: this variable shadows base::seq(); the name is kept because the
# log-likelihood extraction further down the script refers to it
seq <- seq(2, 100, 1) # adjust to suit

# create list to store output of loop; each run is stored at
# index = number of topics, so size the list by the largest topic count
loop <- vector("list", max(seq))

# settings shared by every model (loop-invariant, so defined once)
# location of the MALLET install, i.e. the parent of the bin directory
malletdir <- "/home/two/Dropbox/R/2013.1.1.E84E4jrp/mallet-2.0.7" # adjust to suit
# folder containing txt files for MALLET to work on
importdir <- file.path(malletdir, "sample-data", "web", "en") # adjust to suit
# set optimisation interval for MALLET to use
optint <- 10
# set number of iterations per model
iter <- 1000
# shell command that moves into the MALLET install before calling bin/mallet;
# shQuote() keeps paths containing spaces or shell metacharacters intact
cd <- paste("cd", shQuote(malletdir))

# define loop: import the corpus and train one model per topic count
for (i in seq) {
  # set number of topics for MALLET to use
  ntopics <- i
  # name of the instance file that import-dir writes and the trainer reads
  output <- paste("output.mallet", i, sep = ".")
  # set file names for output of model, extensions must be as shown
  outputstate <- paste("topic-state", i, "gz", sep = ".")
  outputtopickeys <- paste("output_keys", i, "txt", sep = ".")
  outputdoctopics <- paste("output_composition", i, "txt", sep = ".")
  diagnostics <- paste("diagnostics", i, "xml", sep = ".")

  # combine variables into strings ready for the Linux shell
  import <- paste(
    "bin/mallet import-dir",
    "--input", shQuote(importdir),
    "--output", output,
    "--keep-sequence --remove-stopwords",
    sep = " "
  )
  train <- paste(
    "bin/mallet run cc.mallet.topics.tui.TopicTrainer",
    "--input", output,
    "--num-topics", ntopics,
    # pass the iteration count explicitly; format() avoids "1e+05"-style text
    "--num-iterations", format(iter, scientific = FALSE),
    "--optimize-interval", optint,
    "--output-state", outputstate,
    "--output-topic-keys", outputtopickeys,
    "--output-doc-topics", outputdoctopics,
    "--diagnostics-file", diagnostics,
    "--optimize-burn-in 200",
    sep = " "
  )

  # send commands to the Linux shell and run MALLET from there;
  # chaining with && stops the sequence as soon as one step fails, so MALLET
  # is never started from the wrong directory or trained on a missing import
  # collect console output in a list called 'loop'
  # NOTE(review): system(intern = TRUE) captures stdout only; if MALLET writes
  # its log-likelihood lines to stderr, append "2>&1" to each command - confirm
  loop[[i]] <- system(paste(cd, import, train, sep = " && "), intern = TRUE)
}
# extract log likelihoods to see which number of topics has the highest LL
library(plyr)
# one row per topic count: the console line two before the end of each run,
# which is taken to hold the LL from the last iteration
LL1 <- ldply(seq, function(x) {
  run_output <- loop[x][[1]] # NULL rather than an error if run x is missing
  # a failed or truncated run has no such line; record NA instead of aborting
  if (length(run_output) < 3) {
    return(NA_character_)
  }
  run_output[[length(run_output) - 2]]
})
# LL1 <- ldply(seq, function(x) loop[x][[1]][[10]]) # alternatively, gets the LL from the first iteration
# split every line once (not once per row) and keep the third
# space-separated field, converted to a number
LL_fields <- strsplit(as.character(LL1$V1), " ", fixed = TRUE)
LL2 <- data.frame(
  V1 = as.numeric(vapply(LL_fields, function(f) f[3], character(1)))
)
# pair each topic count with its log likelihood (columns: seq, V1)
LLdf <- cbind(seq, LL2)
# line plot of log likelihood (V1) against number of topics (seq)
library(ggplot2)
library(grid) # provides unit() for the plot margins
ggplot(LLdf, aes(x = seq, y = V1)) +
  geom_line() +
  labs(
    x = "Number of topics",
    y = "Log likelihood of the model"
  ) +
  theme_bw() +
  # axis titles are enlarged and nudged away from the tick labels, and the
  # margins widened so the shifted titles stay inside the plot area
  theme(
    axis.title.x = element_text(vjust = -0.5, size = 14),
    axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25),
    plot.margin = unit(c(1, 1, 2, 2), "lines")
  )
# print the table sorted by log likelihood in descending order, so the
# number of topics with the highest LL is the first row
# (useful when the best model is hard to read off the plot)
ll_rank <- order(-LLdf$V1)
LLdf[ll_rank, ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment