Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Created January 16, 2013 07:58
Show Gist options
  • Save benmarwick/4545395 to your computer and use it in GitHub Desktop.
Uses R to control MALLET and generate models with different numbers of topics. A Log Likelihood value is extracted from each model and the vector of Log Likelihood values from all models generated is plotted and inspected to see which model has the highest, and thus what number of topics best suits the corpus. For a Windows machine.
# Set up the system environment so R can locate MALLET and Java ----

# Location of the MALLET installation (the directory containing bin/).
MALLET_HOME <- "c:/mallet-2.0.7"
Sys.setenv("MALLET_HOME" = MALLET_HOME)

# Prepend the Java runtime to PATH rather than replacing PATH wholesale:
# overwriting PATH would hide every other executable from later shell()
# calls. .Platform$path.sep is ";" on Windows.
Sys.setenv(PATH = paste("c:/Program Files (x86)/Java/jre7/bin",
                        Sys.getenv("PATH"),
                        sep = .Platform$path.sep))
# Configure and run MALLET once per candidate topic count ----
## here using MALLET's built-in example data

# Candidate numbers of topics to evaluate (2, 3, ..., 100).
seq <- seq(2, 100, 1)

# Preallocate the list that collects each model's console output.
# NOTE: it is indexed below by topic count i (2..100), not by loop
# position, so it must be sized to max(seq); loop[[1]] stays NULL.
loop <- vector("list", max(seq))

# Loop-invariant settings, hoisted out of the loop:
# folder containing txt files for MALLET to work on
importdir <- "C:\\mallet-2.0.7\\sample-data\\web\\en"
# optimisation interval for MALLET's hyperparameter optimisation
optint <- 10
# change into the MALLET install directory (where bin\ lives)
cd <- "cd C:\\mallet-2.0.7"
# NOTE(review): the number of sampling iterations is never passed on the
# command line, so MALLET's default (1000) is used for every model.

for (i in seq) {
  # number of topics for this model
  ntopics <- i
  # per-model file names; extensions must be as shown
  output <- paste("tutorial.mallet", i, sep = ".")
  outputstate <- paste("topic-state", i, "gz", sep = ".")
  outputtopickeys <- paste("tutorial_keys", i, "txt", sep = ".")
  outputdoctopics <- paste("tutorial_composition", i, "txt", sep = ".")
  diagnostics <- paste("diagnostics", i, "xml", sep = ".")
  # assemble the import and train commands for the Windows command line
  import <- paste("bin\\mallet import-dir --input", importdir,
                  "--output", output,
                  "--keep-sequence --remove-stopwords", sep = " ")
  train <- paste("bin\\mallet run cc.mallet.topics.tui.TopicTrainer",
                 "--input", output,
                 "--num-topics", ntopics,
                 "--optimize-interval", optint,
                 "--output-state", outputstate,
                 "--output-topic-keys", outputtopickeys,
                 "--output-doc-topics", outputdoctopics,
                 "--diagnostics-file", diagnostics,
                 "--optimize-burn-in 200", sep = " ")
  # Run both commands in a single shell so the cd persists; capture the
  # console output (which includes the log-likelihood trace) for parsing.
  loop[[i]] <- shell(shQuote(paste(cd, import, train, sep = " && ")),
                     invisible = FALSE, intern = TRUE)
}
# Extract the log likelihood reported for each model ----
library(plyr)
# The 10th line of each model's captured console output holds the final
# log-likelihood report. (Fragile: depends on MALLET's console format --
# TODO confirm against the captured output.)
LL1 <- ldply(seq, function(x) loop[x][[1]][[10]])
# Split every line on spaces ONCE (the original recomputed the full
# strsplit on every iteration), then take the third token of each line,
# which is the numeric log-likelihood value.
tokens <- strsplit(LL1$V1, " ")
LL2 <- ldply(seq_len(nrow(LL1)), function(x) as.numeric(tokens[[x]][3]))
# One row per model: topic count alongside its log likelihood.
LLdf <- cbind(seq, LL2)
# Plot log likelihood as a function of the number of topics ----
library(ggplot2)
library(grid)  # for unit(), used in plot.margin
ggplot(LLdf, aes(x = seq, y = V1)) +
  geom_line() +
  theme_bw() +
  xlab("Number of topics") +
  ylab("Log likelihood of the model") +
  # One theme() call covering axis-title sizing and the plot margins
  theme(
    axis.title.x = element_text(vjust = -0.5, size = 14),
    axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25),
    plot.margin  = unit(c(1, 1, 2, 2), "lines")
  )
# Inspect the table sorted by log likelihood, best model first, in case
# it's not easy to read it from the plot.
# NOTE: order(-LLdf$V1) sorts DESCENDING, so the model with the HIGHEST
# log likelihood (the best fit, per the description above) is on top --
# the original comment said "lowest", which contradicts the sort order.
LLdf[order(-LLdf$V1), ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment