benmarwick · December 11, 2015 06:28
diff --git a/R2MALLET-loop-linux.r b/R2MALLET-loop-linux.r
 # R interface with MALLET to loop over different numbers of topics
 # on a linux machine

 # first, download MALLET
 # second, install java

 # configure variables and filenames for MALLET
 ## here using MALLET's built-in example data 

 # Fit one MALLET topic model per candidate number of topics, capture what
 # MALLET prints for each run, and pull one numeric value per run out of that
 # output into the data frame 'LLdf' (columns 'seq' and 'V1').

 # set list of topic numbers to iterate over
 # (this variable shares its name with base::seq(); the name is kept because
 # the 'seq' column of LLdf, built from it below, is what the plot maps to x)
 seq <- seq(2, 100, 1) # adjust to suit

 # settings shared by every model; defined once rather than on each iteration
 # location of the MALLET installation (the folder containing bin/), adjust to suit
 malletdir <- "/home/two/Dropbox/R/2013.1.1.E84E4jrp/mallet-2.0.7"
 # folder containing txt files for MALLET to work on
 importdir <- "/home/two/Dropbox/R/2013.1.1.E84E4jrp/mallet-2.0.7/sample-data/web/en" # adjust to suit
 # set optimisation interval for MALLET to use
 optint <- 10
 # set number of iterations per model
 iter <- 1000
 # shell command that moves into the MALLET folder; the path is quoted so
 # that folder names containing spaces still work
 cd <- paste("cd", shQuote(malletdir))

 # create list to store output of loop: one element per entry of 'seq', in
 # the same order, named by the number of topics
 loop <- vector("list", length(seq))
 names(loop) <- seq

 # define loop
 for (k in seq_along(seq)) {

   # set number of topics for MALLET to use
   i <- seq[k]
   ntopics <- i

   # name of the file that import-dir writes and the trainer then reads
   output <- paste("output.mallet", i, sep = ".")

   # set file names for output of model, extensions must be as shown
   outputstate <-     paste("topic-state", i, "gz", sep = ".")
   outputtopickeys <- paste("output_keys", i, "txt", sep = ".")
   outputdoctopics <- paste("output_composition", i, "txt", sep = ".")
   diagnostics <-     paste("diagnostics", i, "xml", sep = ".")

   # combine variables into strings ready for the Linux shell
   import <- paste("bin/mallet import-dir",
                   "--input", shQuote(importdir),
                   "--output", output,
                   "--keep-sequence --remove-stopwords",
                   sep = " ")
   # 'iter' is passed explicitly so that changing it above takes effect;
   # format() stops large values being written in scientific notation
   train <- paste("bin/mallet run cc.mallet.topics.tui.TopicTrainer",
                  "--input", output,
                  "--num-topics", ntopics,
                  "--num-iterations", format(iter, scientific = FALSE),
                  "--optimize-interval", optint,
                  "--output-state", outputstate,
                  "--output-topic-keys", outputtopickeys,
                  "--output-doc-topics", outputdoctopics,
                  "--diagnostics-file", diagnostics,
                  "--optimize-burn-in 200",
                  sep = " ")

   # send commands to the Linux shell and run MALLET from there
   # collect console output in a list called 'loop'
   # the steps are chained with '&&' so a failed 'cd' or import stops the
   # chain instead of running the next step anyway
   # NOTE(review): system(intern = TRUE) returns only what the command writes
   # to stdout; if your MALLET build logs its progress to stderr, append
   # ' 2>&1' to the command - confirm on your machine
   loop[[k]] <- system(paste(cd, import, train, sep = " && "),
                       intern = TRUE)
 }

 # extract one log-likelihood figure per model so the models can be compared
 library(plyr)
 # takes the third line from the end of each run's console output
 # (presumably the LL line of the last iteration - confirm against the
 # output of your MALLET version)
 LL1 <- ldply(seq_along(loop), function(k) {
   console <- loop[[k]]
   if (length(console) < 3) {
     stop("MALLET returned fewer than three lines of output for ", seq[k],
          " topics; check that the commands ran successfully", call. = FALSE)
   }
   console[[length(console) - 2]]
 })
 # LL1 <- ldply(seq_along(loop), function(k) loop[[k]][[10]]) # alternatively, gets the LL from the first iteration
 # split every line on spaces once and keep the third piece as a number
 LL2 <- ldply(strsplit(LL1$V1, " "), function(tokens) as.numeric(tokens[3]))
 LLdf <- cbind(seq, LL2)
 # line chart of the extracted value (V1) against the number of topics (seq)
 library(ggplot2)
 library(grid) # provides unit(), used for the plot margin
 ggplot(data = LLdf, mapping = aes(x = seq, y = V1)) +
   geom_line() +
   labs(x = "Number of topics", y = "Log likelihood of the model") +
   theme_bw() +
   theme(
     axis.title.x = element_text(vjust = -0.5, size = 14),
     axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25),
     # margins are given in units of text lines
     plot.margin = unit(c(1, 1, 2, 2), "lines")
   )

 # print the table ordered from the largest V1 value down to the smallest,
 # for when the best number of topics is hard to read off the plot
 LLdf[order(-LLdf[["V1"]]), ]
	# R interface with MALLET to loop over different numbers of topics
	# on a linux machine

	# first, download MALLET
	# second, install java

	# configure variables and filenames for MALLET
	## here using MALLET's built-in example data

	# Fit one MALLET topic model per candidate number of topics, capture what
	# MALLET prints for each run, and pull one numeric value per run out of that
	# output into the data frame 'LLdf' (columns 'seq' and 'V1').

	# set list of topic numbers to iterate over
	# (this variable shares its name with base::seq(); the name is kept because
	# the 'seq' column of LLdf, built from it below, is what the plot maps to x)
	seq <- seq(2, 100, 1) # adjust to suit

	# settings shared by every model; defined once rather than on each iteration
	# location of the MALLET installation (the folder containing bin/), adjust to suit
	malletdir <- "/home/two/Dropbox/R/2013.1.1.E84E4jrp/mallet-2.0.7"
	# folder containing txt files for MALLET to work on
	importdir <- "/home/two/Dropbox/R/2013.1.1.E84E4jrp/mallet-2.0.7/sample-data/web/en" # adjust to suit
	# set optimisation interval for MALLET to use
	optint <- 10
	# set number of iterations per model
	iter <- 1000
	# shell command that moves into the MALLET folder; the path is quoted so
	# that folder names containing spaces still work
	cd <- paste("cd", shQuote(malletdir))

	# create list to store output of loop: one element per entry of 'seq', in
	# the same order, named by the number of topics
	loop <- vector("list", length(seq))
	names(loop) <- seq

	# define loop
	for (k in seq_along(seq)) {

		# set number of topics for MALLET to use
		i <- seq[k]
		ntopics <- i

		# name of the file that import-dir writes and the trainer then reads
		output <- paste("output.mallet", i, sep = ".")

		# set file names for output of model, extensions must be as shown
		outputstate <- paste("topic-state", i, "gz", sep = ".")
		outputtopickeys <- paste("output_keys", i, "txt", sep = ".")
		outputdoctopics <- paste("output_composition", i, "txt", sep = ".")
		diagnostics <- paste("diagnostics", i, "xml", sep = ".")

		# combine variables into strings ready for the Linux shell
		import <- paste("bin/mallet import-dir",
			"--input", shQuote(importdir),
			"--output", output,
			"--keep-sequence --remove-stopwords",
			sep = " ")
		# 'iter' is passed explicitly so that changing it above takes effect;
		# format() stops large values being written in scientific notation
		train <- paste("bin/mallet run cc.mallet.topics.tui.TopicTrainer",
			"--input", output,
			"--num-topics", ntopics,
			"--num-iterations", format(iter, scientific = FALSE),
			"--optimize-interval", optint,
			"--output-state", outputstate,
			"--output-topic-keys", outputtopickeys,
			"--output-doc-topics", outputdoctopics,
			"--diagnostics-file", diagnostics,
			"--optimize-burn-in 200",
			sep = " ")

		# send commands to the Linux shell and run MALLET from there
		# collect console output in a list called 'loop'
		# the steps are chained with '&&' so a failed 'cd' or import stops the
		# chain instead of running the next step anyway
		# NOTE(review): system(intern = TRUE) returns only what the command writes
		# to stdout; if your MALLET build logs its progress to stderr, append
		# ' 2>&1' to the command - confirm on your machine
		loop[[k]] <- system(paste(cd, import, train, sep = " && "),
			intern = TRUE)
	}

	# extract one log-likelihood figure per model so the models can be compared
	library(plyr)
	# takes the third line from the end of each run's console output
	# (presumably the LL line of the last iteration - confirm against the
	# output of your MALLET version)
	LL1 <- ldply(seq_along(loop), function(k) {
		console <- loop[[k]]
		if (length(console) < 3) {
			stop("MALLET returned fewer than three lines of output for ", seq[k],
				" topics; check that the commands ran successfully", call. = FALSE)
		}
		console[[length(console) - 2]]
	})
	# LL1 <- ldply(seq_along(loop), function(k) loop[[k]][[10]]) # alternatively, gets the LL from the first iteration
	# split every line on spaces once and keep the third piece as a number
	LL2 <- ldply(strsplit(LL1$V1, " "), function(tokens) as.numeric(tokens[3]))
	LLdf <- cbind(seq, LL2)
	# line chart of the extracted value (V1) against the number of topics (seq)
	library(ggplot2)
	library(grid) # provides unit(), used for the plot margin
	ggplot(data = LLdf, mapping = aes(x = seq, y = V1)) +
		geom_line() +
		labs(x = "Number of topics", y = "Log likelihood of the model") +
		theme_bw() +
		theme(
			axis.title.x = element_text(vjust = -0.5, size = 14),
			axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25),
			# margins are given in units of text lines
			plot.margin = unit(c(1, 1, 2, 2), "lines")
		)

	# print the table ordered from the largest V1 value down to the smallest,
	# for when the best number of topics is hard to read off the plot
	LLdf[order(-LLdf[["V1"]]), ]