benmarwick · January 16, 2013 07:58
diff --git a/R2MALLET-loop.r b/R2MALLET-loop.r
 # Set up the system environment so R can launch MALLET.
 MALLET_HOME <- "c:/mallet-2.0.7" # directory that contains MALLET's bin folder
 Sys.setenv(MALLET_HOME = MALLET_HOME)

 # Put the Java runtime first on PATH instead of replacing PATH outright:
 # overwriting it hides every other program from processes started later in
 # this session.
 java_bin <- "c:/Program Files (x86)/Java/jre7/bin"
 old_path <- Sys.getenv("PATH")
 path_sep <- .Platform$path.sep
 # Skip the update when Java already leads PATH, so re-running the script does
 # not keep growing the variable.
 java_is_first <- identical(old_path, java_bin) ||
   startsWith(old_path, paste0(java_bin, path_sep))
 if (!java_is_first) {
   # nzchar() drops an empty PATH so no trailing separator is produced
   Sys.setenv(PATH = paste(c(java_bin, old_path[nzchar(old_path)]),
                           collapse = path_sep))
 }

 # Configure variables and file names for MALLET.
 ## Here using MALLET's built-in example data.

 # Topic counts to iterate over. NOTE: the variable name `seq` masks
 # base::seq for value lookup; it is kept because the code that follows the
 # loop refers to it. Calls such as seq(...) still resolve to the function.
 seq <- seq(2, 100, 1)

 # List collecting the console output of every run. It is indexed by topic
 # count (loop[[i]]), so it is sized to the largest count rather than to
 # length(seq); slots for counts that are not in `seq` stay NULL.
 loop <- vector("list", max(seq))

 # Settings shared by every run, defined once outside the loop.
 # folder containing txt files for MALLET to work on
 importdir <- "C:\\mallet-2.0.7\\sample-data\\web\\en"
 # optimisation interval for MALLET to use
 optint <- 10
 # number of iterations per model
 iter <- 1000
 # move to the directory holding bin\mallet; /d also switches the drive, so
 # the relative bin\mallet path works when R was started from another drive
 cd <- "cd /d C:\\mallet-2.0.7"

 # Import the corpus and train one model per topic count.
 for (i in seq) {

   # name of file for MALLET to train model on
   output <- paste("tutorial.mallet", i, sep = ".")
   # number of topics for this run
   ntopics <- i

   # file names for output of model, extensions must be as shown
   outputstate <- paste("topic-state", i, "gz", sep = ".")
   outputtopickeys <- paste("tutorial_keys", i, "txt", sep = ".")
   outputdoctopics <- paste("tutorial_composition", i, "txt", sep = ".")
   diagnostics <- paste("diagnostics", i, "xml", sep = ".")

   # combine variables into strings ready for the Windows command line
   import <- paste(
     "bin\\mallet import-dir --input", importdir,
     "--output", output,
     "--keep-sequence --remove-stopwords"
   )
   # `iter` is passed explicitly so changing it above actually takes effect
   train <- paste(
     "bin\\mallet run cc.mallet.topics.tui.TopicTrainer --input", output,
     "--num-topics", ntopics,
     "--num-iterations", iter,
     "--optimize-interval", optint,
     "--output-state", outputstate,
     "--output-topic-keys", outputtopickeys,
     "--output-doc-topics", outputdoctopics,
     "--diagnostics-file", diagnostics,
     "--optimize-burn-in 200"
   )

   # send the chained commands to the Windows command prompt, run MALLET from
   # there, and keep the captured console output for this topic count
   loop[[i]] <- shell(shQuote(paste(cd, import, train, sep = " && ")),
                      invisible = FALSE, intern = TRUE)
 }

 # Extract the log-likelihood reported by each run so that models with
 # different numbers of topics can be compared.
 library(plyr)
 # Element 10 of each run's captured console output is used as the line that
 # carries the log-likelihood.
 # NOTE(review): the position is hard-coded; confirm it against the MALLET
 # console output if the MALLET version or options change.
 LL1 <- ldply(seq, function(x) loop[[x]][[10]])
 # Split all lines once, then keep the third space-separated token of each
 # line as a number.
 LL2 <- ldply(strsplit(LL1$V1, " "), function(tokens) as.numeric(tokens[3]))
 # one row per topic count: `seq` = number of topics, `V1` = log-likelihood
 LLdf <- data.frame(seq = seq, V1 = LL2$V1)

 # plot LL by number of topics
 library(ggplot2)
 library(grid) # provides unit() for the plot margin
 ggplot(LLdf, aes(x = seq, y = V1)) +
   xlab("Number of topics") +
   ylab("Log likelihood of the model") +
   geom_line() +
   theme_bw() +
   theme(
     axis.title.x = element_text(vjust = -0.5, size = 14),
     axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25),
     plot.margin = unit(c(1, 1, 2, 2), "lines")
   )

 # Inspect the table sorted from highest to lowest LL, in case the extremes
 # are not easy to read from the plot.
 LLdf[order(-LLdf$V1), ]
	# Set up the system environment so R can launch MALLET.
	MALLET_HOME <- "c:/mallet-2.0.7" # directory that contains MALLET's bin folder
	Sys.setenv(MALLET_HOME = MALLET_HOME)

	# Put the Java runtime first on PATH instead of replacing PATH outright:
	# overwriting it hides every other program from processes started later in
	# this session.
	java_bin <- "c:/Program Files (x86)/Java/jre7/bin"
	old_path <- Sys.getenv("PATH")
	path_sep <- .Platform$path.sep
	# Skip the update when Java already leads PATH, so re-running the script does
	# not keep growing the variable.
	java_is_first <- identical(old_path, java_bin) ||
	  startsWith(old_path, paste0(java_bin, path_sep))
	if (!java_is_first) {
	  # nzchar() drops an empty PATH so no trailing separator is produced
	  Sys.setenv(PATH = paste(c(java_bin, old_path[nzchar(old_path)]),
	                          collapse = path_sep))
	}

	# Configure variables and file names for MALLET.
	## Here using MALLET's built-in example data.

	# Topic counts to iterate over. NOTE: the variable name `seq` masks
	# base::seq for value lookup; it is kept because the code that follows the
	# loop refers to it. Calls such as seq(...) still resolve to the function.
	seq <- seq(2, 100, 1)

	# List collecting the console output of every run. It is indexed by topic
	# count (loop[[i]]), so it is sized to the largest count rather than to
	# length(seq); slots for counts that are not in `seq` stay NULL.
	loop <- vector("list", max(seq))

	# Settings shared by every run, defined once outside the loop.
	# folder containing txt files for MALLET to work on
	importdir <- "C:\\mallet-2.0.7\\sample-data\\web\\en"
	# optimisation interval for MALLET to use
	optint <- 10
	# number of iterations per model
	iter <- 1000
	# move to the directory holding bin\mallet; /d also switches the drive, so
	# the relative bin\mallet path works when R was started from another drive
	cd <- "cd /d C:\\mallet-2.0.7"

	# Import the corpus and train one model per topic count.
	for (i in seq) {

	  # name of file for MALLET to train model on
	  output <- paste("tutorial.mallet", i, sep = ".")
	  # number of topics for this run
	  ntopics <- i

	  # file names for output of model, extensions must be as shown
	  outputstate <- paste("topic-state", i, "gz", sep = ".")
	  outputtopickeys <- paste("tutorial_keys", i, "txt", sep = ".")
	  outputdoctopics <- paste("tutorial_composition", i, "txt", sep = ".")
	  diagnostics <- paste("diagnostics", i, "xml", sep = ".")

	  # combine variables into strings ready for the Windows command line
	  import <- paste(
	    "bin\\mallet import-dir --input", importdir,
	    "--output", output,
	    "--keep-sequence --remove-stopwords"
	  )
	  # `iter` is passed explicitly so changing it above actually takes effect
	  train <- paste(
	    "bin\\mallet run cc.mallet.topics.tui.TopicTrainer --input", output,
	    "--num-topics", ntopics,
	    "--num-iterations", iter,
	    "--optimize-interval", optint,
	    "--output-state", outputstate,
	    "--output-topic-keys", outputtopickeys,
	    "--output-doc-topics", outputdoctopics,
	    "--diagnostics-file", diagnostics,
	    "--optimize-burn-in 200"
	  )

	  # send the chained commands to the Windows command prompt, run MALLET from
	  # there, and keep the captured console output for this topic count
	  loop[[i]] <- shell(shQuote(paste(cd, import, train, sep = " && ")),
	                     invisible = FALSE, intern = TRUE)
	}

	# Extract the log-likelihood reported by each run so that models with
	# different numbers of topics can be compared.
	library(plyr)
	# Element 10 of each run's captured console output is used as the line that
	# carries the log-likelihood.
	# NOTE(review): the position is hard-coded; confirm it against the MALLET
	# console output if the MALLET version or options change.
	LL1 <- ldply(seq, function(x) loop[[x]][[10]])
	# Split all lines once, then keep the third space-separated token of each
	# line as a number.
	LL2 <- ldply(strsplit(LL1$V1, " "), function(tokens) as.numeric(tokens[3]))
	# one row per topic count: `seq` = number of topics, `V1` = log-likelihood
	LLdf <- data.frame(seq = seq, V1 = LL2$V1)

	# plot LL by number of topics
	library(ggplot2)
	library(grid) # provides unit() for the plot margin
	ggplot(LLdf, aes(x = seq, y = V1)) +
	  xlab("Number of topics") +
	  ylab("Log likelihood of the model") +
	  geom_line() +
	  theme_bw() +
	  theme(
	    axis.title.x = element_text(vjust = -0.5, size = 14),
	    axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25),
	    plot.margin = unit(c(1, 1, 2, 2), "lines")
	  )

	# Inspect the table sorted from highest to lowest LL, in case the extremes
	# are not easy to read from the plot.
	LLdf[order(-LLdf$V1), ]