Speed test of different methods of parallel processing for generating topic models with different numbers of topics. Written for a single Windows 7 laptop with a four-core processor (i.e. not a networked cluster), using data from the topicmodels package.
# Speed tests of different parallel and non-parallel methods
# for iterating over different numbers of topics with
# topicmodels
# load packages first so that sfStop() below is available
library(topicmodels)
library(plyr)
library(snowfall)
library(foreach)
library(doSNOW)
# clear workspace and stop any previous cluster instances
rm(list = ls(all.names = TRUE))
gc()
sfStop()
# get data
data("AssociatedPress", package = "topicmodels")
# set number of topics to start with
k <- 20
# set model options
control_LDA_VEM <-
  list(estimate.alpha = TRUE, alpha = 50/k, estimate.beta = TRUE,
       verbose = 0, prefix = tempfile(), save = 0, keep = 0,
       seed = as.integer(100), nstart = 1, best = TRUE,
       var = list(iter.max = 10, tol = 10^-6),
       em = list(iter.max = 10, tol = 10^-4),
       initialize = "random")
# set sequence of topic numbers to iterate over
# (named seqk rather than seq so it does not mask base::seq)
seqk <- seq(2, 500, by = 100)
# set parallel processing options
# initiate cores (four, to match the four-core laptop;
# parallel::detectCores() reports how many are available)
sfInit(parallel = TRUE, cpus = 4, type = "SOCK") # for snowfall
cl <- makeCluster(4, type = "SOCK") # for snow
registerDoSNOW(cl) # for snow
# send data and packages to multicores
sfExport("AssociatedPress", "control_LDA_VEM") # for snowfall
sfLibrary(topicmodels) # for snowfall
# again for snow
clusterEvalQ(cl, library(topicmodels)) # for snow
clusterExport(cl, c("AssociatedPress", "control_LDA_VEM")) # for snow
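# (optional sanity check, not part of the original benchmark: confirm the
# exports reached the snow workers before timing anything)
clusterEvalQ(cl, exists("AssociatedPress")) # each worker should return TRUE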
# non-parallel methods
# base
BASE <- system.time(best.model.BASE <<- lapply(seqk, function(d) LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)))
# plyr non-parallel
PLYR_S <- system.time(best.model.PLYR_S <<- llply(seqk, function(d) LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM), .progress = "text"))
# parallel methods
# wrapper for the methods that don't handle anonymous functions well
# (LDA is exported, so :: suffices; ::: is not needed)
wrapper <- function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)
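# (optional, not in the original: call the wrapper once locally on the
# smallest number of topics to confirm it runs before timing the cluster)
wrapper(2)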
# using parLapply
PARLAP <- system.time(best.model.PARLAP <<- parLapply(cl, seqk, wrapper))
# using %dopar%
DOPAR <- system.time(best.model.DOPAR <<- foreach(i = seqk, .export = c("AssociatedPress", "control_LDA_VEM"), .packages = "topicmodels", .verbose = TRUE) %dopar% LDA(AssociatedPress[1:20, ], k = i, control = control_LDA_VEM))
# using sfLapply
SFLAPP <- system.time(best.model.SFLAPP <<- sfLapply(seqk, function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)))
# using sfClusterApplyLB (load-balancing)
SFCLU <- system.time(best.model.SFCLU <<- sfClusterApplyLB(seqk, function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)))
# using plyr in parallel (needs the doSNOW backend registered above)
PLYRP <- system.time(best.model.PLYRP <<- llply(seqk, function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM), .parallel = TRUE))
# inspect results
rbind(BASE, PLYR_S, PARLAP, DOPAR, SFLAPP, SFCLU, PLYRP)
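# a minimal sketch, not in the original gist: compare the fitted models
# themselves rather than just the timings. topicmodels provides a logLik()
# method for LDA fits, so the log-likelihood at each number of topics can
# be tabulated (every timing run fits the same models, so any of the
# result lists will do)
data.frame(topics = seqk, logLik = sapply(best.model.BASE, logLik))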
### using the lda package
library(lda)
AP <- dtm2ldaformat(AssociatedPress[1:30, ]) # convert the dtm to the lda package's format
wc <- word.counts(AP$documents) # get word counts
# drop words occurring two times or fewer
# (note: AP.filtered is not used in the call below; substitute it for
# AP$documents to fit on the filtered corpus)
AP.filtered <- filter.words(AP$documents, as.numeric(names(wc)[wc <= 2]))
# fit an LDA model with a collapsed Gibbs sampler; consider 0.01 for alpha, see
# http://www.bytemining.com/2011/08/sigkdd-2011-conference-day-1-graph-mining-and-david-bleitopic-models/
AP.lex.lda <- lda.collapsed.gibbs.sampler(AP$documents,
                                          k,
                                          AP$vocab,
                                          30,  # number of Gibbs iterations
                                          0.1, # alpha
                                          0.1, # eta
                                          compute.log.likelihood = TRUE)
# speed test on many topics - it's very fast!
sequ <- seq(2, 202, 10)
system.time(lda.ll <- llply(sequ, function(d) lda.collapsed.gibbs.sampler(AP$documents, d, AP$vocab, 30, 0.1, 0.1, compute.log.likelihood = TRUE), .progress = "text"))
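# a minimal closing sketch, not in the original gist: pull the full
# log-likelihood (first row of $log.likelihoods, last iteration) from each
# Gibbs run to see how it varies with the number of topics
ll <- sapply(lda.ll, function(m) m$log.likelihoods[1, ncol(m$log.likelihoods)])
plot(sequ, ll, type = "b", xlab = "number of topics", ylab = "log-likelihood")
# release the workers started above
stopCluster(cl) # for snow
sfStop()        # for snowfall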