Speed test of different methods of parallel processing for generating topic models with different numbers of topics. Written for a single Windows 7 laptop with a four-core processor (i.e. not a networked cluster), using data from the topicmodels package.
# Speed tests of different parallel and non-parallel methods
# for iterating over different numbers of topics with
# topicmodels
# load packages first so that sfStop() below is available
library(topicmodels)
library(plyr)
library(snowfall)
library(foreach)
library(doSNOW)
# clear workspace and stop any previous cluster instances
rm(list = ls(all.names = TRUE))
gc()
sfStop()
# get data
data("AssociatedPress", package = "topicmodels")
# set number of topics to start with
k <- 20
# set model options
control_LDA_VEM <-
  list(estimate.alpha = TRUE, alpha = 50/k, estimate.beta = TRUE,
       verbose = 0, prefix = tempfile(), save = 0, keep = 0,
       seed = as.integer(100), nstart = 1, best = TRUE,
       var = list(iter.max = 10, tol = 10^-6),
       em = list(iter.max = 10, tol = 10^-4),
       initialize = "random")
# set sequence of topic numbers to iterate over
# (named seqk rather than seq so it does not mask base::seq)
seqk <- seq(2, 500, by = 100)
# set parallel processing options
# initiate cores (four, to match the four-core laptop;
# parallel::detectCores() reports how many are available)
sfInit(parallel = TRUE, cpus = 4, type = "SOCK") # for snowfall
cl <- makeCluster(4, type = "SOCK") # for snow
registerDoSNOW(cl) # for snow
# send data and packages to multicores
sfExport("AssociatedPress", "control_LDA_VEM") # for snowfall
sfLibrary(topicmodels) # for snowfall
# again for snow
clusterEvalQ(cl, library(topicmodels)) # for snow
clusterExport(cl, c("AssociatedPress", "control_LDA_VEM")) # for snow
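# (optional sanity check, not part of the original benchmark: confirm the
# exports reached the snow workers before timing anything)
clusterEvalQ(cl, exists("AssociatedPress")) # each worker should return TRUE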
# non-parallel methods
# base
BASE <- system.time(best.model.BASE <<- lapply(seqk, function(d) LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)))
# plyr non-parallel
PLYR_S <- system.time(best.model.PLYR_S <<- llply(seqk, function(d) LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM), .progress = "text"))
# parallel methods
# wrapper for the methods that don't handle anonymous functions well
# (LDA is exported, so :: suffices; ::: is not needed)
wrapper <- function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)
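# (optional, not in the original: call the wrapper once locally on the
# smallest number of topics to confirm it runs before timing the cluster)
wrapper(2)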
# using parLapply
PARLAP <- system.time(best.model.PARLAP <<- parLapply(cl, seqk, wrapper))
# using %dopar%
DOPAR <- system.time(best.model.DOPAR <<- foreach(i = seqk, .export = c("AssociatedPress", "control_LDA_VEM"), .packages = "topicmodels", .verbose = TRUE) %dopar% LDA(AssociatedPress[1:20, ], k = i, control = control_LDA_VEM))
# using sfLapply
SFLAPP <- system.time(best.model.SFLAPP <<- sfLapply(seqk, function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)))
# using sfClusterApplyLB (load-balancing)
SFCLU <- system.time(best.model.SFCLU <<- sfClusterApplyLB(seqk, function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM)))
# using plyr in parallel (needs the doSNOW backend registered above)
PLYRP <- system.time(best.model.PLYRP <<- llply(seqk, function(d) topicmodels::LDA(AssociatedPress[1:20, ], k = d, control = control_LDA_VEM), .parallel = TRUE))
# inspect results
rbind(BASE, PLYR_S, PARLAP, DOPAR, SFLAPP, SFCLU, PLYRP)
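# a minimal sketch, not in the original gist: compare the fitted models
# themselves rather than just the timings. topicmodels provides a logLik()
# method for LDA fits, so the log-likelihood at each number of topics can
# be tabulated (every timing run fits the same models, so any of the
# result lists will do)
data.frame(topics = seqk, logLik = sapply(best.model.BASE, logLik))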
### using the lda package
library(lda)
AP <- dtm2ldaformat(AssociatedPress[1:30, ]) # convert the dtm to the lda package's format
wc <- word.counts(AP$documents) # get word counts
# drop words occurring two times or fewer
# (note: AP.filtered is not used in the call below; substitute it for
# AP$documents to fit on the filtered corpus)
AP.filtered <- filter.words(AP$documents, as.numeric(names(wc)[wc <= 2]))
# fit an LDA model with a collapsed Gibbs sampler; consider 0.01 for alpha, see
# http://www.bytemining.com/2011/08/sigkdd-2011-conference-day-1-graph-mining-and-david-bleitopic-models/
AP.lex.lda <- lda.collapsed.gibbs.sampler(AP$documents,
                                          k,
                                          AP$vocab,
                                          30,  # number of Gibbs iterations
                                          0.1, # alpha
                                          0.1, # eta
                                          compute.log.likelihood = TRUE)
# speed test on many topics - it's very fast!
sequ <- seq(2, 202, 10)
system.time(lda.ll <- llply(sequ, function(d) lda.collapsed.gibbs.sampler(AP$documents, d, AP$vocab, 30, 0.1, 0.1, compute.log.likelihood = TRUE), .progress = "text"))
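# a minimal closing sketch, not in the original gist: pull the full
# log-likelihood (first row of $log.likelihoods, last iteration) from each
# Gibbs run to see how it varies with the number of topics
ll <- sapply(lda.ll, function(m) m$log.likelihoods[1, ncol(m$log.likelihoods)])
plot(sequ, ll, type = "b", xlab = "number of topics", ylab = "log-likelihood")
# release the workers started above
stopCluster(cl) # for snow
sfStop()        # for snowfall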