An attempt to use Latent Dirichlet Allocation to do topic modeling and clustering on text
#!/usr/bin/Rscript
## NOTE: This code got pretty messy, and is no longer meant to be run
## just as an Rscript.
##
## LDA test
## Josh Moller-Mara
## Take in a file with one "document" per line.
## Try to cluster the documents using Latent Dirichlet Allocation and
## select among models using DIC.
## Brief overview of LDA (from probmods.org):
## 1. For each document, mixture weights over a set of K topics are
##    drawn from a Dirichlet prior.
## 2. Then N topics are sampled for the document, one for each word.
##    Each topic is associated with a distribution over words; this
##    distribution is itself drawn from a Dirichlet prior.
## 3. For each of the N topics drawn for the document, a word is
##    sampled from the corresponding multinomial distribution.
##    (I think it's really a categorical distribution, but that's just
##    a multinomial with n = 1.)
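## A toy simulation of the generative story above (illustrative only; the
## vocabulary, number of topics, and document length are made-up values, and
## rdirch draws a Dirichlet sample as normalized gamma variates):
rdirch <- function(alpha) { g <- rgamma(length(alpha), alpha); g / sum(g) }
toy.vocab <- c("cat", "dog", "fish", "code", "data")
toy.K <- 2
toy.worddist <- t(replicate(toy.K, rdirch(rep(0.1, length(toy.vocab))))) # per-topic word distributions
theta <- rdirch(rep(0.1, toy.K))                    # step 1: per-document topic weights
z <- sample(toy.K, 8, replace = TRUE, prob = theta) # step 2: one topic per word
sapply(z, function(k) sample(toy.vocab, 1, prob = toy.worddist[k, ])) # step 3: draw the words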
library(rjags)
library(plyr)
library(tm)

args <- commandArgs(trailingOnly = TRUE)
documents <- readLines(args[1])
## Take our documents and turn them into a Term-Document Matrix.
## Rows are terms and columns are documents; cells are counts of terms in documents.
## Check it out using inspect(tdm).
## We strip out numbers and punctuation here to make it a bit easier on ourselves.
docsToTDM <- function(documents) {
  docs.tm <- Corpus(VectorSource(documents))
  docs.tm <- tm_map(docs.tm, tolower) # newer tm versions want content_transformer(tolower)
  docs.tm <- tm_map(docs.tm, removeNumbers)
  ## docs.tm <- tm_map(docs.tm, removePunctuation)
  docs.tm <- tm_map(docs.tm, function(x) gsub("[[:punct:]]", " ", x))
  docs.tm <- tm_map(docs.tm, stripWhitespace)
  docs.tm <- tm_map(docs.tm, removeWords, stopwords('english'))
  ## We could also filter out non-frequent terms to keep the graph size small:
  ## tdm <- TermDocumentMatrix(docs.tm)
  ## findFreqTerms(tdm, 5)
  ## tdm.sparse <- removeSparseTerms(tdm, 0.99)
  ## tdm.sparse
  TermDocumentMatrix(docs.tm)
}
mtdm <- as.matrix(docsToTDM(documents))
## List of all the terms
words <- rownames(mtdm)
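## Quick sanity check on a toy corpus (hypothetical documents, not part of the
## real pipeline):
## inspect(docsToTDM(c("the cat sat on the mat", "dogs chase cats")))
## Rows are terms like "cat" and "mat"; the two columns are the two documents.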
## Take in the term-document matrix, words, and number of clusters (K).
## Return a JAGS model, which we'll burn in and sample later.
genLDA <- function(mtdm, words, K, alpha.Words = 0.1, alpha.Topics = 0.1) {
  ## Here we translate all documents into a numbered matrix so JAGS can understand it.
  ## Each row is a document; its columns are filled with numbers, where each
  ## number is the index of a word occurring in that document.
  ## The number of columns is the maximum length of all documents;
  ## unused columns are filled with NA.
  word <- do.call(rbind.fill.matrix,
                  lapply(1:ncol(mtdm), function(i) t(rep(1:length(mtdm[,i]), mtdm[,i]))))
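  ## For example (illustrative): if document 1 contains term 2 twice and term 5
  ## once, its row becomes c(2, 2, 5, NA, ...), padded by rbind.fill.matrix to
  ## the length of the longest document.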
  N <- ncol(mtdm)                        # Number of documents
  Nwords <- length(words)                # Number of terms
  alphaTopics <- rep(alpha.Topics, K)    # Hyperprior on topics
  alphaWords <- rep(alpha.Words, Nwords) # Hyperprior on words
  ## These hyperpriors are set low so that documents can have sparse topic
  ## weights such as (1, 0, 0). If we had 3 topics and used an alpha of
  ## (100, 100, 100), we'd only expect relatively even mixture weights on
  ## the topics. That isn't what we generally want: we'd like a document
  ## to be able to belong mostly to one topic.
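  ## To see the effect, reuse the rdirch sketch from the top of the file:
  ## rdirch(rep(0.1, 3)) # typically sparse, e.g. roughly (0.98, 0.01, 0.01)
  ## rdirch(rep(100, 3)) # typically near-uniform, e.g. roughly (0.33, 0.34, 0.33)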
  ## For each word in a document, we sample a topic
  wordtopic <- matrix(NA, nrow(word), ncol(word))
  ## Length of documents, needed for indexing in JAGS
  doclengths <- rowSums(!is.na(word))
  ## How much we believe each document belongs to each of K topics
  topicdist <- matrix(NA, N, K)
  ## How much we believe each word belongs to each of K topics
  topicwords <- matrix(NA, K, Nwords)
  ## All the parameters to be passed to JAGS
  dataList <- list(alphaTopics = alphaTopics,
                   alphaWords = alphaWords,
                   topicdist = topicdist,
                   wordtopic = wordtopic,
                   word = word,
                   Ndocs = N,
                   Ktopics = K,
                   length = doclengths,
                   Nwords = Nwords,
                   worddist = topicwords)
  jags.model('lda-test2.jag',
             data = dataList,
             n.chains = 5,
             n.adapt = 100)
}
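## The model file 'lda-test2.jag' is not included in this gist. A sketch of a
## JAGS model consistent with the variable names passed above might look like
## the following (an assumption on my part, not necessarily the actual file):
##   model {
##     for (k in 1:Ktopics) {
##       worddist[k, 1:Nwords] ~ ddirch(alphaWords)
##     }
##     for (d in 1:Ndocs) {
##       topicdist[d, 1:Ktopics] ~ ddirch(alphaTopics)
##       for (w in 1:length[d]) {
##         wordtopic[d, w] ~ dcat(topicdist[d, 1:Ktopics])
##         word[d, w] ~ dcat(worddist[wordtopic[d, w], 1:Nwords])
##       }
##     }
##   }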
## Take a look at how topics are distinguished:
## for each word, show its association with topics.
wordsToClusters <- function(jags, words, n.iter = 100) {
  sampleTW <- jags.samples(jags,
                           c('worddist'),
                           n.iter)$worddist
  colnames(sampleTW) <- words
  sTW <- summary(sampleTW, FUN = mean)$stat
  sTW <- sTW[, order(colSums(sTW))]   # order terms by total topic weight
  t(sweep(sTW, 2, colSums(sTW), '/')) # normalize each word's weights to sum to 1
}
## Let's assign topics to the documents.
## We sample from "topicdist" and pick the topic with the highest weight.
labelDocuments <- function(jags, n.iter = 1000) {
  topicdist.samp <- jags.samples(jags,
                                 c('topicdist'),
                                 n.iter)
  marginal.weights <- summary(topicdist.samp$topicdist, FUN = mean)$stat
  best.topic <- apply(marginal.weights, 1, which.max)
  best.topic
}
##############################
## Burn our chain in.
## Hopefully.
## Ideally we'd do some plots showing mixing (see the sketch below).
## Word of warning: the number of updates and chains I use is really
## arbitrary, and not carefully selected.
jags <- genLDA(mtdm, words, 3)
update(jags, 5000)
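## A quick mixing check (a sketch, using rjags's coda interface):
## samp <- coda.samples(jags, "topicdist", n.iter = 500)
## plot(samp) # traceplots should look stationary, with the chains overlapping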
split(documents, labelDocuments(jags))
## Try visualizing topics as word clouds
library(wordcloud)
genWordCloud <- function(sampleTW, words, columnNumber, ...) {
  freq <- t(summary(sampleTW, FUN = mean)$stat)[, columnNumber]
  wordcloud(words[order(freq)], freq[order(freq)], ...)
}
sampleTW <- jags.samples(jags,
                         c('worddist'),
                         300)$worddist
colnames(sampleTW) <- words
par(mfrow = c(1, 3))
for (i in 1:3)
  genWordCloud(sampleTW, words, i, min.freq = 0.01)
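## Note: min.freq = 0.01 makes sense here because the "frequencies" are
## posterior mean topic-word probabilities (each topic's weights sum to 1),
## not raw counts.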
##############################
## Failed attempt to find the number of clusters
##############################
library(snowfall)
sfInit(parallel = TRUE, cpus = 3)
sfLibrary(rjags)
sfLibrary(plyr)
sfExportAll()
dicsamps <- sfLapply(2:10, function(K) {
  jags <- genLDA(mtdm, words, K)
  update(jags, 3000)
  dic.samples(jags, 30)
})
sfStop()
invisible(mapply(function(x, i) {
  print(i)
  print(x)
}, dicsamps, seq_along(dicsamps)))
which.min(sapply(dicsamps, function(ds) with(ds, sum(deviance) + sum(penalty))))
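## DIC here is the posterior mean deviance plus a penalty for effective model
## complexity; lower is better. Since sfLapply ran over K = 2:10, the winning
## index maps to K = index + 1 topics.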