Skip to content

Instantly share code, notes, and snippets.

@gregoryely
Created May 5, 2013 02:16
Show Gist options
  • Save gregoryely/5519420 to your computer and use it in GitHub Desktop.
Save gregoryely/5519420 to your computer and use it in GitHub Desktop.
SOME MATLAB
%LDA  Topic-count matrices for Latent Dirichlet Allocation, estimated with
%     a collapsed Gibbs sampler.
%
% [nDK, nKW, nK, report] = lda(W, D, nTopics, maxIter)
% [nDK, nKW, nK, report] = lda(W, D, nTopics, maxIter, nVocab)
% [nDK, nKW, nK, report] = lda(W, D, nTopics, maxIter, nVocab, alphaPrior, betaPrior)
%
% INPUTS:
%   W          - vector of word (vocabulary) indices, one entry per token
%   D          - vector of document indices, one entry per token; D(i) is
%                the document that token W(i) belongs to
%   nTopics    - number of topics to use for LDA
%   maxIter    - number of Gibbs sweeps over all tokens to run
%   nVocab     - (optional) number of unique words; defaults to max(W)
%   alphaPrior - (optional) scalar pseudo-count added to the topic-document
%                counts when sampling; defaults to 1
%   betaPrior  - (optional) scalar pseudo-count added to the topic-word
%                counts when sampling; defaults to 1
%
% OUTPUTS:
%   nDK    - nTopics-by-nDocs matrix; nDK(k,d) is the number of tokens in
%            document d currently assigned to topic k
%   nKW    - nTopics-by-nVocab matrix; nKW(k,w) is the number of times word
%            w is currently assigned to topic k
%   nK     - (optional) nTopics-by-1 vector with the total number of tokens
%            assigned to each topic
%   report - (optional) struct with fields DK_res and KW_res (maxIter-by-1):
%            the relative change norm(new-old)/norm(old) of nDK and nKW
%            after each sweep. This is used to confirm convergence of the
%            topic model.
%
% Requires the helper nRandNums (not defined in this file) on the path.
function [nDK, nKW, nK, report] = lda(W, D, nTopics, maxIter, nVocab, alphaPrior, betaPrior)
tic
% Work on double column vectors so the index arithmetic below is uniform.
W = double(W(:));
D = double(D(:));
if numel(W) ~= numel(D)
    error('lda:sizeMismatch', ...
        'W and D must have the same number of elements (one entry per token).');
end
nWords = numel(D);
nDocs = max(D);
if nargin < 5 || isempty(nVocab)
    nVocab = max(W);
end
% Smoothing pseudo-counts; the defaults reproduce the original fixed "+1".
if nargin < 6 || isempty(alphaPrior)
    alphaPrior = 1;
end
if nargin < 7 || isempty(betaPrior)
    betaPrior = 1;
end

%% Initialization
% Assign every token a uniformly random topic in 1..nTopics.
Z = ceil(rand(nWords,1)*nTopics);
% Build the count matrices from the random assignment in one pass each.
% number of words assigned to Topic K in Doc D
nDK = accumarray([Z, D], 1, [nTopics, nDocs]);
% number of times word w is assigned to topic k
nKW = accumarray([Z, W], 1, [nTopics, nVocab]);
% number of words assigned to Topic K
nK = accumarray(Z, 1, [nTopics, 1]);
toc

%% Begin Collapsed Gibbs sampling
% Preallocate so report is defined (and correctly sized) for any maxIter.
report.DK_res = zeros(maxIter,1);
report.KW_res = zeros(maxIter,1);
for j = 1:maxIter
    nDK_old = nDK;
    nKW_old = nKW;
    % One sweep: resample the topic of every token in turn.
    for i = 1:nWords
        word = W(i);
        topic = Z(i);
        doc = D(i);
        % Remove the token's current assignment from all counters so the
        % weights below are conditioned on every *other* token.
        nDK(topic,doc) = nDK(topic,doc) - 1;
        nKW(topic,word) = nKW(topic,word) - 1;
        nK(topic) = nK(topic) - 1;
        % Unnormalized weight of each topic for this token, vectorized
        % over all topics:
        %   (nDK(k,doc) + alpha) * (nKW(k,word) + beta) / (nK(k) + nVocab*beta)
        pTemp = ((nDK(:,doc) + alphaPrior).*(nKW(:,word) + betaPrior)) ...
            ./(nK + nVocab*betaPrior);
        % Choose a topic from the distribution.
        % NOTE(review): nRandNums is defined elsewhere; presumably it draws
        % one index with probability proportional to pTemp - confirm that it
        % accepts unnormalized weights.
        topic = nRandNums(1,pTemp);
        Z(i) = topic;
        % Add the token back under its newly sampled topic.
        nDK(topic,doc) = nDK(topic,doc) + 1;
        nKW(topic,word) = nKW(topic,word) + 1;
        nK(topic) = nK(topic) + 1;
    end % end I loop
    % Relative change of each count matrix over this sweep.
    DK_res = norm(nDK(:)-nDK_old(:))/norm(nDK_old(:));
    KW_res = norm(nKW(:)-nKW_old(:))/norm(nKW_old(:));
    report.DK_res(j,1) = DK_res;
    report.KW_res(j,1) = KW_res;
    disp(['itr: ',num2str(j), ' DK_res: ', num2str(DK_res),' KW_res: ', num2str(KW_res)])
    % Short pause kept from the original implementation.
    pause(.05)
end
disp('Done')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment