SOME MATLAB
% Function lda takes a stream of word tokens and their document assignments
% and generates the count matrices for LDA via collapsed Gibbs sampling.
% INPUTS:
%   W       - vector of vocabulary indices, one entry per word token
%   D       - vector of document indices, one entry per word token
%   nTopics - number of topics to use for LDA
%   maxIter - maximum number of Gibbs sampling sweeps
%   nVocab  - (optional) number of unique words in the vocabulary
%
% OUTPUTS:
%   nDK     - the Topic-Document count matrix (nTopics x nDocs)
%   nKW     - the Topic-Word count matrix (nTopics x nVocab)
%   (optional)
%   nK      - the total number of word tokens assigned to each topic
%   report  - struct containing the relative change (residual) of the nDK and
%             nKW matrices as a function of iteration. This is used to confirm
%             convergence of the topic model.
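% Example encoding (illustrative values only): with vocabulary
% {1='river', 2='bank', 3='money'}, the two documents "river bank river"
% and "bank money" would be passed token-by-token as
%   W = [1 2 1 2 3]';   D = [1 1 1 2 2]';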
function [nDK, nKW, nK, report] = lda(W, D, nTopics, maxIter, nVocab)
tic
nWords = length(D);          % total number of word tokens
nDocs  = max(D);             % documents are assumed to be numbered 1..nDocs
if ~exist('nVocab','var')
    nVocab = max(W);         % infer the vocabulary size if it was not given
end
%% Initialization
% Assign every token to a topic uniformly at random.
Z = ceil(rand(nWords,1)*nTopics);
% nDK(k,d): number of words assigned to topic k in document d
nDK = zeros(nTopics,nDocs);
% nKW(k,w): number of times word w is assigned to topic k
nKW = zeros(nTopics,nVocab);
% nK(k): number of words assigned to topic k
nK = zeros(nTopics,1);
% Initialize the count matrices from the random topic assignments.
for i = 1:nWords
    topic = Z(i);
    doc   = D(i);
    word  = W(i);
    nKW(topic,word) = nKW(topic,word) + 1;
    nDK(topic,doc)  = nDK(topic,doc) + 1;
    nK(topic,1)     = nK(topic,1) + 1;
end
toc
%% Begin Collapsed Gibbs sampling
j = 1;
while true
    nDK_old = nDK;
    nKW_old = nKW;
    % Sweep over every word token in the corpus.
    for i = 1:nWords
        word  = W(i);
        topic = Z(i);
        doc   = D(i);
        % Remove this token's current assignment from the counts.
        nDK(topic,doc)  = nDK(topic,doc) - 1;
        nKW(topic,word) = nKW(topic,word) - 1;
        nK(topic)       = nK(topic) - 1;
        % Conditional distribution over topics for this token (collapsed
        % Gibbs update) with symmetric priors alpha = beta = 1:
        %   p(z_i = k | .) proportional to
        %     (nDK(k,doc) + alpha) * (nKW(k,word) + beta) / (nK(k) + nVocab*beta)
        % The loop over topics is vectorized into a single line; pTemp holds
        % the unnormalized topic weights.
        pTemp = ((nDK(:,doc) + 1).*(nKW(:,word) + 1))./(nK + nVocab*1);
        % Draw a new topic from the distribution (nRandNums samples one
        % index from the weight vector pTemp).
        topic = nRandNums(1,pTemp);
        Z(i) = topic;
        % Add the token back into the counts under its new topic.
        nDK(topic,doc)  = nDK(topic,doc) + 1;
        nKW(topic,word) = nKW(topic,word) + 1;
        nK(topic)       = nK(topic) + 1;
    end % end token loop
    % Calculate the relative residuals to track convergence.
    DK_res = norm(nDK(:)-nDK_old(:))/norm(nDK_old(:));
    KW_res = norm(nKW(:)-nKW_old(:))/norm(nKW_old(:));
    report.DK_res(j,1) = DK_res;
    report.KW_res(j,1) = KW_res;
    disp(['itr: ',num2str(j), ' DK_res: ', num2str(DK_res),' KW_res: ', num2str(KW_res)])
    j = j + 1;
    %bar(nK)           % optional live plot of the topic counts
    pause(.05)         % brief pause so the optional plot can refresh
    if j > maxIter     % stop after maxIter sweeps
        break
    end
end
disp('Done')
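The helper nRandNums called above is not included in this gist. The sketch below is one possible stand-in, assuming nRandNums(n, p) draws n indices from the discrete distribution defined by the (possibly unnormalized) weight vector p; the author's actual helper may differ. It can be saved as nRandNums.m or appended to this file as a subfunction.

% nRandNums - sketch of the missing sampling helper (assumed behavior only):
% draw n indices from the discrete distribution with unnormalized weights p.
function idx = nRandNums(n, p)
cdf = cumsum(p(:));
cdf = cdf / cdf(end);                        % normalize so the last edge is exactly 1
idx = zeros(n, 1);
for s = 1:n
    idx(s) = find(rand() < cdf, 1, 'first'); % inverse-CDF (roulette wheel) sampling
end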
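A minimal usage sketch follows. The toy corpus, topic count, and iteration count are illustrative assumptions, not part of the gist; the normalization uses the same symmetric priors of 1 as the sampler above.

% Illustrative call on a tiny synthetic corpus (a real corpus would have far more tokens).
W = [1 2 1 3 4 3 5 4]';     % vocabulary index of each word token
D = [1 1 1 2 2 2 3 3]';     % document index of each word token
[nDK, nKW, nK, report] = lda(W, D, 2, 50);

% Turn the counts into the usual LDA estimates:
% theta is nDocs-by-nTopics, phi is nTopics-by-nVocab.
theta = bsxfun(@rdivide, (nDK + 1)', sum(nDK + 1, 1)');  % p(topic | document)
phi   = bsxfun(@rdivide,  nKW + 1,  sum(nKW + 1, 2));    % p(word | topic)

% Confirm convergence by inspecting the residuals recorded in report.
plot([report.DK_res report.KW_res])
legend('nDK residual','nKW residual'), xlabel('iteration')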