gregoryely · May 5, 2013 02:16
diff --git a/lda.m b/lda.m
 %LDA Fit an LDA topic model with collapsed Gibbs sampling.
 %
 %   [nDK, nKW, nK, report] = lda(W, D, nTopics, maxIter, nVocab)
 %
 % INPUTS:
 %   W       - list of word indices, one entry per token (positive integers)
 %   D       - list of document indices, one entry per token, same length
 %             as W (positive integers)
 %   nTopics - number of topics to use for LDA
 %   maxIter - number of Gibbs sweeps over all tokens to run
 %   nVocab  - (optional) number of unique words; defaults to max(W)
 %
 % OUTPUTS:
 %   nDK    - nTopics-by-nDocs matrix: number of tokens in each document
 %            assigned to each topic (topic-document counts)
 %   nKW    - nTopics-by-nVocab matrix: number of times each word is
 %            assigned to each topic (topic-word counts)
 %   nK     - (optional) nTopics-by-1 total number of tokens assigned to
 %            each topic
 %   report - (optional) struct with fields DK_res and KW_res holding, per
 %            sweep, the relative change (2-norm) of nDK and nKW with
 %            respect to the previous sweep. This is used to confirm
 %            convergence of the topic model.
 %
 % The token-to-topic sampling relies on the helper nRandNums, which is
 % defined outside this file.

 function [nDK, nKW, nK, report] = lda(W,D,nTopics,maxIter,nVocab)

 tic
 nWords = length(D);
 if length(W) ~= nWords
     % Every token needs both a word index and a document index.
     error('lda:sizeMismatch', ...
           'W and D must have the same number of entries.');
 end
 nDocs = max(D);
 if nargin < 5 || isempty(nVocab)
     nVocab = max(W);
 end

 % Symmetric Dirichlet hyperparameters (document-topic and topic-word).
 alpha = 1;
 beta = 1;

 %% Initialization
 % Random initial topic for every token, drawn from 1..nTopics.
 Z = ceil(rand(nWords,1)*nTopics);

 % number of words assigned to topic k in document d
 nDK = zeros(nTopics,nDocs);
 % number of times word w is assigned to topic k
 nKW = zeros(nTopics,nVocab);
 % number of words assigned to topic k
 nK = zeros(nTopics,1);

 % Initialize the count matrices from the random assignment.
 for i = 1:nWords
     topic = Z(i);
     doc = D(i);
     word = W(i);

     nKW(topic,word) = nKW(topic,word) + 1;
     nDK(topic,doc) = nDK(topic,doc) + 1;
     nK(topic) = nK(topic) + 1;
 end
 toc

 %% Begin collapsed Gibbs sampling
 % Defined up front so the output exists even when maxIter < 1.
 report = struct('DK_res', zeros(0,1), 'KW_res', zeros(0,1));

 % Run exactly maxIter sweeps (the previous while/break form stopped one
 % sweep early for maxIter >= 2).
 for j = 1:maxIter

     nDK_old = nDK;
     nKW_old = nKW;

     for i = 1:nWords
         word = W(i);
         topic = Z(i);
         doc = D(i);

         % Remove the current token from the counts.
         nDK(topic,doc) = nDK(topic,doc) - 1;
         nKW(topic,word) = nKW(topic,word) - 1;
         nK(topic) = nK(topic) - 1;

         % Conditional topic distribution for this token, all topics at
         % once:
         %   p(k) ~ (nDK(k,doc) + alpha) * (nKW(k,word) + beta)
         %          / (nK(k) + nVocab*beta)
         pTemp = ((nDK(:,doc) + alpha).*(nKW(:,word) + beta))./(nK + nVocab*beta);
         % NOTE(review): nRandNums is not visible here. Normalizing
         % matches the earlier per-topic reference loop and should be
         % harmless if the helper normalizes on its own - confirm.
         pTemp = pTemp/sum(pTemp);

         % Choose a topic from the distribution.
         topic = nRandNums(1,pTemp);
         Z(i) = topic;

         % Add the token back under its new topic.
         nDK(topic,doc) = nDK(topic,doc) + 1;
         nKW(topic,word) = nKW(topic,word) + 1;
         nK(topic) = nK(topic) + 1;
     end % end token loop

     % Relative change of the count matrices over this sweep.
     DK_res = norm(nDK(:)-nDK_old(:))/norm(nDK_old(:));
     KW_res = norm(nKW(:)-nKW_old(:))/norm(nKW_old(:));

     report.DK_res(j,1) = DK_res;
     report.KW_res(j,1) = KW_res;

     disp(['itr: ',num2str(j), ' DK_res: ', num2str(DK_res),' KW_res: ', num2str(KW_res)])
     % Short pause kept from the original; presumably there so progress
     % output/figures can refresh between sweeps.
     pause(.05)
 end
 disp('Done')
	%LDA Fit an LDA topic model with collapsed Gibbs sampling.
	%
	%   [nDK, nKW, nK, report] = lda(W, D, nTopics, maxIter, nVocab)
	%
	% INPUTS:
	%   W       - list of word indices, one entry per token (positive integers)
	%   D       - list of document indices, one entry per token, same length
	%             as W (positive integers)
	%   nTopics - number of topics to use for LDA
	%   maxIter - number of Gibbs sweeps over all tokens to run
	%   nVocab  - (optional) number of unique words; defaults to max(W)
	%
	% OUTPUTS:
	%   nDK    - nTopics-by-nDocs matrix: number of tokens in each document
	%            assigned to each topic (topic-document counts)
	%   nKW    - nTopics-by-nVocab matrix: number of times each word is
	%            assigned to each topic (topic-word counts)
	%   nK     - (optional) nTopics-by-1 total number of tokens assigned to
	%            each topic
	%   report - (optional) struct with fields DK_res and KW_res holding, per
	%            sweep, the relative change (2-norm) of nDK and nKW with
	%            respect to the previous sweep. This is used to confirm
	%            convergence of the topic model.
	%
	% The token-to-topic sampling relies on the helper nRandNums, which is
	% defined outside this file.

	function [nDK, nKW, nK, report] = lda(W,D,nTopics,maxIter,nVocab)

	tic
	nWords = length(D);
	if length(W) ~= nWords
	    % Every token needs both a word index and a document index.
	    error('lda:sizeMismatch', ...
	          'W and D must have the same number of entries.');
	end
	nDocs = max(D);
	if nargin < 5 || isempty(nVocab)
	    nVocab = max(W);
	end

	% Symmetric Dirichlet hyperparameters (document-topic and topic-word).
	alpha = 1;
	beta = 1;

	%% Initialization
	% Random initial topic for every token, drawn from 1..nTopics.
	Z = ceil(rand(nWords,1)*nTopics);

	% number of words assigned to topic k in document d
	nDK = zeros(nTopics,nDocs);
	% number of times word w is assigned to topic k
	nKW = zeros(nTopics,nVocab);
	% number of words assigned to topic k
	nK = zeros(nTopics,1);

	% Initialize the count matrices from the random assignment.
	for i = 1:nWords
	    topic = Z(i);
	    doc = D(i);
	    word = W(i);

	    nKW(topic,word) = nKW(topic,word) + 1;
	    nDK(topic,doc) = nDK(topic,doc) + 1;
	    nK(topic) = nK(topic) + 1;
	end
	toc

	%% Begin collapsed Gibbs sampling
	% Defined up front so the output exists even when maxIter < 1.
	report = struct('DK_res', zeros(0,1), 'KW_res', zeros(0,1));

	% Run exactly maxIter sweeps (the previous while/break form stopped one
	% sweep early for maxIter >= 2).
	for j = 1:maxIter

	    nDK_old = nDK;
	    nKW_old = nKW;

	    for i = 1:nWords
	        word = W(i);
	        topic = Z(i);
	        doc = D(i);

	        % Remove the current token from the counts.
	        nDK(topic,doc) = nDK(topic,doc) - 1;
	        nKW(topic,word) = nKW(topic,word) - 1;
	        nK(topic) = nK(topic) - 1;

	        % Conditional topic distribution for this token, all topics at
	        % once (element-wise product and quotient):
	        %   p(k) ~ (nDK(k,doc) + alpha) * (nKW(k,word) + beta)
	        %          / (nK(k) + nVocab*beta)
	        pTemp = ((nDK(:,doc) + alpha).*(nKW(:,word) + beta))./(nK + nVocab*beta);
	        % NOTE(review): nRandNums is not visible here. Normalizing
	        % matches the earlier per-topic reference loop and should be
	        % harmless if the helper normalizes on its own - confirm.
	        pTemp = pTemp/sum(pTemp);

	        % Choose a topic from the distribution.
	        topic = nRandNums(1,pTemp);
	        Z(i) = topic;

	        % Add the token back under its new topic.
	        nDK(topic,doc) = nDK(topic,doc) + 1;
	        nKW(topic,word) = nKW(topic,word) + 1;
	        nK(topic) = nK(topic) + 1;
	    end % end token loop

	    % Relative change of the count matrices over this sweep.
	    DK_res = norm(nDK(:)-nDK_old(:))/norm(nDK_old(:));
	    KW_res = norm(nKW(:)-nKW_old(:))/norm(nKW_old(:));

	    report.DK_res(j,1) = DK_res;
	    report.KW_res(j,1) = KW_res;

	    disp(['itr: ',num2str(j), ' DK_res: ', num2str(DK_res),' KW_res: ', num2str(KW_res)])
	    % Short pause kept from the original; presumably there so progress
	    % output/figures can refresh between sweeps.
	    pause(.05)
	end
	disp('Done')