%
% This script looks at the KL-divergence between a given distribution and a
% set of distributions, and at the average KL-divergence between the given
% distribution and each individual distribution.
%
% Author: Nathan Jacobs (2016)
%
% If you see any problems with this, please let me know at
% [email protected].
%

%
% Concern: in the variational autoencoder paper, the latent loss pushes each
% individual distribution q(z|x_i) toward a unit Gaussian, but that is not
% the same as pushing the combination of all of those distributions toward a
% unit Gaussian... or is it?
%

%
% In other words, what is the relationship between
% (1/N) \sum_i KL(q(z|x_i) || N(0,1)) and KL((1/N) \sum_i q(z|x_i) || N(0,1))?
%

%
% Conclusion (from running the code below): the KL-divergence term used in
% the paper is an upper bound on the KL-divergence of the mixture of the
% per-example distributions from the unit Gaussian.
%
% This can also be derived analytically from the convexity of the
% KL-divergence over the domain of probability distributions. See Lemma 1 in
% http://homes.cs.washington.edu/~anuprao/pubs/CSE533Autumn2010/lecture3.pdf
% for a nice derivation.
%
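% Spelled out in the notation above (a sketch of that convexity argument):
% for distributions q_1, ..., q_N and a fixed reference p = N(0,1),
%
%   KL( (1/N) \sum_i q_i || p )  <=  (1/N) \sum_i KL( q_i || p ),
%
% because KL(q || p) is convex in q for a fixed p. So the averaged
% per-example term that the paper penalizes upper-bounds the KL-divergence
% of the aggregate (mixture) distribution from the unit Gaussian.
%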

% define KL( N(m, diag(v)) || N(0, I) ) using the closed-form expression from
% http://vdumoulin.github.io/morphing_faces/#variational-autoencoders
KL = @(m,v) .5*sum(v + m.^2 - 1 - log(v));
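
% A quick sanity check of the closed-form expression: compare it against a
% plain Monte-Carlo estimate E_q[log q(z) - log p(z)] for one random 1-D
% Gaussian q = N(m0, v0). (m0, v0, z0 are throwaway variables used only here.)
m0 = randn; v0 = .1 + 2*rand;
z0 = m0 + sqrt(v0)*randn(1,1e5);                 % samples from q
logq = -.5*log(2*pi*v0) - (z0 - m0).^2/(2*v0);   % log q(z0)
logp = -.5*log(2*pi) - z0.^2/2;                  % log N(z0; 0, 1)
fprintf('analytic KL = %.4f, Monte-Carlo KL = %.4f\n', KL(m0,v0), mean(logq - logp));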

d = 1;   % dimensions
N = 2;   % number of mixture components
k = 100; % samples per mixture component

MAX_ITER = 200;

% one column per iteration: [aggregate KL estimate; average per-example KL]
vals = nan(2,MAX_ITER);

for iIter = 1:MAX_ITER

    %
    % make some sample distributions
    %

    % sample some means and variances for the N mixture components
    means = randn(d,N);
    vars = .1 + 2*rand(d,N); % keep the variances bounded away from zero

    % draw k samples from each of the N Gaussians
    samps = zeros(size(means,1),k,size(means,2));
    for ix = 1:N
        % mvnrnd (Statistics Toolbox) expects the mean as a row vector
        samps(:,:,ix) = mvnrnd(means(:,ix)', diag(vars(:,ix)), k)';
    end

    % pool everything into a single d-by-(k*N) matrix of samples
    samps = reshape(samps,d,[]);

    %
    % compute the KL divergences
    %

    % fit a single diagonal Gaussian to the pooled samples (moment matching)
    mu_est = mean(samps,2);
    cov_est = cov(samps');

    % KL of the moment-matched aggregate Gaussian to the unit Gaussian
    kl_average_first = KL(mu_est,diag(cov_est));

    % average of the per-example KLs (the term the VAE paper penalizes)
    kl_average_second = 0;
    for ix = 1:N
        kl_average_second = kl_average_second + KL(means(:,ix), vars(:,ix));
    end
    kl_average_second = kl_average_second / N;
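
    % An optional cross-check on the sample-based aggregate estimate: for an
    % equal-weight mixture the pooled mean and variance are available in
    % closed form, so the moment-matched KL can be computed without sampling
    % noise (mu_mix / var_mix / kl_mixture_exact are only used here).
    mu_mix = mean(means, 2);
    var_mix = mean(vars + means.^2, 2) - mu_mix.^2;
    kl_mixture_exact = KL(mu_mix, var_mix);
    if iIter == 1
        fprintf('sampled vs exact aggregate KL: %.4f vs %.4f\n', ...
            kl_average_first, kl_mixture_exact);
    end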

    %
    % plot the results
    %
    vals(:,iIter) = [kl_average_first kl_average_second];

    figure(1); clf;
    plot(vals(1,:),vals(2,:),'.');
    hold on
    plot([0 max(vals(:))],[0 max(vals(:))]); % the y = x reference line
    hold off
    xlabel({'KL-divergence of the pooled (aggregate) samples to a unit Gaussian', ...
        '(what would make the combined latent distribution a unit Gaussian)'})
    ylabel({'Average of the per-example KL-divergences', ...
        '(what the variational autoencoder paper penalizes)'});

    pause(.001)

end
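
% A small optional summary after the loop: count how often the averaged
% per-example KL was at least as large as the sample-based aggregate estimate.
% Up to sampling noise in the aggregate estimate, the bound should hold on
% essentially every iteration.
fprintf('bound held on %d of %d iterations\n', sum(vals(2,:) >= vals(1,:)), MAX_ITER);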