using Turing, Bijectors, ForwardDiff, DiffResults, LinearAlgebra, Optim, LineSearches

import StatsBase: sample
import Optim: optimize
#############
# Utilities #
#############
function jac_inv_transform(dist::Distribution, x::T where T<:Real)
    ForwardDiff.derivative(x -> invlink(dist, x), x)
end
function jac_inv_transform(dist::Distribution, x::Array{T} where T <: Real)
    ForwardDiff.jacobian(x -> invlink(dist, x), x)
end
function center_diag_gaussian(x, μ, σ)
    # instead of creating a diagonal matrix, we just do elementwise multiplication
    (σ .^(-1)) .* (x - μ)
end
function center_diag_gaussian_inv(η, μ, σ)
    (η .* σ) + μ
end
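# Note: `center_diag_gaussian` standardizes x under a diagonal Gaussian N(μ, diag(σ²)),
# and `center_diag_gaussian_inv` is its inverse, i.e. the reparametrization ζ = μ + σ .* η
# used below to turn a standard-normal draw η into a draw ζ from the mean-field Gaussian.
# (Illustrative sanity check: x ≈ center_diag_gaussian_inv(center_diag_gaussian(x, μ, σ), μ, σ).)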
#########################
# Variational Inference #
#########################
abstract type VariationalInference end
""" | |
sample(vi::VariationalInference, num_samples) | |
Produces `num_samples` samples for the given VI method using number of samples equal to `num_samples`. | |
""" | |
function sample(vi::VariationalInference, num_samples) end | |
""" | |
elbo(vi::VariationalInference, num_samples) | |
Computes empirical estimates of ELBO for the given VI method using number of samples equal to `num_samples`. | |
""" | |
function elbo(vi::VariationalInference, num_samples) end | |
""" | |
optimize(vi::VariationalInference) | |
Finds parameters which maximizes the ELBO for the given VI method. | |
""" | |
function optimize(vi::VariationalInference) end | |
""" | |
ADVI(model::Turing.Model) | |
Automatic Differentiation Variational Inference (ADVI) for a given model. | |
""" | |
struct ADVI{T <: Real} <: VariationalInference | |
model::Turing.Model | |
μ::Vector{T} | |
ω::Vector{T} | |
end | |
ADVI(model::Turing.Model) = begin | |
# setup | |
var_info = Turing.VarInfo() | |
model(var_info, Turing.SampleFromUniform()) | |
num_params = size(var_info.vals, 1) | |
ADVI(model, zeros(num_params), zeros(num_params)) | |
end | |
function sample(vi::ADVI, num_samples)
    # setup
    var_info = Turing.VarInfo()
    vi.model(var_info, Turing.SampleFromUniform())
    num_params = size(var_info.vals, 1)

    # convenience
    μ, ω = vi.μ, vi.ω

    # buffer
    samples = zeros(num_samples, num_params)

    for i = 1:size(var_info.dists, 1)
        prior = var_info.dists[i]
        r = var_info.ranges[i]

        # mean-field params for this set of model params
        μ_i = μ[r]
        ω_i = ω[r]

        # sample from the VI posterior using the reparametrization ζ = μ + σ .* η
        for j = 1:num_samples
            η = randn(length(μ_i))
            ζ = center_diag_gaussian_inv(η, μ_i, exp.(ω_i))

            # inverse-transform back to the original parameter space
            θ = invlink(prior, ζ)
            samples[j, r] = θ
        end
    end

    return samples
end
function optimize(vi::ADVI; samples_per_step = 10, max_iters = 500)
    # setup
    var_info = Turing.VarInfo()
    vi.model(var_info, Turing.SampleFromUniform())
    num_params = size(var_info.vals, 1)

    function objective(x)
        # extract the mean-field Gaussian params
        μ, ω = x[1:num_params], x[num_params + 1: end]

        -elbo(vi, μ, ω, samples_per_step)
    end

    # for every param we need a mean μ and a log std. deviation ω
    x = zeros(2 * num_params)
    diff_result = DiffResults.GradientResult(x)

    # used for truncated adaGrad as suggested in (Kucukelbir et al., 2015) [1]
    η = 0.1
    τ = 1.0

    ρ = zeros(2 * num_params)
    s = zeros(2 * num_params)
    g² = zeros(2 * num_params)

    # number of previous gradients to use to compute `s` in adaGrad
    stepsize_num_prev = 10
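    # The step-size computed below is ρ .= η ./ (τ .+ sqrt.(s)), where `s` is an
    # accumulation of squared gradients maintained recursively in the loop.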
    i = 0
    while (i < max_iters) # & converged # <= add criterion? A running mean maybe?
        # compute gradient
        ForwardDiff.gradient!(diff_result, objective, x)

        # recursive implementation of updating the step-size
        # if beyond the first sequence of steps we subtract off the previous g² before adding the next
        if i > stepsize_num_prev
            s -= g²
        end

        # update parameters for adaGrad
        g² .= DiffResults.gradient(diff_result).^2
        s += g²

        # compute stepsize
        @. ρ = η / (τ + sqrt(s))

        x .= x - ρ .* DiffResults.gradient(diff_result)

        @info "Step $i" ρ DiffResults.value(diff_result) norm(DiffResults.gradient(diff_result))

        i += 1
    end

    μ, ω = x[1:num_params], x[num_params + 1: end]
    return μ, ω
end
function elbo(vi::ADVI, μ::Vector{T}, ω::Vector{T}, num_samples) where T <: Real
    # setup
    var_info = Turing.VarInfo()

    # initialize the `VarInfo` object
    vi.model(var_info, Turing.SampleFromUniform())
    num_params = size(var_info.vals, 1)

    elbo_acc = 0.0

    for i = 1:num_samples
        # iterate through the priors, sample and update
        for j = 1:size(var_info.dists, 1)
            prior = var_info.dists[j]
            r = var_info.ranges[j]

            # mean-field params for this set of model params
            μ_i = μ[r]
            ω_i = ω[r]

            # obtain samples from mean-field posterior approximation
            η = randn(length(μ_i))
            ζ = center_diag_gaussian_inv(η, μ_i, exp.(ω_i))

            # inverse-transform back to original param space
            θ = invlink(prior, ζ)

            # update
            var_info.vals[r] = θ

            # add the log-det-jacobian of the inverse transform
            elbo_acc += log(abs(det(jac_inv_transform(prior, ζ)))) / num_samples
        end

        # evaluate the model log-joint with the updated variables
        vi.model(var_info)
        elbo_acc += var_info.logp / num_samples
    end

    # add the entropy of the variational posterior; for a mean-field Gaussian with
    # σ = exp.(ω) this is sum(ω) up to an additive constant, which we can drop
    variational_posterior_entropy = sum(ω)
    elbo_acc += variational_posterior_entropy

    elbo_acc
end
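# For reference, the estimator implemented above is the standard ADVI objective:
#   ELBO ≈ (1 / S) * Σₛ [ log p(x, invlink(ζₛ)) + log |det J_invlink(ζₛ)| ] + H(q)
# where ζₛ = μ + exp.(ω) .* ηₛ with ηₛ ~ N(0, I), and H(q) is the entropy of the
# mean-field Gaussian q (here sum(ω), dropping the additive constant).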
function elbo(vi::ADVI, num_samples)
    # extract the mean-field Gaussian params
    μ, ω = vi.μ, vi.ω

    elbo(vi, μ, ω, num_samples)
end
##################
# Simple example #
##################
@model demo(x) = begin
    s ~ InverseGamma(2, 3)
    m ~ Normal(0.0, sqrt(s))  # `Normal(μ, σ)` has mean μ and variance σ², i.e. parametrized by the std., not the variance
    for i = 1:length(x)
        x[i] ~ Normal(m, sqrt(s))
    end
end
# generate data
x = randn(1, 1000);

# produce "true" samples using NUTS
m = demo(x)
chain = sample(m, NUTS(2000, 200, 0.65))

# ADVI
m = demo(x)
vi = ADVI(m)  # default construction of ADVI
μ, ω = optimize(vi, samples_per_step = 5, max_iters = 5000)  # maximize the ELBO
vi = ADVI(m, μ, ω)  # construct a new ADVI instance from the optimized values
samples = sample(vi, 2000)

# quick check: compare the ADVI posterior means to the data variance and mean
println([mean(samples, dims = 1), [var(x), mean(x)]])
# closed form
using ConjugatePriors

# prior
# notation mapping has been verified by explicitly computing expressions
# in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy
μ₀ = 0.0  # => μ
κ₀ = 1.0  # => ν, which scales the precision of the Normal
α₀ = 2.0  # => "shape"
β₀ = 3.0  # => "rate", which is 1 / θ, where θ is "scale"
pri = NormalGamma(μ₀, κ₀, α₀, β₀)

# posterior
post = posterior(pri, Normal, x)
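# For reference (and as a sanity check on `posterior`), the standard NormalGamma
# conjugate updates for n observations with sample mean x̄ are:
#   κₙ = κ₀ + n
#   μₙ = (κ₀ * μ₀ + n * x̄) / κₙ
#   αₙ = α₀ + n / 2
#   βₙ = β₀ + 0.5 * Σᵢ (xᵢ - x̄)² + κ₀ * n * (x̄ - μ₀)² / (2 * κₙ)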
# marginal distribution of τ = 1 / σ²
# Eq. (90) in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy
# `scale(post)` = θ
p_τ = Gamma(post.shape, scale(post))
p_σ²_pdf = z -> pdf(p_τ, 1 / z)  # τ => 1 / σ²

# marginal of μ
# Eq. (91) in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy
p_μ = TDist(2 * post.shape)
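# Eq. (91) says that (μ - μₙ) / sqrt(βₙ / (αₙ * κₙ)) follows a standard Student-t with
# 2αₙ degrees of freedom, which is why `p_μ` is evaluated at the standardized value of z
# below. Note that the constant Jacobian factor 1 / sqrt(βₙ / (αₙ * κₙ)) is omitted there,
# which is (at least partly) why the curve is re-normalized with a Riemann sum further down.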
μₙ = post.mu     # μ → μ
κₙ = post.nu     # κ → ν
αₙ = post.shape  # α → shape
βₙ = post.rate   # β → rate

# numerically more stable but doesn't seem to have an effect; the issue is probably internal to
# `pdf`, which needs to compute ≈ Γ(1000)
p_μ_pdf = z -> exp(logpdf(p_μ, (z - μₙ) * exp(- 0.5 * log(βₙ) + 0.5 * log(αₙ) + 0.5 * log(κₙ))))
# p_μ_pdf1 = z -> pdf(p_μ, (z - μₙ) / √(βₙ / (αₙ * κₙ)))
#################
# Visualization #
#################

# visualize
using Plots, StatsPlots, LaTeXStrings
pyplot()

p1 = plot();
density!(samples[:, 1], label = "s (ADVI)", color = :blue, linestyle = :dash)
histogram!(samples[:, 1], label = "", normed = true, alpha = 0.3, color = :blue);
density!([chain[:s].value...], label = "s (NUTS)", color = :green, linestyle = :dashdot)
histogram!([chain[:s].value...], label = "", normed = true, color = :green, alpha = 0.3)
# normalize using Riemann approx. because of (almost certainly) numerical issues
Δ = 0.001
r = 0.75:Δ:1.25
norm_const = sum(p_σ²_pdf.(r) .* Δ)
plot!(r, z -> p_σ²_pdf(z) / norm_const, label = "s (posterior)", color = :red);
vline!([var(x)], label = "s (data)", linewidth = 1.5, color = :black, alpha = 0.7)

xlims!(0.5, 1.5)
title!(L"$x_i \sim \mathcal{N}(0, 1)$ for $i = 1,\dots,1000$")

p2 = plot()
density!(samples[:, 2], label = "m (ADVI)", color = :blue, linestyle = :dash)
histogram!(samples[:, 2], label = "", normed = true, alpha = 0.3, color = :blue)
density!([chain[:m].value...], label = "m (NUTS)", color = :green, linestyle = :dashdot)
histogram!([chain[:m].value...], label = "", normed = true, color = :green, alpha = 0.3)

# normalize using Riemann approx. because of (almost certainly) numerical issues
Δ = 0.0001
r = -0.1 + mean(x):Δ:0.1 + mean(x)
norm_const = sum(p_μ_pdf.(r) .* Δ)
plot!(r, z -> p_μ_pdf(z) / norm_const, label = "m (posterior)", color = :red);
vline!([mean(x)], label = "m (data)", linewidth = 1.5, color = :black, alpha = 0.7)

xlims!(-0.25, 0.25)

p = plot(p1, p2; layout = (2, 1))
savefig(p, "advi_proper.png")
Added the closed-form posterior for comparison, though I had to use a temporary work-around for numerical issues, probably due to internal computations involving the gamma function at large values. Would be interesting to look into further.
Also, this snippet has gotten to a point where it might be suitable as a standalone project. But oh well.
Using `Optim.jl` with `GradientDescent` and `LineSearches.BackTracking(order=3)` would at times fail (the line search would reach its maximum number of iterations). As mentioned before, `Optim.jl` is not made for stochastic optimization, though when it did converge the results were really good. Therefore, for consistency of experiments, I've now switched to a manual implementation of AdaGrad (see the sketch below). This at least produces consistent results across runs, in contrast to using `Optim.jl`. It will be interesting to explore the choice of optimization algorithm further in the future.
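For reference, a minimal sketch of how one might set this up with `Optim.jl` (illustrative only, not the exact code from earlier revisions), assuming `objective(x)` is the negative ELBO over the stacked parameters `[μ; ω]` and `num_params` is defined as in the gist:

```julia
using Optim, LineSearches

# gradient descent with cubic backtracking line search; gradients via ForwardDiff
result = Optim.optimize(
    objective,
    zeros(2 * num_params),
    GradientDescent(linesearch = LineSearches.BackTracking(order = 3));
    autodiff = :forward,
)

x_opt = Optim.minimizer(result)
μ, ω = x_opt[1:num_params], x_opt[num_params + 1:end]
```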
References:
[1] Kucukelbir, A., Ranganath, R., Gelman, A., & Blei, D. M. (2015). Automatic Variational Inference in Stan. CoRR.
I've made it so that we instead get an empirical estimate of the distribution using the ADVI estimate, rather than a point estimate as before. Added a comparison with NUTS. I'll look into the closed-form expression you mentioned as a point of comparison next.