Skip to content

Instantly share code, notes, and snippets.

@torfjelde
Created October 6, 2020 14:50
Show Gist options
  • Save torfjelde/943b04dec53e7dba86755e1d9981e226 to your computer and use it in GitHub Desktop.
Save torfjelde/943b04dec53e7dba86755e1d9981e226 to your computer and use it in GitHub Desktop.
Simple example of using AdvancedVI.jl + Bijectors.jl to construct an approximate/variational posterior.
julia> using Bijectors
julia> using ForwardDiff
julia> using ComponentArrays, UnPack
julia> using Bijectors: LeakyReLU, Shift, Scale
julia> # Affine transformation is simply a `Scale` composed with `Shift`
using Bijectors: Shift, Scale
julia> function Affine(W::AbstractMatrix, b::AbstractVector; dim::Val{N} = Val(1)) where {N}
# Affine bijector x ↦ W*x + b, built by composing a `Scale` by `W` with a
# `Shift` by `b` (applied right-to-left, so the scale happens first).
# `dim` is forwarded to both component bijectors.
return Shift(b; dim = dim) ∘ Scale(W; dim = dim)
end
Affine (generic function with 1 method)
julia> # Example model whose posterior we will approximate
using Turing
julia> @model function demo(xs)
# Prior on the variance `s`.
s ~ InverseGamma(2, 3)
# Prior on the mean `m`; `√s` is the standard deviation.
m ~ Normal(0, √s)
# Likelihood: each observation is drawn iid from Normal(m, √s).
for i in eachindex(xs)
xs[i] ~ Normal(m, √s)
end
end;
julia> # Generate some data and condition
xs = 0.1 .* randn(1000) .+ 1.;
julia> m = demo(xs);
julia> #########################################################
### Setup for the approximating/variational posterior ###
#########################################################
d = 2;
julia> num_layers = 2;
julia> # Structuring
proto_arrs = [
ComponentArray(A = randn(d, d), b = randn(d), α = randn())
for l = 1:num_layers
];
julia> proto_axes = getaxes.(proto_arrs);
julia> num_params = length.(proto_arrs);
julia> # Maps an array representing all the parameters to the approximate distribution
# Map a flat parameter vector `θ` to the approximate (variational) distribution.
#
# Consumes `num_params[l]` entries of `θ` per layer `l`. Each layer is an
# affine bijector, followed by a `LeakyReLU` nonlinearity for all but the
# final layer. The layers are composed with `Bijectors.composel` and pushed
# through `to_constrained` so samples land in the model's support, then
# applied to the `base` distribution via `transformed`.
#
# Relies on the globals `proto_axes`, `num_params`, `num_layers`, `base`,
# and `to_constrained` defined elsewhere in this session.
function getq(θ)
    idx = 1
    layers = map(enumerate(proto_axes)) do (l, layer_axes)
        k = num_params[l]
        # Re-attach the axes so the layer's parameters can be unpacked by name.
        @unpack A, b, α = ComponentArray(θ[idx:idx + k - 1], layer_axes)
        idx += k
        aff = Affine(A, b)
        # `abs(α) + 0.01` keeps the negative-slope parameter strictly positive.
        nonlinearity = LeakyReLU(abs(α) + 0.01; dim = Val(1))
        # The last layer is purely affine; earlier layers get the nonlinearity.
        l == num_layers ? aff : nonlinearity ∘ aff
    end
    b = to_constrained ∘ Bijectors.composel(layers...)
    return transformed(base, b)
end
getq (generic function with 1 method)
julia> # Construct the approximate/variational posterior
base = MvNormal(zeros(2), ones(2));
julia> to_unconstrained = bijector(m);
julia> to_constrained = inv(to_unconstrained);
julia> ######################################################
### Fitting the approximate/variational posterior ####
######################################################
# Configuration for the VI algorithm
advi = ADVI(10, 1_000)
ADVI{AdvancedVI.ForwardDiffAD{40}}(10, 1000)
julia> # Initial parameters
θ = randn(sum(num_params));
julia> # Turing.jl provides the impl of `vi` for `Model` `m`
# q = vi(m, advi, getq, θ ./ sum(num_params))
# Or we provide the map `θ ↦ logπ(data, θ)` directly (needed to compute the ELBO).
logπ = Turing.Variational.make_logjoint(m);
julia> q_test = getq(θ);
julia> logπ(rand(q_test)) # check that it works
-865.0655685335663
julia> q = vi(logπ, advi, getq, θ ./ sum(num_params));
┌ Info: [ADVI] Should only be seen once: optimizer created for θ
└ objectid(θ) = 0x77b5e7a2a094bb51
[ADVI] Optimizing...100% Time: 0:00:31
julia> # Check the result
q_samples = rand(q, 1000);
julia> mean(q_samples; dims = 2)
2×1 Array{Float64,2}:
0.0172600897154137
0.979920660898407
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment