Normal Regression with a Horseshoe Prior: Gibbs Sampling for an Under-Determined Model
import time
from datetime import date, timedelta

import arviz as az
import formulaic
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy

# SciPy submodules need to be imported explicitly before use.
import scipy.linalg
import scipy.sparse
import scipy.stats

# from polyagamma import random_polyagamma

X_formula = formulaic.Formula(
    "obs ~ 1 + C(month) * C(weekday) * C(hour) * C(factor_1) * C(factor_2) * C(factor_3) * C(factor_4)"
)

def simulate_data(days_ago=100, alpha=100):
    as_of_date = date(2021, 2, 11)
    start_date, end_date = as_of_date + timedelta(days=-days_ago), as_of_date
    date_rng = pd.date_range(start=start_date, end=end_date, freq="H")
    N = len(date_rng)
    factor_1 = np.random.randint(0, 4, size=N)
    factor_2 = np.random.randint(0, 4, size=N)
    # `np.random.randint(0, 1, ...)` always returns 0, so factors 3-5 are
    # constant columns.
    factor_3 = np.random.randint(0, 1, size=N)
    factor_4 = np.random.randint(0, 1, size=N)
    factor_5 = np.random.randint(0, 1, size=N)
    data = pd.DataFrame({"obs": 0}, index=date_rng)
    data = data.assign(
        month=pd.Categorical(data.index.month, categories=list(range(1, 13))),
        weekday=pd.Categorical(data.index.weekday, categories=list(range(7))),
        hour=pd.Categorical(data.index.hour, categories=list(range(24))),
        factor_1=factor_1,
        factor_2=factor_2,
        factor_3=factor_3,
        factor_4=factor_4,
        factor_5=factor_5,
    )
    _, X_csr = X_formula.get_model_matrix(data, output="sparse")
    # Only the first ~1% of the coefficients are non-zero, with magnitudes
    # decaying (roughly) exponentially from 5 down to 1.
    beta = np.zeros(X_csr.shape[-1])
    beta[0 : (X_csr.shape[1] // 100)] = np.random.normal(
        np.exp(np.linspace(np.log(5), 0, num=X_csr.shape[1] // 100)), 1
    )
    eta = np.asarray(X_csr.dot(beta))
    # Binomial regression with N ~ Poisson
    # z = scipy.special.expit(eta)
    # q = np.random.beta(alpha * eta, alpha * (1 - eta))
    # obs = np.random.poisson(population_size * q)
    # Negative binomial regression
    # mu = np.exp(eta)
    # p = alpha / (mu + alpha)
    # obs = np.random.negative_binomial(alpha, p)
    obs = np.random.normal(eta, 1)
    data["obs"] = obs
    return data, beta

np.random.seed(4223)
sim_data, true_beta = simulate_data()
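
# Illustrative check (not part of the original gist): the crossed categorical
# formula above produces a design matrix with far more columns than rows,
# which is what makes this regression under-determined.
_, X_check = X_formula.get_model_matrix(sim_data, output="sparse")
print(f"design matrix shape (rows, columns): {X_check.shape}")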

def large_p_mvnormal_sampler(D_diag, Phi, a):
    r"""Efficiently sample from a large multivariate normal.

    This function draws samples from the following distribution:

    .. math::

        \beta \sim \operatorname{N}\left( \mu, \Sigma \right)

    where

    .. math::

        \mu = \Sigma \Phi^\top a, \\
        \Sigma = \left( \Phi^\top \Phi + D^{-1} \right)^{-1}

    and :math:`a \in \mathbb{R}^{n}`, :math:`\Phi \in \mathbb{R}^{n \times p}`.

    This approach is particularly effective when :math:`p \gg n`.

    From "Fast sampling with Gaussian scale-mixture priors in high-dimensional
    regression", Bhattacharya, Chakraborty, and Mallick, 2015.

    """
    N = a.shape[0]
    # Draw u ~ N(0, D) and delta ~ N(0, I_n), then form v = Phi @ u + delta.
    u = np.random.normal(0, np.sqrt(D_diag))
    delta = np.random.normal(size=N)
    v = Phi * u + delta
    # Solve (Phi @ D @ Phi.T + I_n) w = a - v and set beta = u + D @ Phi.T @ w.
    Phi_D = Phi.multiply(D_diag)
    Z = (Phi_D * Phi.T + scipy.sparse.eye(N)).toarray()
    w = scipy.linalg.solve(Z, a - v, assume_a="sym")
    beta = u + Phi_D.T * w
    return beta
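
# A minimal sanity check for the sampler above (added for illustration; the
# problem sizes, seed, and draw count are arbitrary choices): on a small
# problem, the empirical mean of many draws should be close to
# mu = Sigma @ Phi.T @ a with Sigma = inv(Phi.T @ Phi + inv(D)).
def check_large_p_sampler(n=5, p=8, n_draws=2000, seed=123):
    rng = np.random.RandomState(seed)
    Phi_small = scipy.sparse.csr_matrix(rng.normal(size=(n, p)))
    D_diag_small = rng.uniform(0.5, 2.0, size=p)
    a_small = rng.normal(size=n)
    Sigma = np.linalg.inv(
        (Phi_small.T @ Phi_small).toarray() + np.diag(1.0 / D_diag_small)
    )
    mu = Sigma @ (Phi_small.T @ a_small)
    draws = np.array(
        [large_p_mvnormal_sampler(D_diag_small, Phi_small, a_small) for _ in range(n_draws)]
    )
    # Monte Carlo error shrinks like 1 / sqrt(n_draws), so this should be small.
    return np.max(np.abs(draws.mean(axis=0) - mu))

# check_large_p_sampler()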

def hs_gibbs(data, num_samp=5000, alpha=100, tau2=1.0):
    """Generate posterior samples for a regression with an HS prior."""
    y_csr, X_csr = X_formula.get_model_matrix(data, output="sparse")
    X_csc = X_csr.tocsc()
    N, M = X_csc.shape
    y = y_csr.toarray().squeeze()
    lambdas = np.full(M, 1)
    etas = np.full(M, 1)
    beta = np.zeros(M)
    omega = np.ones_like(y)
    # The augmented regression is `y_aug ~ N(X @ beta, V)` with
    # `V = diag(1 / omega)`; for the plain normal model, `omega` stays at one
    # and the observation variance is just `sigma2`.
    sigma2 = 1.0
    sigma = np.sqrt(sigma2)
    y_aug = y
    trace = {"beta": [], "lambda": []}
    for i in range(num_samp):
        if i % 10 == 0:
            print(f"i = {i}")

        # TODO: Re-enable this for Polya-gamma negative-binomial models
        # random_polyagamma(alpha + y, X_csc * beta - np.log(alpha), out=omega)
        # y_aug = np.log(alpha) + (y - alpha) / (2.0 * omega)
        #
        # Efficient under-determined sampling steps...
        #
        # D_inv_diag = t2 / tau2
        # Scale the rows of the design matrix by `sqrt(omega)` (a no-op for
        # the plain normal model, where `omega == 1`).
        V_diag_inv = np.abs(omega)
        Phi = X_csc.T.multiply(np.sqrt(V_diag_inv)).T
        # Phi.nnz / np.prod(Phi.shape) == 0.003938556107574225
        D_diag = tau2 * np.power(lambdas, 2)
        beta = sigma * large_p_mvnormal_sampler(D_diag, Phi, y_aug / sigma)
        #
        # Use slice sampling on `eta = 1 / lambda**2`
        #
        # With a standard half-Cauchy prior on `lambda`, the conditional for
        # `eta` is proportional to `exp(-exp_rate * eta) / (1 + eta)`, so a
        # uniform slice variable `u_eta` on `(0, 1 / (1 + eta))` leaves `eta`
        # exponentially distributed with rate `exp_rate`, truncated to
        # `(0, (1 - u_eta) / u_eta)`.
        etas = 1.0 / np.power(lambdas, 2)
        u_eta = np.random.uniform(0, 1.0 / (1 + etas))
        exp_rate = np.power(beta, 2) / (2 * sigma2 * tau2)
        slice_upper_lim = (1 - u_eta) / u_eta
        # SciPy's truncated sampler needs an upper limit parameter that's
        # manually scaled by its `scale` parameter: e.g.
        # scipy.stats.truncexpon.rvs(upper_lim / scale, scale=scale)
        etas = scipy.stats.truncexpon.rvs(
            slice_upper_lim * exp_rate, scale=1 / exp_rate
        )
        lambdas = 1 / np.sqrt(etas)

        trace["beta"].append(beta)
        trace["lambda"].append(lambdas)

    return trace

with np.errstate(over="raise"):
    np.random.seed(90923)
    start_time = time.perf_counter()
    sample_trace = hs_gibbs(sim_data, 100, tau2=1)
    end_time = time.perf_counter()
    time_diff = end_time - start_time
    sample_rate = len(sample_trace["beta"]) / time_diff
    print(f"samples per second: {sample_rate}")

beta_samples = np.asarray(sample_trace["beta"])
beta_residuals = beta_samples - true_beta
beta_mean = beta_samples.mean(0)
print((beta_mean - true_beta).mean())
# 0.5483206279599594
# %matplotlib qt

aehmc_trace = az.from_dict(
    posterior={k: np.asarray([v]) for k, v in sample_trace.items()}
)
# ax = az.plot_trace(aehmc_trace)

beta_subset = aehmc_trace.posterior["beta"].to_dataframe()

# Let's look at the terms that are known to be non-zero...
plot_slice = slice(0, 100)
ax = beta_subset.loc[(0, slice(None), plot_slice)].boxplot(
    by="beta_dim_0", column="beta", rot=90
)
# Box positions are 1-based, hence the `+ 1` offset for the overlaid points.
plt.scatter(
    np.arange(plot_slice.stop - plot_slice.start) + 1,
    true_beta[plot_slice],
    color="red",
    label="true_beta",
)
# plt.yscale("symlog")
plt.legend()
plt.tight_layout()

# Let's look at the terms that are known to be zero...
plot_slice = slice(true_beta.shape[0] - 200, true_beta.shape[0])
ax = beta_subset.loc[(0, slice(None), plot_slice)].boxplot(
    by="beta_dim_0", column="beta", rot=90
)
plt.scatter(
    np.arange(plot_slice.stop - plot_slice.start) + 1,
    true_beta[plot_slice],
    color="red",
    label="true_beta",
)
plt.yscale("symlog")
plt.legend()
plt.tight_layout()

# TODO: 10675 has way too much variance; is that a numerical issue?
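
# Illustrative follow-up to the TODO above (added; the index 10675 comes from
# the original note and will differ across simulations): list the coefficients
# with the largest posterior variance so they can be inspected directly.
beta_post_var = beta_samples.var(axis=0)
worst_idx = np.argsort(beta_post_var)[-10:]
print(list(zip(worst_idx, beta_post_var[worst_idx])))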