albertbuchard · May 15, 2025 18:40
diff --git a/test_preweight_equivalence_full.py b/test_preweight_equivalence_full.py
 """
 This example demonstrates that fitting a weighted linear mixed-effects model using lmer(weights = …) 
 is algebraically equivalent to pre-multiplying the outcome and all predictors (including intercepts) 
 by the square root of the weights and fitting an unweighted model on the transformed data. 
 The test validates that this "pre-weight and refit" strategy yields identical fitted values and 
 is especially useful when packages like clubSandwich prohibit prior weights in robust variance estimation. 
 Includes realistic longitudinal synthetic data with fixed and random effects, dropout, and continuous covariates.
 """
 from typing import List

 import numpy as np
 import pandas as pd
 import pytest
 import rpy2.robjects as ro
 from rpy2.interactive.packages import importr
 from rpy2.robjects import pandas2ri

 pandas2ri.activate()


 # ---------- helpers ----------------------------------------------------------
 def ensure_r_packages(pkgs: List[str]) -> None:
    """Install any missing R packages from CRAN."""
    utils = importr("utils")
    utils.chooseCRANmirror(ind=1, graphics=False)
    installed = set(ro.r('installed.packages()[,"Package"]'))
    to_install = [p for p in pkgs if p not in installed]
    if to_install:
        utils.install_packages(
            ro.StrVector(to_install), repos="https://cloud.r-project.org"
        )


 # realistic synthetic data ---------------------------------------------------
 def _realistic_dataset(n_patients: int = 8, n_sessions: int = 5) -> pd.DataFrame:
    rng = np.random.default_rng(2025)
    rows = []
    for pid in range(1, n_patients + 1):
        age = rng.integers(25, 50)
        sex = rng.integers(0, 2)
        last = rng.integers(n_sessions - 1, n_sessions + 1)
        for sess in range(1, last + 1):
            stress = rng.normal()
            ysq = rng.normal()
            y = (
                30
                - 0.6 * age
                + 3.5 * sex
                - 1.2 * stress
                + 2.0 * ysq
                + 1.8 * sess
                + rng.normal(scale=2)
            )
            rows.append(
                dict(
                    id=pid,
                    session=sess,
                    age=age,
                    sex=sex,
                    stress=stress,
                    ysq_all_schema_mean_score=ysq,
                    y=y,
                )
            )
    df = pd.DataFrame(rows)
    df["ipaw"] = rng.gamma(shape=2.0, scale=0.8, size=len(df))
    return df


 # integration test -----------------------------------------------------------
 @pytest.mark.integration
 def test_preweight_refit_equivalence_full():
    ensure_r_packages(["dplyr", "lme4"])
    df = _realistic_dataset()
    ro.globalenv["dat"] = pandas2ri.py2rpy(df)

    r_code = """
      suppressPackageStartupMessages({library(dplyr); library(lme4)})

      ## weighted fit -------------------------------------------------
      fit_w <- lmer(
        y ~ session + age + sex + stress + ysq_all_schema_mean_score +
             (session | id),
        data = dat, weights = ipaw, REML = FALSE
      )
      yhat_w <- fitted(fit_w)

      ## pre-weighted refit  (explicit intercept) --------------------
      dat_pw <- dat %>%
        mutate(
          w_sqrt = sqrt(ipaw),
          intercept_star = w_sqrt,
          y_star         = w_sqrt * y,
          session_star   = w_sqrt * session,
          age_star       = w_sqrt * age,
          sex_star       = w_sqrt * sex,
          stress_star    = w_sqrt * stress,
          ysq_star       = w_sqrt * ysq_all_schema_mean_score
        )

      fit_pw <- lmer(
        y_star ~ -1 + intercept_star + session_star + age_star +
                 sex_star + stress_star + ysq_star +
                 (0 + intercept_star + session_star | id),
        data = dat_pw, REML = FALSE
      )

      ## compare fitted values --------------------------------------
      yhat_pw_back <- fitted(fit_pw) / dat_pw$w_sqrt
      max_abs_diff <- max(abs(yhat_w - yhat_pw_back))
    """
    ro.r(r_code)
    diff = float(ro.globalenv["max_abs_diff"][0])

    assert diff < 1e-8, f"max |Δ fitted| = {diff:.3e}"
	"""
	This example demonstrates that fitting a weighted linear mixed-effects model using lmer(weights = …)
	is algebraically equivalent to pre-multiplying the outcome and all predictors (including intercepts)
	by the square root of the weights and fitting an unweighted model on the transformed data.
	The test validates that this "pre-weight and refit" strategy yields identical fitted values and
	is especially useful when packages like clubSandwich prohibit prior weights in robust variance estimation.
	Includes realistic longitudinal synthetic data with fixed and random effects, dropout, and continuous covariates.
	"""
	from typing import List

	import numpy as np
	import pandas as pd
	import pytest
	import rpy2.robjects as ro
	from rpy2.interactive.packages import importr
	from rpy2.robjects import pandas2ri

	pandas2ri.activate()


	# ---------- helpers ----------------------------------------------------------
	def ensure_r_packages(pkgs: List[str]) -> None:
	"""Install any missing R packages from CRAN."""
	utils = importr("utils")
	utils.chooseCRANmirror(ind=1, graphics=False)
	installed = set(ro.r('installed.packages()[,"Package"]'))
	to_install = [p for p in pkgs if p not in installed]
	if to_install:
	utils.install_packages(
	ro.StrVector(to_install), repos="https://cloud.r-project.org"
	)


	# realistic synthetic data ---------------------------------------------------
	def _realistic_dataset(n_patients: int = 8, n_sessions: int = 5) -> pd.DataFrame:
	rng = np.random.default_rng(2025)
	rows = []
	for pid in range(1, n_patients + 1):
	age = rng.integers(25, 50)
	sex = rng.integers(0, 2)
	last = rng.integers(n_sessions - 1, n_sessions + 1)
	for sess in range(1, last + 1):
	stress = rng.normal()
	ysq = rng.normal()
	y = (
	30
	- 0.6 * age
	+ 3.5 * sex
	- 1.2 * stress
	+ 2.0 * ysq
	+ 1.8 * sess
	+ rng.normal(scale=2)
	)
	rows.append(
	dict(
	id=pid,
	session=sess,
	age=age,
	sex=sex,
	stress=stress,
	ysq_all_schema_mean_score=ysq,
	y=y,
	)
	)
	df = pd.DataFrame(rows)
	df["ipaw"] = rng.gamma(shape=2.0, scale=0.8, size=len(df))
	return df


	# integration test -----------------------------------------------------------
	@pytest.mark.integration
	def test_preweight_refit_equivalence_full():
	ensure_r_packages(["dplyr", "lme4"])
	df = _realistic_dataset()
	ro.globalenv["dat"] = pandas2ri.py2rpy(df)

	r_code = """
	suppressPackageStartupMessages({library(dplyr); library(lme4)})

	## weighted fit -------------------------------------------------
	fit_w <- lmer(
	y ~ session + age + sex + stress + ysq_all_schema_mean_score +
	(session \| id),
	data = dat, weights = ipaw, REML = FALSE
	)
	yhat_w <- fitted(fit_w)

	## pre-weighted refit (explicit intercept) --------------------
	dat_pw <- dat %>%
	mutate(
	w_sqrt = sqrt(ipaw),
	intercept_star = w_sqrt,
	y_star = w_sqrt * y,
	session_star = w_sqrt * session,
	age_star = w_sqrt * age,
	sex_star = w_sqrt * sex,
	stress_star = w_sqrt * stress,
	ysq_star = w_sqrt * ysq_all_schema_mean_score
	)

	fit_pw <- lmer(
	y_star ~ -1 + intercept_star + session_star + age_star +
	sex_star + stress_star + ysq_star +
	(0 + intercept_star + session_star \| id),
	data = dat_pw, REML = FALSE
	)

	## compare fitted values --------------------------------------
	yhat_pw_back <- fitted(fit_pw) / dat_pw$w_sqrt
	max_abs_diff <- max(abs(yhat_w - yhat_pw_back))
	"""
	ro.r(r_code)
	diff = float(ro.globalenv["max_abs_diff"][0])

	assert diff < 1e-8, f"max \|Δ fitted\| = {diff:.3e}"