thomasahle · October 19, 2025 07:30
diff --git a/random walk simulator b/random walk simulator
 # Simulate centered Bernoulli(p) random walk survival in [-1, 1]
 # and plot the estimated survival probability.
 #
 # How it works
 # ------------
 # For X_i ~ Bernoulli(p), define S_j = sum_{i=0}^j (X_i - p).
 # For a trial "survives" up to time n if, for every j in {0,...,n},
 # we have -1 <= S_j <= 1. We estimate the survival probability by
 # Monte Carlo over many independent trials.
 #
 # You can tweak the parameters in the "CONFIG" section. By default,
 # we fix p and plot the survival probability as a function of n=1..N_MAX.
 #
 # Notes
 # -----
 # - Uses only NumPy + Matplotlib (no seaborn).
 # - Each chart is a single plot (no subplots).
 # - No specific colors/styles are set (besides line style to differentiate),
 #   to satisfy the plotting constraints.
 # - For large n, the true survival probability may be extremely small; increase
 #   `TRIALS` to reduce variance, or limit to moderate n.
 #
 # Author: ChatGPT

 from typing import Sequence, Optional, Tuple
 import numpy as np
 import pandas as pd
 import math
 import matplotlib.pyplot as plt

 # For displaying tables nicely in the UI
 from caas_jupyter_tools import display_dataframe_to_user


 def _simulate_survival_once_matrix(p: float, steps: int, trials: int, seed: Optional[int] = None
                                  ) -> Tuple[np.ndarray, np.ndarray]:
    """
    Vectorized simulation:
      - Generate a (trials x steps) matrix of Bernoulli(p) draws for X_0,...,X_{steps-1}.
      - Form increments (X_i - p), take cumulative sums, and compute max |S_j| over j.
    Returns:
      cumsums: the cumulative sums matrix (trials x steps)
      max_abs_prefix: the running max of |cumsums| along the time axis (trials x steps)
    """
    rng = np.random.default_rng(seed)
    # Bernoulli draws for X_0,...,X_{steps-1}
    X = (rng.random((trials, steps)) < p).astype(np.float32)
    increments = X - np.float32(p)  # centered increments in {-p, 1-p}
    cumsums = np.cumsum(increments, axis=1, dtype=np.float32)
    # Running maximum of absolute partial sums along the time axis
    max_abs_prefix = np.maximum.accumulate(np.abs(cumsums), axis=1)
    return cumsums, max_abs_prefix


 def estimate_survival_prob(p: float, n: int, trials: int, seed: Optional[int] = None
                          ) -> Tuple[float, float]:
    """
    Estimate Pr[ max_{0<=j<=n} |S_j| <= 1 ] via Monte Carlo with `trials` independent runs.
    Returns (probability_estimate, standard_error).
    """
    steps = n + 1  # because S_j uses X_0..X_j inclusive
    _, max_abs_prefix = _simulate_survival_once_matrix(p, steps, trials, seed=seed)
    survive = max_abs_prefix[:, n] <= 1.0 + 1e-12  # include j = n
    phat = float(np.mean(survive))
    se = math.sqrt(phat * max(1.0 - phat, 0.0) / trials)
    return phat, se


 def scan_vs_n(p: float, n_max: int, trials: int, seed: Optional[int] = None,
              n_values: Optional[Sequence[int]] = None) -> pd.DataFrame:
    """
    For a fixed p, estimate survival probability for each n in n_values (default 1..n_max).
    Efficiently reuses the same simulated matrix for all prefixes 0..n_max.
    Returns a DataFrame with columns: ['n','p','prob','se','log_prob_over_n','exp_np','bound_exp','bound_one_over','bound_entropy'].
    """
    steps = n_max + 1
    _, max_abs_prefix = _simulate_survival_once_matrix(p, steps, trials, seed=seed)
    if n_values is None:
        n_values = list(range(1, n_max + 1))
    records = []
    p1mp = p * (1.0 - p)
    H = p * math.log(1.0 / p) + (1.0 - p) * math.log(1.0 / (1.0 - p))
    for n in n_values:
        survive = max_abs_prefix[:, n] <= 1.0 + 1e-12
        phat = float(np.mean(survive))
        se = math.sqrt(phat * max(1.0 - phat, 0.0) / trials)
        log_over_n = (math.log(phat) / n) if phat > 0 else float("nan")
        records.append({
            "n": n,
            "p": p,
            "prob": phat,
            "se": se,
            "log_prob_over_n": log_over_n,
            "bound_exp": math.exp(-n * p1mp),            # exp(-n p(1-p))
            "bound_one_over": (1.0 + p1mp) ** (-n),      # (1 + p(1-p))^{-n}
            "bound_entropy": math.exp(-n * H),           # exp(-n H(p))
        })
    return pd.DataFrame.from_records(records)


 def scan_vs_p(n: int, p_values: Sequence[float], trials: int, seed: Optional[int] = None) -> pd.DataFrame:
    """
    For a fixed n, estimate survival probability for each p in p_values.
    Returns a DataFrame with columns: ['p','n','prob','se','log_prob_over_n','bound_exp','bound_one_over','bound_entropy'].
    """
    records = []
    for p in p_values:
        phat, se = estimate_survival_prob(p, n, trials, seed=None if seed is None else (seed + int(1e6 * p)))
        p1mp = p * (1.0 - p)
        H = p * math.log(1.0 / p) + (1.0 - p) * math.log(1.0 / (1.0 - p))
        log_over_n = (math.log(phat) / n) if phat > 0 else float("nan")
        records.append({
            "p": p,
            "n": n,
            "prob": phat,
            "se": se,
            "log_prob_over_n": log_over_n,
            "bound_exp": math.exp(-n * p1mp),
            "bound_one_over": (1.0 + p1mp) ** (-n),
            "bound_entropy": math.exp(-n * H),
        })
    return pd.DataFrame.from_records(records)


 def plot_vs_n(df: pd.DataFrame) -> None:
    """
    Plot survival probability vs n and overlay the candidate bounds for the same p.
    """
    p = float(df["p"].iloc[0])
    plt.figure()
    plt.plot(df["n"], df["prob"], marker="o", label="Simulated survival prob.")
    plt.plot(df["n"], df["bound_exp"], linestyle="--", label="exp(-n p(1-p))")
    plt.plot(df["n"], df["bound_one_over"], linestyle=":", label="(1 + p(1-p))^{-n}")
    plt.plot(df["n"], df["bound_entropy"], linestyle="-.", label="exp(-n H(p))")
    plt.xlabel("n")
    plt.ylabel("Survival probability  (stay in [-1,1])")
    plt.title(f"Centered Bernoulli(p) walk survival in [-1,1]  (p={p:.3f})")
    plt.legend()
    plt.tight_layout()
    plt.show()


 def plot_logprob_over_n_vs_p(df: pd.DataFrame) -> None:
    """
    Plot (log survival probability)/n vs p with the two candidate exponents,
    similar in spirit to the figure described.
    """
    n = int(df["n"].iloc[0])
    plt.figure()
    plt.plot(df["p"], df["log_prob_over_n"], marker="o", label="Simulated  (log P)/n")
    # Overlay the conjectured curves transformed likewise
    plt.plot(df["p"], np.log(df["bound_exp"]) / n, linestyle="--", label="-(p(1-p))")
    plt.plot(df["p"], np.log(df["bound_one_over"]) / n, linestyle=":", label="-log(1 + p(1-p))")
    plt.plot(df["p"], np.log(df["bound_entropy"]) / n, linestyle="-.", label="-H(p)")
    plt.xlabel("p")
    plt.ylabel("(log survival probability) / n")
    plt.title(f"(log P_survival)/n vs p  (n={n})")
    plt.legend()
    plt.tight_layout()
    plt.show()


 # ======================
 # ====== CONFIG ========
 # ======================

 # Choose one of the following modes:
 #   "single": estimate for one (p, n) and print the number.
 #   "vary_n": fix p and plot survival probability vs n=1..N_MAX (default).
 #   "vary_p": fix n and plot (log P)/n vs p on a grid.
 MODE = "vary_n"  # "single", "vary_n", or "vary_p"

 # Common simulation controls
 TRIALS = 20000      # increase for more accuracy (at the cost of runtime)
 SEED = 42           # set None for different randomness each run

 # Parameters for MODE == "single"
 P_SINGLE = 0.30
 N_SINGLE = 200

 # Parameters for MODE == "vary_n"
 P_FIXED = 0.30
 N_MAX = 200
 # Optionally restrict to a subset of n to speed things up (None means 1..N_MAX)
 N_VALUES = None  # e.g., [10, 20, 30, 40, 50, 75, 100, 150, 200]

 # Parameters for MODE == "vary_p"
 N_FIXED = 150
 P_VALUES = np.round(np.linspace(0.05, 0.95, 19), 3)  # 0.05, 0.10, ..., 0.95


 # ======================
 # ====== RUN ===========
 # ======================

 if MODE == "single":
    phat, se = estimate_survival_prob(P_SINGLE, N_SINGLE, TRIALS, seed=SEED)
    print(f"Estimated survival probability for p={P_SINGLE}, n={N_SINGLE} over {TRIALS} trials: {phat:.6g} ± {se:.2g}")
 elif MODE == "vary_n":
    df = scan_vs_n(P_FIXED, N_MAX, TRIALS, seed=SEED, n_values=N_VALUES)
    # Show table for reference
    display_dataframe_to_user("Survival vs n", df)
    # Plot
    plot_vs_n(df)
 else:  # MODE == "vary_p"
    df = scan_vs_p(N_FIXED, P_VALUES, TRIALS, seed=SEED)
    display_dataframe_to_user("(log P_survival)/n vs p", df)
    plot_logprob_over_n_vs_p(df)
	# Simulate centered Bernoulli(p) random walk survival in [-1, 1]
	# and plot the estimated survival probability.
	#
	# How it works
	# ------------
	# For X_i ~ Bernoulli(p), define S_j = sum_{i=0}^j (X_i - p).
	# For a trial "survives" up to time n if, for every j in {0,...,n},
	# we have -1 <= S_j <= 1. We estimate the survival probability by
	# Monte Carlo over many independent trials.
	#
	# You can tweak the parameters in the "CONFIG" section. By default,
	# we fix p and plot the survival probability as a function of n=1..N_MAX.
	#
	# Notes
	# -----
	# - Uses only NumPy + Matplotlib (no seaborn).
	# - Each chart is a single plot (no subplots).
	# - No specific colors/styles are set (besides line style to differentiate),
	# to satisfy the plotting constraints.
	# - For large n, the true survival probability may be extremely small; increase
	# `TRIALS` to reduce variance, or limit to moderate n.
	#
	# Author: ChatGPT

	from typing import Sequence, Optional, Tuple
	import numpy as np
	import pandas as pd
	import math
	import matplotlib.pyplot as plt

	# For displaying tables nicely in the UI
	from caas_jupyter_tools import display_dataframe_to_user


	def _simulate_survival_once_matrix(p: float, steps: int, trials: int, seed: Optional[int] = None
	) -> Tuple[np.ndarray, np.ndarray]:
	"""
	Vectorized simulation:
	- Generate a (trials x steps) matrix of Bernoulli(p) draws for X_0,...,X_{steps-1}.
	- Form increments (X_i - p), take cumulative sums, and compute max \|S_j\| over j.
	Returns:
	cumsums: the cumulative sums matrix (trials x steps)
	max_abs_prefix: the running max of \|cumsums\| along the time axis (trials x steps)
	"""
	rng = np.random.default_rng(seed)
	# Bernoulli draws for X_0,...,X_{steps-1}
	X = (rng.random((trials, steps)) < p).astype(np.float32)
	increments = X - np.float32(p) # centered increments in {-p, 1-p}
	cumsums = np.cumsum(increments, axis=1, dtype=np.float32)
	# Running maximum of absolute partial sums along the time axis
	max_abs_prefix = np.maximum.accumulate(np.abs(cumsums), axis=1)
	return cumsums, max_abs_prefix


	def estimate_survival_prob(p: float, n: int, trials: int, seed: Optional[int] = None
	) -> Tuple[float, float]:
	"""
	Estimate Pr[ max_{0<=j<=n} \|S_j\| <= 1 ] via Monte Carlo with `trials` independent runs.
	Returns (probability_estimate, standard_error).
	"""
	steps = n + 1 # because S_j uses X_0..X_j inclusive
	_, max_abs_prefix = _simulate_survival_once_matrix(p, steps, trials, seed=seed)
	survive = max_abs_prefix[:, n] <= 1.0 + 1e-12 # include j = n
	phat = float(np.mean(survive))
	se = math.sqrt(phat * max(1.0 - phat, 0.0) / trials)
	return phat, se


	def scan_vs_n(p: float, n_max: int, trials: int, seed: Optional[int] = None,
	n_values: Optional[Sequence[int]] = None) -> pd.DataFrame:
	"""
	For a fixed p, estimate survival probability for each n in n_values (default 1..n_max).
	Efficiently reuses the same simulated matrix for all prefixes 0..n_max.
	Returns a DataFrame with columns: ['n','p','prob','se','log_prob_over_n','exp_np','bound_exp','bound_one_over','bound_entropy'].
	"""
	steps = n_max + 1
	_, max_abs_prefix = _simulate_survival_once_matrix(p, steps, trials, seed=seed)
	if n_values is None:
	n_values = list(range(1, n_max + 1))
	records = []
	p1mp = p * (1.0 - p)
	H = p * math.log(1.0 / p) + (1.0 - p) * math.log(1.0 / (1.0 - p))
	for n in n_values:
	survive = max_abs_prefix[:, n] <= 1.0 + 1e-12
	phat = float(np.mean(survive))
	se = math.sqrt(phat * max(1.0 - phat, 0.0) / trials)
	log_over_n = (math.log(phat) / n) if phat > 0 else float("nan")
	records.append({
	"n": n,
	"p": p,
	"prob": phat,
	"se": se,
	"log_prob_over_n": log_over_n,
	"bound_exp": math.exp(-n * p1mp), # exp(-n p(1-p))
	"bound_one_over": (1.0 + p1mp) ** (-n), # (1 + p(1-p))^{-n}
	"bound_entropy": math.exp(-n * H), # exp(-n H(p))
	})
	return pd.DataFrame.from_records(records)


	def scan_vs_p(n: int, p_values: Sequence[float], trials: int, seed: Optional[int] = None) -> pd.DataFrame:
	"""
	For a fixed n, estimate survival probability for each p in p_values.
	Returns a DataFrame with columns: ['p','n','prob','se','log_prob_over_n','bound_exp','bound_one_over','bound_entropy'].
	"""
	records = []
	for p in p_values:
	phat, se = estimate_survival_prob(p, n, trials, seed=None if seed is None else (seed + int(1e6 * p)))
	p1mp = p * (1.0 - p)
	H = p * math.log(1.0 / p) + (1.0 - p) * math.log(1.0 / (1.0 - p))
	log_over_n = (math.log(phat) / n) if phat > 0 else float("nan")
	records.append({
	"p": p,
	"n": n,
	"prob": phat,
	"se": se,
	"log_prob_over_n": log_over_n,
	"bound_exp": math.exp(-n * p1mp),
	"bound_one_over": (1.0 + p1mp) ** (-n),
	"bound_entropy": math.exp(-n * H),
	})
	return pd.DataFrame.from_records(records)


	def plot_vs_n(df: pd.DataFrame) -> None:
	"""
	Plot survival probability vs n and overlay the candidate bounds for the same p.
	"""
	p = float(df["p"].iloc[0])
	plt.figure()
	plt.plot(df["n"], df["prob"], marker="o", label="Simulated survival prob.")
	plt.plot(df["n"], df["bound_exp"], linestyle="--", label="exp(-n p(1-p))")
	plt.plot(df["n"], df["bound_one_over"], linestyle=":", label="(1 + p(1-p))^{-n}")
	plt.plot(df["n"], df["bound_entropy"], linestyle="-.", label="exp(-n H(p))")
	plt.xlabel("n")
	plt.ylabel("Survival probability (stay in [-1,1])")
	plt.title(f"Centered Bernoulli(p) walk survival in [-1,1] (p={p:.3f})")
	plt.legend()
	plt.tight_layout()
	plt.show()


	def plot_logprob_over_n_vs_p(df: pd.DataFrame) -> None:
	"""
	Plot (log survival probability)/n vs p with the two candidate exponents,
	similar in spirit to the figure described.
	"""
	n = int(df["n"].iloc[0])
	plt.figure()
	plt.plot(df["p"], df["log_prob_over_n"], marker="o", label="Simulated (log P)/n")
	# Overlay the conjectured curves transformed likewise
	plt.plot(df["p"], np.log(df["bound_exp"]) / n, linestyle="--", label="-(p(1-p))")
	plt.plot(df["p"], np.log(df["bound_one_over"]) / n, linestyle=":", label="-log(1 + p(1-p))")
	plt.plot(df["p"], np.log(df["bound_entropy"]) / n, linestyle="-.", label="-H(p)")
	plt.xlabel("p")
	plt.ylabel("(log survival probability) / n")
	plt.title(f"(log P_survival)/n vs p (n={n})")
	plt.legend()
	plt.tight_layout()
	plt.show()


	# ======================
	# ====== CONFIG ========
	# ======================

	# Choose one of the following modes:
	# "single": estimate for one (p, n) and print the number.
	# "vary_n": fix p and plot survival probability vs n=1..N_MAX (default).
	# "vary_p": fix n and plot (log P)/n vs p on a grid.
	MODE = "vary_n" # "single", "vary_n", or "vary_p"

	# Common simulation controls
	TRIALS = 20000 # increase for more accuracy (at the cost of runtime)
	SEED = 42 # set None for different randomness each run

	# Parameters for MODE == "single"
	P_SINGLE = 0.30
	N_SINGLE = 200

	# Parameters for MODE == "vary_n"
	P_FIXED = 0.30
	N_MAX = 200
	# Optionally restrict to a subset of n to speed things up (None means 1..N_MAX)
	N_VALUES = None # e.g., [10, 20, 30, 40, 50, 75, 100, 150, 200]

	# Parameters for MODE == "vary_p"
	N_FIXED = 150
	P_VALUES = np.round(np.linspace(0.05, 0.95, 19), 3) # 0.05, 0.10, ..., 0.95


	# ======================
	# ====== RUN ===========
	# ======================

	if MODE == "single":
	phat, se = estimate_survival_prob(P_SINGLE, N_SINGLE, TRIALS, seed=SEED)
	print(f"Estimated survival probability for p={P_SINGLE}, n={N_SINGLE} over {TRIALS} trials: {phat:.6g} ± {se:.2g}")
	elif MODE == "vary_n":
	df = scan_vs_n(P_FIXED, N_MAX, TRIALS, seed=SEED, n_values=N_VALUES)
	# Show table for reference
	display_dataframe_to_user("Survival vs n", df)
	# Plot
	plot_vs_n(df)
	else: # MODE == "vary_p"
	df = scan_vs_p(N_FIXED, P_VALUES, TRIALS, seed=SEED)
	display_dataframe_to_user("(log P_survival)/n vs p", df)
	plot_logprob_over_n_vs_p(df)
No results found