Created
July 31, 2025 09:09
-
-
Save al6x/11e66ab92c525f2ef2c1510e6ac7a3f7 to your computer and use it in GitHub Desktop.
Tail Estimators
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import matplotlib.pyplot as plt | |
from scipy.stats import t | |
from scipy.optimize import minimize_scalar, minimize | |
def student_sample(df, n, seed=None): | |
rng = np.random.default_rng(seed) | |
return rng.standard_t(df, size=n) | |
def estimate_hill(x): | |
x = np.sort(x)[::-1] | |
logx = np.log(x) | |
sumk = np.cumsum(logx) | |
k = np.arange(1, len(x)+1) | |
denom = (sumk/k) - logx | |
return np.array([1/d if d != 0 else np.nan for d in denom]) | |
def fit_mle_student(x): | |
def nll(df): | |
return np.inf if df <= 2 else -np.sum(t.logpdf(x, df, loc=0, scale=1)) | |
res = minimize_scalar(nll, bounds=(2.01, 100), method='bounded') | |
if not res.success: | |
raise RuntimeError("MLE fit failed") | |
return res.x | |
def estimate_gpd_mle(x, threshold, init=(1/2.1, 1.0), min_exceed=5): | |
# Generalized Pareto Distribution (GPD) MLE estimation | |
y = x[x > threshold] - threshold | |
if y.size < min_exceed: return None | |
def nll(params): | |
ξ, β = params | |
if β <= 0: return np.inf | |
base = 1 + ξ * y / β | |
if np.any(base <= 0): return np.inf | |
pdf = (1 / β) * base ** -(1/ξ + 1) | |
if np.any(pdf <= 0) or np.any(~np.isfinite(pdf)): return np.inf | |
return -np.sum(np.log(pdf)) | |
res = minimize(nll, x0=init, bounds=((1/20, 1/2), (1e-6, 1000))) | |
if not res.success: return None | |
ξ, β = res.x | |
return (1/ξ, β) # return df = 1/xi | |
def estimate_gpd_mle_and_prepare_data_for_plot(x, K, count=20, init=(0.2, 1.0)): | |
# Ignore, nothing interesting, just some data preparation for the plot | |
x_sorted = np.sort(x) | |
ranks = np.linspace(10, K, count).astype(int) | |
df_estimates = np.full_like(ranks, np.nan, dtype=float) | |
current_init = init | |
for i, k in enumerate(ranks): | |
threshold = x_sorted[-k] | |
res = estimate_gpd_mle(x, threshold, init=current_init) | |
if res is None: continue | |
df, beta = res | |
df_estimates[i] = df | |
# warm start next with xi,beta | |
current_init = (1/df, beta) | |
return ranks, df_estimates | |
def estimate_tail_ls(x): | |
# Least Squares estimation of the tail index | |
n = len(x) | |
if n < 3: | |
return None | |
x_sorted = np.sort(x) | |
logx = np.log(x_sorted) | |
logp = np.log(np.arange(n, 0, -1)) - np.log(n + 1) | |
A = np.vstack([logx, np.ones_like(logx)]).T | |
try: | |
slope, _ = np.linalg.lstsq(A, logp, rcond=None)[0] | |
alpha = -slope | |
return alpha if alpha > 0 else None | |
except: | |
return None | |
def estimate_tail_ls_and_prepare_data_for_plot(x, K, count=20): | |
# Ignore, nothing interesting, just some data preparation for the plot | |
x_sorted = np.sort(x) | |
ranks = np.linspace(10, K, count).astype(int) | |
df_estimates = np.full_like(ranks, np.nan, dtype=float) | |
for i, k in enumerate(ranks): | |
tail = x_sorted[-k:] | |
df = estimate_tail_ls(tail) | |
if df is not None: | |
df_estimates[i] = df | |
return ranks, df_estimates | |
# # Parameters | |
N = 30 | |
n_obs = 20_000 | |
q = 0.97 | |
plt.figure(figsize=(8, 6)) | |
for _ in range(N): | |
x = student_sample(4, n_obs) | |
# Hill estimates | |
k = int((1 - q) * len(x)) | |
tail = np.sort(x)[-k:][::-1] | |
hill = estimate_hill(tail) | |
plt.plot(hill, linewidth=1, alpha=0.5, color='black') | |
# Exact t MLE | |
mle_df = fit_mle_student(x) | |
plt.axhline(mle_df, linewidth=1, alpha=0.5, color='red') | |
# GPD estimates across multiple order-statistic thresholds | |
ranks, dfs = estimate_gpd_mle_and_prepare_data_for_plot(x, K=k, count=20) | |
plt.plot(ranks, dfs, marker='o', linestyle='-', markersize=3, alpha=0.5, color='blue', linewidth=1, ) | |
# LS estimates across multiple order-statistic thresholds | |
ranks, dfs = estimate_tail_ls_and_prepare_data_for_plot(x, K=k, count=20) | |
plt.plot(ranks, dfs, marker='o', linestyle='-', markersize=3, alpha=0.5, color='green', linewidth=1) | |
plt.ylim(2, 6) | |
plt.xlabel('Order-stats rank k') | |
plt.ylabel('Tail index estimate') | |
plt.title(f'Hill (black), MLE (red), GPD (blue), LS (green) over {N} samples. True ν = 4') | |
plt.show() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment