Skip to content

Instantly share code, notes, and snippets.

@al6x
Created July 31, 2025 09:09
Show Gist options
  • Save al6x/11e66ab92c525f2ef2c1510e6ac7a3f7 to your computer and use it in GitHub Desktop.
Save al6x/11e66ab92c525f2ef2c1510e6ac7a3f7 to your computer and use it in GitHub Desktop.
Tail Estimators
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import t
from scipy.optimize import minimize_scalar, minimize
def student_sample(df, n, seed=None):
rng = np.random.default_rng(seed)
return rng.standard_t(df, size=n)
def estimate_hill(x):
x = np.sort(x)[::-1]
logx = np.log(x)
sumk = np.cumsum(logx)
k = np.arange(1, len(x)+1)
denom = (sumk/k) - logx
return np.array([1/d if d != 0 else np.nan for d in denom])
def fit_mle_student(x):
def nll(df):
return np.inf if df <= 2 else -np.sum(t.logpdf(x, df, loc=0, scale=1))
res = minimize_scalar(nll, bounds=(2.01, 100), method='bounded')
if not res.success:
raise RuntimeError("MLE fit failed")
return res.x
def estimate_gpd_mle(x, threshold, init=(1/2.1, 1.0), min_exceed=5):
# Generalized Pareto Distribution (GPD) MLE estimation
y = x[x > threshold] - threshold
if y.size < min_exceed: return None
def nll(params):
ξ, β = params
if β <= 0: return np.inf
base = 1 + ξ * y / β
if np.any(base <= 0): return np.inf
pdf = (1 / β) * base ** -(1/ξ + 1)
if np.any(pdf <= 0) or np.any(~np.isfinite(pdf)): return np.inf
return -np.sum(np.log(pdf))
res = minimize(nll, x0=init, bounds=((1/20, 1/2), (1e-6, 1000)))
if not res.success: return None
ξ, β = res.x
return (1/ξ, β) # return df = 1/xi
def estimate_gpd_mle_and_prepare_data_for_plot(x, K, count=20, init=(0.2, 1.0)):
# Ignore, nothing interesting, just some data preparation for the plot
x_sorted = np.sort(x)
ranks = np.linspace(10, K, count).astype(int)
df_estimates = np.full_like(ranks, np.nan, dtype=float)
current_init = init
for i, k in enumerate(ranks):
threshold = x_sorted[-k]
res = estimate_gpd_mle(x, threshold, init=current_init)
if res is None: continue
df, beta = res
df_estimates[i] = df
# warm start next with xi,beta
current_init = (1/df, beta)
return ranks, df_estimates
def estimate_tail_ls(x):
# Least Squares estimation of the tail index
n = len(x)
if n < 3:
return None
x_sorted = np.sort(x)
logx = np.log(x_sorted)
logp = np.log(np.arange(n, 0, -1)) - np.log(n + 1)
A = np.vstack([logx, np.ones_like(logx)]).T
try:
slope, _ = np.linalg.lstsq(A, logp, rcond=None)[0]
alpha = -slope
return alpha if alpha > 0 else None
except:
return None
def estimate_tail_ls_and_prepare_data_for_plot(x, K, count=20):
# Ignore, nothing interesting, just some data preparation for the plot
x_sorted = np.sort(x)
ranks = np.linspace(10, K, count).astype(int)
df_estimates = np.full_like(ranks, np.nan, dtype=float)
for i, k in enumerate(ranks):
tail = x_sorted[-k:]
df = estimate_tail_ls(tail)
if df is not None:
df_estimates[i] = df
return ranks, df_estimates
# # Parameters
N = 30
n_obs = 20_000
q = 0.97
plt.figure(figsize=(8, 6))
for _ in range(N):
x = student_sample(4, n_obs)
# Hill estimates
k = int((1 - q) * len(x))
tail = np.sort(x)[-k:][::-1]
hill = estimate_hill(tail)
plt.plot(hill, linewidth=1, alpha=0.5, color='black')
# Exact t MLE
mle_df = fit_mle_student(x)
plt.axhline(mle_df, linewidth=1, alpha=0.5, color='red')
# GPD estimates across multiple order-statistic thresholds
ranks, dfs = estimate_gpd_mle_and_prepare_data_for_plot(x, K=k, count=20)
plt.plot(ranks, dfs, marker='o', linestyle='-', markersize=3, alpha=0.5, color='blue', linewidth=1, )
# LS estimates across multiple order-statistic thresholds
ranks, dfs = estimate_tail_ls_and_prepare_data_for_plot(x, K=k, count=20)
plt.plot(ranks, dfs, marker='o', linestyle='-', markersize=3, alpha=0.5, color='green', linewidth=1)
plt.ylim(2, 6)
plt.xlabel('Order-stats rank k')
plt.ylabel('Tail index estimate')
plt.title(f'Hill (black), MLE (red), GPD (blue), LS (green) over {N} samples. True ν = 4')
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment