al6x · July 31, 2025 09:09
diff --git a/tail-estimators.py b/tail-estimators.py
 import numpy as np
 import matplotlib.pyplot as plt
 from scipy.stats import t
 from scipy.optimize import minimize_scalar, minimize

 def student_sample(df, n, seed=None):
  rng = np.random.default_rng(seed)
  return rng.standard_t(df, size=n)

 def estimate_hill(x):
  x = np.sort(x)[::-1]
  logx = np.log(x)
  sumk = np.cumsum(logx)
  k = np.arange(1, len(x)+1)
  denom = (sumk/k) - logx
  return np.array([1/d if d != 0 else np.nan for d in denom])

 def fit_mle_student(x):
  def nll(df):
    return np.inf if df <= 2 else -np.sum(t.logpdf(x, df, loc=0, scale=1))
  res = minimize_scalar(nll, bounds=(2.01, 100), method='bounded')
  if not res.success:
    raise RuntimeError("MLE fit failed")
  return res.x

 def estimate_gpd_mle(x, threshold, init=(1/2.1, 1.0), min_exceed=5):
  # Generalized Pareto Distribution (GPD) MLE estimation
  y = x[x > threshold] - threshold
  if y.size < min_exceed: return None

  def nll(params):
    ξ, β = params
    if β <= 0: return np.inf
    base = 1 + ξ * y / β
    if np.any(base <= 0): return np.inf
    pdf = (1 / β) * base ** -(1/ξ + 1)
    if np.any(pdf <= 0) or np.any(~np.isfinite(pdf)): return np.inf
    return -np.sum(np.log(pdf))
  res = minimize(nll, x0=init, bounds=((1/20, 1/2), (1e-6, 1000)))
  if not res.success: return None
  ξ, β = res.x
  return (1/ξ, β)  # return df = 1/xi



 def estimate_gpd_mle_and_prepare_data_for_plot(x, K, count=20, init=(0.2, 1.0)):
  # Ignore, nothing interesting, just some data preparation for the plot
  x_sorted = np.sort(x)
  ranks = np.linspace(10, K, count).astype(int)
  df_estimates = np.full_like(ranks, np.nan, dtype=float)
  current_init = init

  for i, k in enumerate(ranks):
    threshold = x_sorted[-k]
    res = estimate_gpd_mle(x, threshold, init=current_init)
    if res is None: continue
    df, beta = res
    df_estimates[i] = df
    # warm start next with xi,beta
    current_init = (1/df, beta)

  return ranks, df_estimates

 def estimate_tail_ls(x):
  # Least Squares estimation of the tail index
  n = len(x)
  if n < 3:
    return None
  x_sorted = np.sort(x)
  logx = np.log(x_sorted)
  logp = np.log(np.arange(n, 0, -1)) - np.log(n + 1)
  A = np.vstack([logx, np.ones_like(logx)]).T
  try:
    slope, _ = np.linalg.lstsq(A, logp, rcond=None)[0]
    alpha = -slope
    return alpha if alpha > 0 else None
  except:
    return None

 def estimate_tail_ls_and_prepare_data_for_plot(x, K, count=20):
  # Ignore, nothing interesting, just some data preparation for the plot
  x_sorted = np.sort(x)
  ranks = np.linspace(10, K, count).astype(int)
  df_estimates = np.full_like(ranks, np.nan, dtype=float)

  for i, k in enumerate(ranks):
    tail = x_sorted[-k:]
    df = estimate_tail_ls(tail)
    if df is not None:
      df_estimates[i] = df

  return ranks, df_estimates

 # # Parameters
 N = 30
 n_obs = 20_000
 q = 0.97

 plt.figure(figsize=(8, 6))
 for _ in range(N):
  x = student_sample(4, n_obs)

  # Hill estimates
  k = int((1 - q) * len(x))
  tail = np.sort(x)[-k:][::-1]
  hill = estimate_hill(tail)
  plt.plot(hill, linewidth=1, alpha=0.5, color='black')

  # Exact t MLE
  mle_df = fit_mle_student(x)
  plt.axhline(mle_df, linewidth=1, alpha=0.5, color='red')

  # GPD estimates across multiple order-statistic thresholds
  ranks, dfs = estimate_gpd_mle_and_prepare_data_for_plot(x, K=k, count=20)
  plt.plot(ranks, dfs, marker='o', linestyle='-', markersize=3, alpha=0.5, color='blue', linewidth=1, )

  # LS estimates across multiple order-statistic thresholds
  ranks, dfs = estimate_tail_ls_and_prepare_data_for_plot(x, K=k, count=20)
  plt.plot(ranks, dfs, marker='o', linestyle='-', markersize=3, alpha=0.5, color='green', linewidth=1)

 plt.ylim(2, 6)
 plt.xlabel('Order-stats rank k')
 plt.ylabel('Tail index estimate')
 plt.title(f'Hill (black), MLE (red), GPD (blue), LS (green) over {N} samples. True ν = 4')
 plt.show()
	import numpy as np
	import matplotlib.pyplot as plt
	from scipy.stats import t
	from scipy.optimize import minimize_scalar, minimize

	def student_sample(df, n, seed=None):
	rng = np.random.default_rng(seed)
	return rng.standard_t(df, size=n)

	def estimate_hill(x):
	x = np.sort(x)[::-1]
	logx = np.log(x)
	sumk = np.cumsum(logx)
	k = np.arange(1, len(x)+1)
	denom = (sumk/k) - logx
	return np.array([1/d if d != 0 else np.nan for d in denom])

	def fit_mle_student(x):
	def nll(df):
	return np.inf if df <= 2 else -np.sum(t.logpdf(x, df, loc=0, scale=1))
	res = minimize_scalar(nll, bounds=(2.01, 100), method='bounded')
	if not res.success:
	raise RuntimeError("MLE fit failed")
	return res.x

	def estimate_gpd_mle(x, threshold, init=(1/2.1, 1.0), min_exceed=5):
	# Generalized Pareto Distribution (GPD) MLE estimation
	y = x[x > threshold] - threshold
	if y.size < min_exceed: return None

	def nll(params):
	ξ, β = params
	if β <= 0: return np.inf
	base = 1 + ξ * y / β
	if np.any(base <= 0): return np.inf
	pdf = (1 / β) * base ** -(1/ξ + 1)
	if np.any(pdf <= 0) or np.any(~np.isfinite(pdf)): return np.inf
	return -np.sum(np.log(pdf))
	res = minimize(nll, x0=init, bounds=((1/20, 1/2), (1e-6, 1000)))
	if not res.success: return None
	ξ, β = res.x
	return (1/ξ, β) # return df = 1/xi



	def estimate_gpd_mle_and_prepare_data_for_plot(x, K, count=20, init=(0.2, 1.0)):
	# Ignore, nothing interesting, just some data preparation for the plot
	x_sorted = np.sort(x)
	ranks = np.linspace(10, K, count).astype(int)
	df_estimates = np.full_like(ranks, np.nan, dtype=float)
	current_init = init

	for i, k in enumerate(ranks):
	threshold = x_sorted[-k]
	res = estimate_gpd_mle(x, threshold, init=current_init)
	if res is None: continue
	df, beta = res
	df_estimates[i] = df
	# warm start next with xi,beta
	current_init = (1/df, beta)

	return ranks, df_estimates

	def estimate_tail_ls(x):
	# Least Squares estimation of the tail index
	n = len(x)
	if n < 3:
	return None
	x_sorted = np.sort(x)
	logx = np.log(x_sorted)
	logp = np.log(np.arange(n, 0, -1)) - np.log(n + 1)
	A = np.vstack([logx, np.ones_like(logx)]).T
	try:
	slope, _ = np.linalg.lstsq(A, logp, rcond=None)[0]
	alpha = -slope
	return alpha if alpha > 0 else None
	except:
	return None

	def estimate_tail_ls_and_prepare_data_for_plot(x, K, count=20):
	# Ignore, nothing interesting, just some data preparation for the plot
	x_sorted = np.sort(x)
	ranks = np.linspace(10, K, count).astype(int)
	df_estimates = np.full_like(ranks, np.nan, dtype=float)

	for i, k in enumerate(ranks):
	tail = x_sorted[-k:]
	df = estimate_tail_ls(tail)
	if df is not None:
	df_estimates[i] = df

	return ranks, df_estimates

	# # Parameters
	N = 30
	n_obs = 20_000
	q = 0.97

	plt.figure(figsize=(8, 6))
	for _ in range(N):
	x = student_sample(4, n_obs)

	# Hill estimates
	k = int((1 - q) * len(x))
	tail = np.sort(x)[-k:][::-1]
	hill = estimate_hill(tail)
	plt.plot(hill, linewidth=1, alpha=0.5, color='black')

	# Exact t MLE
	mle_df = fit_mle_student(x)
	plt.axhline(mle_df, linewidth=1, alpha=0.5, color='red')

	# GPD estimates across multiple order-statistic thresholds
	ranks, dfs = estimate_gpd_mle_and_prepare_data_for_plot(x, K=k, count=20)
	plt.plot(ranks, dfs, marker='o', linestyle='-', markersize=3, alpha=0.5, color='blue', linewidth=1, )

	# LS estimates across multiple order-statistic thresholds
	ranks, dfs = estimate_tail_ls_and_prepare_data_for_plot(x, K=k, count=20)
	plt.plot(ranks, dfs, marker='o', linestyle='-', markersize=3, alpha=0.5, color='green', linewidth=1)

	plt.ylim(2, 6)
	plt.xlabel('Order-stats rank k')
	plt.ylabel('Tail index estimate')
	plt.title(f'Hill (black), MLE (red), GPD (blue), LS (green) over {N} samples. True ν = 4')
	plt.show()