Compare the K-S test statistic calculated by hand (through ecdf()) and by R (through ks.test())
# 1. Generate sample data
set.seed(123)
sample_data <- rnorm(100, mean = 0, sd = 1) # A sample from a standard normal distribution
# 2. Define the theoretical CDF (for standard normal distribution)
# In R, pnorm() is the CDF for the normal distribution
theoretical_cdf <- function(x) pnorm(x, mean = 0, sd = 1)
# 3. Compute the ECDF of your sample
ecdf_sample <- ecdf(sample_data)
# 4. Evaluate ECDF and theoretical CDF at relevant points
# The KS statistic is the supremum of |F_n(x) - F_0(x)|, which is attained at the
# data points themselves or just to their left, so we need to check both F_n(x_i)
# and the left-hand limit F_n(x_i^-).
# A common approach is to sort the data and evaluate at these points.
sorted_data <- sort(sample_data)
n <- length(sorted_data)
# Values of the ECDF at the sorted data points
Fn_values <- ecdf_sample(sorted_data)
# Values of the theoretical CDF at the sorted data points
F0_values <- theoretical_cdf(sorted_data)
# Calculate the differences at each data point
diff_at_points <- abs(Fn_values - F0_values)
# Also consider the step just before each point:
# F_n(x_i^-) = (i-1)/n, while F_0(x_i) is the theoretical CDF at that point.
# Note that `ecdf()` returns the right-continuous value F_n(x_i) = i/n at a jump,
# so a manual calculation of the supremum has to consider both sides of the jump.
# This gives two one-sided statistics:
# D+ = max_i (F_n(x_i) - F_0(x_i)) = max_i (i/n - F_0(x_i))
# D- = max_i (F_0(x_i) - F_n(x_i^-)) = max_i (F_0(x_i) - (i-1)/n)
# The KS statistic is D_n = max(D+, D-).
# ECDF values at the points x_i (proportion of data <= x_i)
Fn_at_xi <- (1:n) / n
# ECDF values just before the points x_i (proportion of data < x_i)
Fn_before_xi <- (0:(n-1)) / n
# The two potential differences to check for the supremum:
# 1. |F_n(x_i) - F_0(x_i)|
diff1 <- abs(Fn_at_xi - F0_values)
# 2. |F_n(x_i^-) - F_0(x_i)|
# This needs care for the first point (i = 1), where F_n(x_1^-) = 0.
# `ecdf_sample(x)` evaluates the (right-continuous) step function at x.
# The maximum difference can occur just before or at an observation, and the
# standard definition of D_n takes the supremum over all x.
# For sorted data x_1 <= x_2 <= ... <= x_n, the supremum is attained at one of
# the x_i or at its left limit, so it suffices to check:
# D_n = max( max_i (i/n - F_0(x_i)), max_i (F_0(x_i) - (i-1)/n) )
D_plus <- max(Fn_at_xi - F0_values)
D_minus <- max(F0_values - Fn_before_xi)
ks_statistic_manual_one_sample <- max(D_plus, D_minus)
cat("Manually calculated One-Sample KS Statistic:", ks_statistic_manual_one_sample, "\n")
# Manually calculated One-Sample KS Statistic: 0.0930341
# Compare with R's built-in ks.test()
ks_result <- ks.test(sample_data, "pnorm", mean = 0, sd = 1)
cat("R's ks.test() One-Sample KS Statistic:", ks_result$statistic, "\n")
# R's ks.test() One-Sample KS Statistic: 0.0930341
# Slight differences might occur due to how floating-point numbers are handled
# and the exact evaluation points for the supremum, but they should be very close.
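# An optional sanity check (a minimal addition): confirm the manual statistic
# matches ks.test()'s up to floating-point tolerance. unname() drops the "D"
# name that ks.test() attaches to its statistic.
all.equal(ks_statistic_manual_one_sample, unname(ks_result$statistic))
# Expect TRUE (or a report of the tiny numerical difference, if any).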
# 1. Generate two sample datasets
set.seed(456)
sample1 <- rnorm(50, mean = 0, sd = 1)
sample2 <- rnorm(60, mean = 0.2, sd = 1) # Slightly shifted mean for potential difference
# 2. Compute ECDFs for both samples
ecdf1 <- ecdf(sample1)
ecdf2 <- ecdf(sample2)
# 3. Combine and sort all unique data points from both samples
# The supremum can only occur at values present in either sample.
all_points <- sort(unique(c(sample1, sample2)))
# 4. Evaluate both ECDFs at these combined sorted points
Fn_values_at_all_points <- ecdf1(all_points)
Gn_values_at_all_points <- ecdf2(all_points)
# 5. Calculate the absolute differences
differences <- abs(Fn_values_at_all_points - Gn_values_at_all_points)
# 6. Find the maximum of these differences
ks_statistic_manual_two_sample <- max(differences)
cat("Manually calculated Two-Sample KS Statistic:", ks_statistic_manual_two_sample, "\n")
# Manually calculated Two-Sample KS Statistic: 0.09666667
# Compare with R's built-in ks.test()
ks_result_two_sample <- ks.test(sample1, sample2)
cat("R's ks.test() Two-Sample KS Statistic:", ks_result_two_sample$statistic, "\n")
# R's ks.test() Two-Sample KS Statistic: 0.09666667
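# Optional illustration (a minimal sketch): plot both ECDFs and mark where the
# largest vertical gap occurs; which.max() picks the first point attaining it.
plot(ecdf1, col = "blue", main = "ECDFs of sample1 and sample2", xlab = "x")
plot(ecdf2, col = "darkorange", add = TRUE)
abline(v = all_points[which.max(differences)], col = "red", lty = 2)
legend("bottomright", legend = c("sample1", "sample2"),
       col = c("blue", "darkorange"), lty = 1)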
# --- 1. Set up the data and parameters ---
set.seed(789) # For reproducibility
# Sample 1: From a standard normal distribution
sample1 <- rnorm(50, mean = 0, sd = 1)
# Sample 2: From a slightly different normal distribution (null hypothesis is likely false)
# This will likely result in a small p-value from both methods
sample2 <- rnorm(60, mean = 0.5, sd = 1)
# Number of permutations
num_permutations <- 10000
# --- 2. Calculate the observed KS statistic ---
# This is the statistic for our original, unpermuted data.
observed_ks_result <- ks.test(sample1, sample2)
observed_D_statistic <- observed_ks_result$statistic
cat("--- Observed Data --- \n")
cat("Observed KS D-statistic:", observed_D_statistic, "\n")
# Observed KS D-statistic: 0.3466667
cat("p-value from R's ks.test():", observed_ks_result$p.value, "\n\n")
# p-value from R's ks.test(): 0.002056568
# --- 3. Perform the Permutation Test ---
# Combine the two samples into one pool
pooled_data <- c(sample1, sample2)
n1 <- length(sample1)
n2 <- length(sample2)
total_n <- n1 + n2
# Vector to store KS statistics from each permutation
permutation_D_statistics <- numeric(num_permutations)
cat("--- Running Permutation Test --- \n")
for (i in 1:num_permutations) {
  # Shuffle the pooled data
  permuted_data <- sample(pooled_data)
  # Split the permuted data back into two 'samples' of the original sizes
  permuted_sample1 <- permuted_data[1:n1]
  permuted_sample2 <- permuted_data[(n1 + 1):total_n]
  # Calculate the KS statistic for the permuted samples, using ks.test() directly
  perm_ks_result <- ks.test(permuted_sample1, permuted_sample2)
  permutation_D_statistics[i] <- perm_ks_result$statistic
}
# --- 4. Calculate the permutation p-value ---
# The p-value is the proportion of permuted D-statistics that are
# as extreme as or more extreme than the observed D-statistic.
# For the KS test, 'more extreme' means 'greater than or equal to'.
permutation_p_value <- sum(permutation_D_statistics >= observed_D_statistic) / num_permutations
cat("\n--- Permutation Test Results --- \n")
cat("Number of permutations:", num_permutations, "\n")
cat("Permutation p-value:", permutation_p_value, "\n")
# Permutation p-value: 0.0016
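# A commonly used refinement (optional, following the add-one convention for
# permutation tests) counts the observed statistic as one of the permutations,
# which keeps the estimated p-value away from exactly zero:
permutation_p_value_plus1 <-
  (sum(permutation_D_statistics >= observed_D_statistic) + 1) / (num_permutations + 1)
cat("Permutation p-value (add-one convention):", permutation_p_value_plus1, "\n")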
# --- 5. Optional: Visualize the permutation distribution ---
hist(permutation_D_statistics,
     main = "Permutation Distribution of KS D-Statistic",
     xlab = "KS D-Statistic",
     freq = FALSE, # Use freq = FALSE for a density histogram
     col = "lightblue",
     border = "white")
abline(v = observed_D_statistic, col = "red", lty = 2, lwd = 2)
text(x = observed_D_statistic,
     y = max(hist(permutation_D_statistics, plot = FALSE)$density) * 0.9,
     labels = paste("Observed D =", round(observed_D_statistic, 3)),
     col = "red", pos = 4)