Last active
June 12, 2025 17:20
-
-
Save arraytools/d383f72a243a2e322004904ae6903610 to your computer and use it in GitHub Desktop.
Compare K-S test statistic calculated by hand (through ecdf() ) and by R (through ks.test() )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1. Generate sample data | |
set.seed(123) | |
sample_data <- rnorm(100, mean = 0, sd = 1) # A sample from a standard normal distribution | |
# 2. Define the theoretical CDF (for standard normal distribution) | |
# In R, pnorm() is the CDF for the normal distribution | |
theoretical_cdf <- function(x) pnorm(x, mean = 0, sd = 1) | |
# 3. Compute the ECDF of your sample | |
ecdf_sample <- ecdf(sample_data) | |
# 4. Evaluate ECDF and theoretical CDF at relevant points | |
# The KS statistic is the maximum difference, which occurs at the data points themselves | |
# or just below them. We need to check both F_n(x_i) and F_n(x_i-1) approx. | |
# A common approach is to sort the data and evaluate at these points. | |
sorted_data <- sort(sample_data) | |
n <- length(sorted_data) | |
# Values of the ECDF at the sorted data points | |
Fn_values <- ecdf_sample(sorted_data) | |
# Values of the theoretical CDF at the sorted data points | |
F0_values <- theoretical_cdf(sorted_data) | |
# Calculate the differences at each data point | |
diff_at_points <- abs(Fn_values - F0_values) | |
# Also consider the step just before each point. | |
# F_n(x_i^-) = (i-1)/n | |
# F_0(x_i) is the theoretical CDF at that point. | |
# The `ecdf` function in R handles this automatically when you evaluate it, | |
# but for a manual calculation of the supremum, we need to consider both sides of the jump. | |
# For simplicity, we can calculate two types of differences: | |
# D+ = max_i (F_n(x_i) - F_0(x_i)) | |
# D- = max_i (F_0(x_i) - F_n(x_i-1)) | |
# The KS statistic is max(D+, D-) | |
# ECDF values at the points x_i (proportion of data <= x_i) | |
Fn_at_xi <- (1:n) / n | |
# ECDF values just before the points x_i (proportion of data < x_i) | |
Fn_before_xi <- (0:(n-1)) / n | |
# The two potential differences to check for the supremum: | |
# 1. |F_n(x_i) - F_0(x_i)| | |
diff1 <- abs(Fn_at_xi - F0_values) | |
# 2. |F_n(x_i-1) - F_0(x_i)| | |
# This needs careful handling for the first point (i=1), where F_n(x_0) would be 0. | |
# The `ecdf_sample(x)` evaluates to the value of the step function at x. | |
# The max difference can occur just before or at an observation. | |
# The standard definition of D_n takes the supremum over all x. | |
# For sorted data x_1 <= x_2 <= ... <= x_n, the supremum will always occur at one of the x_i. | |
# Specifically, we need to check: | |
# D_n = max(max_{i} (i/n - F_0(x_i)), max_{i} (F_0(x_i) - (i-1)/n)) | |
D_plus <- max(Fn_at_xi - F0_values) | |
D_minus <- max(F0_values - Fn_before_xi) | |
ks_statistic_manual_one_sample <- max(D_plus, D_minus) | |
cat("Manually calculated One-Sample KS Statistic:", ks_statistic_manual_one_sample, "\n") | |
# Manually calculated One-Sample KS Statistic: 0.0930341 | |
# Compare with R's built-in ks.test() | |
ks_result <- ks.test(sample_data, "pnorm", mean = 0, sd = 1) | |
cat("R's ks.test() One-Sample KS Statistic:", ks_result$statistic, "\n") | |
# R's ks.test() One-Sample KS Statistic: 0.0930341 | |
# Slight differences might occur due to how floating-point numbers are handled | |
# and the exact evaluation points for the supremum, but they should be very close. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1. Generate two sample datasets | |
set.seed(456) | |
sample1 <- rnorm(50, mean = 0, sd = 1) | |
sample2 <- rnorm(60, mean = 0.2, sd = 1) # Slightly shifted mean for potential difference | |
# 2. Compute ECDFs for both samples | |
ecdf1 <- ecdf(sample1) | |
ecdf2 <- ecdf(sample2) | |
# 3. Combine and sort all unique data points from both samples | |
# The supremum can only occur at values present in either sample. | |
all_points <- sort(unique(c(sample1, sample2))) | |
# 4. Evaluate both ECDFs at these combined sorted points | |
Fn_values_at_all_points <- ecdf1(all_points) | |
Gn_values_at_all_points <- ecdf2(all_points) | |
# 5. Calculate the absolute differences | |
differences <- abs(Fn_values_at_all_points - Gn_values_at_all_points) | |
# 6. Find the maximum of these differences | |
ks_statistic_manual_two_sample <- max(differences) | |
cat("Manually calculated Two-Sample KS Statistic:", ks_statistic_manual_two_sample, "\n") | |
# Manually calculated Two-Sample KS Statistic: 0.09666667 | |
# Compare with R's built-in ks.test() | |
ks_result_two_sample <- ks.test(sample1, sample2) | |
cat("R's ks.test() Two-Sample KS Statistic:", ks_result_two_sample$statistic, "\n") | |
# R's ks.test() Two-Sample KS Statistic: 0.09666667 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- 1. Set up the data and parameters --- | |
set.seed(789) # For reproducibility | |
# Sample 1: From a standard normal distribution | |
sample1 <- rnorm(50, mean = 0, sd = 1) | |
# Sample 2: From a slightly different normal distribution (null hypothesis is likely false) | |
# This will likely result in a small p-value from both methods | |
sample2 <- rnorm(60, mean = 0.5, sd = 1) | |
# Number of permutations | |
num_permutations <- 10000 | |
# --- 2. Calculate the observed KS statistic --- | |
# This is the statistic for our original, unpermuted data. | |
observed_ks_result <- ks.test(sample1, sample2) | |
observed_D_statistic <- observed_ks_result$statistic | |
cat("--- Observed Data --- \n") | |
cat("Observed KS D-statistic:", observed_D_statistic, "\n") | |
# Observed KS D-statistic: 0.3466667 | |
cat("p-value from R's ks.test():", observed_ks_result$p.value, "\n\n") | |
# p-value from R's ks.test(): 0.002056568 | |
# --- 3. Perform the Permutation Test --- | |
# Combine the two samples into one pool | |
pooled_data <- c(sample1, sample2) | |
n1 <- length(sample1) | |
n2 <- length(sample2) | |
total_n <- n1 + n2 | |
# Vector to store KS statistics from each permutation | |
permutation_D_statistics <- numeric(num_permutations) | |
cat("--- Running Permutation Test --- \n") | |
for (i in 1:num_permutations) { | |
# Shuffle the pooled data | |
permuted_data <- sample(pooled_data) | |
# Split the permuted data back into two 'samples' of the original sizes | |
permuted_sample1 <- permuted_data[1:n1] | |
permuted_sample2 <- permuted_data[(n1 + 1):total_n] | |
# Calculate the KS statistic for the permuted samples | |
# We use ks.test() directly here as requested | |
perm_ks_result <- ks.test(permuted_sample1, permuted_sample2) | |
permutation_D_statistics[i] <- perm_ks_result$statistic | |
} | |
# --- 4. Calculate the permutation p-value --- | |
# The p-value is the proportion of permuted D-statistics that are | |
# as extreme as or more extreme than the observed D-statistic. | |
# For KS test, 'more extreme' means 'greater than or equal to'. | |
permutation_p_value <- sum(permutation_D_statistics >= observed_D_statistic) / num_permutations | |
cat("\n--- Permutation Test Results --- \n") | |
cat("Number of permutations:", num_permutations, "\n") | |
cat("Permutation p-value:", permutation_p_value, "\n") | |
# Permutation p-value: 0.0016 | |
# --- 5. Optional: Visualize the permutation distribution --- | |
hist(permutation_D_statistics, | |
main = "Permutation Distribution of KS D-Statistic", | |
xlab = "KS D-Statistic", | |
freq = FALSE, # Use frequency = FALSE for density histogram | |
col = "lightblue", | |
border = "white") | |
abline(v = observed_D_statistic, col = "red", lty = 2, lwd = 2) | |
text(x = observed_D_statistic, y = max(hist(permutation_D_statistics, plot = FALSE)$density) * 0.9, | |
labels = paste("Observed D =", round(observed_D_statistic, 3)), | |
col = "red", pos = 4) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment