arraytools · June 12, 2025 17:20
diff --git a/ks_1sample.R b/ks_1sample.R
 # 1. Generate sample data
 set.seed(123)
 sample_data <- rnorm(100, mean = 0, sd = 1) # A sample from a standard normal distribution

 # 2. Define the theoretical CDF (for standard normal distribution)
 # In R, pnorm() is the CDF for the normal distribution
 theoretical_cdf <- function(x) pnorm(x, mean = 0, sd = 1)

 # 3. Compute the ECDF of your sample
 ecdf_sample <- ecdf(sample_data)

 # 4. Evaluate ECDF and theoretical CDF at relevant points
 # The KS statistic is the maximum difference, which occurs at the data points themselves
 # or just below them. We need to check both F_n(x_i) and F_n(x_i-1) approx.
 # A common approach is to sort the data and evaluate at these points.
 sorted_data <- sort(sample_data)
 n <- length(sorted_data)

 # Values of the ECDF at the sorted data points
 Fn_values <- ecdf_sample(sorted_data)

 # Values of the theoretical CDF at the sorted data points
 F0_values <- theoretical_cdf(sorted_data)

 # Calculate the differences at each data point
 diff_at_points <- abs(Fn_values - F0_values)

 # Also consider the step just before each point.
 # F_n(x_i^-) = (i-1)/n
 # F_0(x_i) is the theoretical CDF at that point.
 # The `ecdf` function in R handles this automatically when you evaluate it,
 # but for a manual calculation of the supremum, we need to consider both sides of the jump.
 # For simplicity, we can calculate two types of differences:
 # D+ = max_i (F_n(x_i) - F_0(x_i))
 # D- = max_i (F_0(x_i) - F_n(x_i-1))
 # The KS statistic is max(D+, D-)

 # ECDF values at the points x_i (proportion of data <= x_i)
 Fn_at_xi <- (1:n) / n

 # ECDF values just before the points x_i (proportion of data < x_i)
 Fn_before_xi <- (0:(n-1)) / n

 # The two potential differences to check for the supremum:
 # 1. |F_n(x_i) - F_0(x_i)|
 diff1 <- abs(Fn_at_xi - F0_values)

 # 2. |F_n(x_i-1) - F_0(x_i)|
 # This needs careful handling for the first point (i=1), where F_n(x_0) would be 0.
 # The `ecdf_sample(x)` evaluates to the value of the step function at x.
 # The max difference can occur just before or at an observation.
 # The standard definition of D_n takes the supremum over all x.
 # For sorted data x_1 <= x_2 <= ... <= x_n, the supremum will always occur at one of the x_i.
 # Specifically, we need to check:
 # D_n = max(max_{i} (i/n - F_0(x_i)), max_{i} (F_0(x_i) - (i-1)/n))

 D_plus <- max(Fn_at_xi - F0_values)
 D_minus <- max(F0_values - Fn_before_xi)

 ks_statistic_manual_one_sample <- max(D_plus, D_minus)
 cat("Manually calculated One-Sample KS Statistic:", ks_statistic_manual_one_sample, "\n")
 # Manually calculated One-Sample KS Statistic: 0.0930341 

 # Compare with R's built-in ks.test()
 ks_result <- ks.test(sample_data, "pnorm", mean = 0, sd = 1)
 cat("R's ks.test() One-Sample KS Statistic:", ks_result$statistic, "\n")
 # R's ks.test() One-Sample KS Statistic: 0.0930341 

 # Slight differences might occur due to how floating-point numbers are handled
 # and the exact evaluation points for the supremum, but they should be very close.
diff --git a/ks_2sample.R b/ks_2sample.R
 # 1. Generate two sample datasets
 set.seed(456)
 sample1 <- rnorm(50, mean = 0, sd = 1)
 sample2 <- rnorm(60, mean = 0.2, sd = 1) # Slightly shifted mean for potential difference

 # 2. Compute ECDFs for both samples
 ecdf1 <- ecdf(sample1)
 ecdf2 <- ecdf(sample2)

 # 3. Combine and sort all unique data points from both samples
 # The supremum can only occur at values present in either sample.
 all_points <- sort(unique(c(sample1, sample2)))

 # 4. Evaluate both ECDFs at these combined sorted points
 Fn_values_at_all_points <- ecdf1(all_points)
 Gn_values_at_all_points <- ecdf2(all_points)

 # 5. Calculate the absolute differences
 differences <- abs(Fn_values_at_all_points - Gn_values_at_all_points)

 # 6. Find the maximum of these differences
 ks_statistic_manual_two_sample <- max(differences)

 cat("Manually calculated Two-Sample KS Statistic:", ks_statistic_manual_two_sample, "\n")
 # Manually calculated Two-Sample KS Statistic: 0.09666667 

 # Compare with R's built-in ks.test()
 ks_result_two_sample <- ks.test(sample1, sample2)
 cat("R's ks.test() Two-Sample KS Statistic:", ks_result_two_sample$statistic, "\n")
 # R's ks.test() Two-Sample KS Statistic: 0.09666667 
diff --git a/ks_2sample_permutation.R b/ks_2sample_permutation.R
 # --- 1. Set up the data and parameters ---
 set.seed(789) # For reproducibility

 # Sample 1: From a standard normal distribution
 sample1 <- rnorm(50, mean = 0, sd = 1)

 # Sample 2: From a slightly different normal distribution (null hypothesis is likely false)
 # This will likely result in a small p-value from both methods
 sample2 <- rnorm(60, mean = 0.5, sd = 1)

 # Number of permutations
 num_permutations <- 10000

 # --- 2. Calculate the observed KS statistic ---
 # This is the statistic for our original, unpermuted data.
 observed_ks_result <- ks.test(sample1, sample2)
 observed_D_statistic <- observed_ks_result$statistic

 cat("--- Observed Data --- \n")
 cat("Observed KS D-statistic:", observed_D_statistic, "\n")
 # Observed KS D-statistic: 0.3466667 
 cat("p-value from R's ks.test():", observed_ks_result$p.value, "\n\n")
 # p-value from R's ks.test(): 0.002056568

 # --- 3. Perform the Permutation Test ---

 # Combine the two samples into one pool
 pooled_data <- c(sample1, sample2)
 n1 <- length(sample1)
 n2 <- length(sample2)
 total_n <- n1 + n2

 # Vector to store KS statistics from each permutation
 permutation_D_statistics <- numeric(num_permutations)

 cat("--- Running Permutation Test --- \n")
 for (i in 1:num_permutations) {
  # Shuffle the pooled data
  permuted_data <- sample(pooled_data)
  
  # Split the permuted data back into two 'samples' of the original sizes
  permuted_sample1 <- permuted_data[1:n1]
  permuted_sample2 <- permuted_data[(n1 + 1):total_n]
  
  # Calculate the KS statistic for the permuted samples
  # We use ks.test() directly here as requested
  perm_ks_result <- ks.test(permuted_sample1, permuted_sample2)
  permutation_D_statistics[i] <- perm_ks_result$statistic
 }

 # --- 4. Calculate the permutation p-value ---
 # The p-value is the proportion of permuted D-statistics that are
 # as extreme as or more extreme than the observed D-statistic.
 # For KS test, 'more extreme' means 'greater than or equal to'.
 permutation_p_value <- sum(permutation_D_statistics >= observed_D_statistic) / num_permutations

 cat("\n--- Permutation Test Results --- \n")
 cat("Number of permutations:", num_permutations, "\n")
 cat("Permutation p-value:", permutation_p_value, "\n")
 # Permutation p-value: 0.0016 

 # --- 5. Optional: Visualize the permutation distribution ---
 hist(permutation_D_statistics,
     main = "Permutation Distribution of KS D-Statistic",
     xlab = "KS D-Statistic",
     freq = FALSE, # Use frequency = FALSE for density histogram
     col = "lightblue",
     border = "white")
 abline(v = observed_D_statistic, col = "red", lty = 2, lwd = 2)
 text(x = observed_D_statistic, y = max(hist(permutation_D_statistics, plot = FALSE)$density) * 0.9,
     labels = paste("Observed D =", round(observed_D_statistic, 3)),
     col = "red", pos = 4)
	# 1. Generate sample data
	set.seed(123)
	sample_data <- rnorm(100, mean = 0, sd = 1) # A sample from a standard normal distribution

	# 2. Define the theoretical CDF (for standard normal distribution)
	# In R, pnorm() is the CDF for the normal distribution
	theoretical_cdf <- function(x) pnorm(x, mean = 0, sd = 1)

	# 3. Compute the ECDF of your sample
	ecdf_sample <- ecdf(sample_data)

	# 4. Evaluate ECDF and theoretical CDF at relevant points
	# The KS statistic is the maximum difference, which occurs at the data points themselves
	# or just below them. We need to check both F_n(x_i) and F_n(x_i-1) approx.
	# A common approach is to sort the data and evaluate at these points.
	sorted_data <- sort(sample_data)
	n <- length(sorted_data)

	# Values of the ECDF at the sorted data points
	Fn_values <- ecdf_sample(sorted_data)

	# Values of the theoretical CDF at the sorted data points
	F0_values <- theoretical_cdf(sorted_data)

	# Calculate the differences at each data point
	diff_at_points <- abs(Fn_values - F0_values)

	# Also consider the step just before each point.
	# F_n(x_i^-) = (i-1)/n
	# F_0(x_i) is the theoretical CDF at that point.
	# The `ecdf` function in R handles this automatically when you evaluate it,
	# but for a manual calculation of the supremum, we need to consider both sides of the jump.
	# For simplicity, we can calculate two types of differences:
	# D+ = max_i (F_n(x_i) - F_0(x_i))
	# D- = max_i (F_0(x_i) - F_n(x_i-1))
	# The KS statistic is max(D+, D-)

	# ECDF values at the points x_i (proportion of data <= x_i)
	Fn_at_xi <- (1:n) / n

	# ECDF values just before the points x_i (proportion of data < x_i)
	Fn_before_xi <- (0:(n-1)) / n

	# The two potential differences to check for the supremum:
	# 1. \|F_n(x_i) - F_0(x_i)\|
	diff1 <- abs(Fn_at_xi - F0_values)

	# 2. \|F_n(x_i-1) - F_0(x_i)\|
	# This needs careful handling for the first point (i=1), where F_n(x_0) would be 0.
	# The `ecdf_sample(x)` evaluates to the value of the step function at x.
	# The max difference can occur just before or at an observation.
	# The standard definition of D_n takes the supremum over all x.
	# For sorted data x_1 <= x_2 <= ... <= x_n, the supremum will always occur at one of the x_i.
	# Specifically, we need to check:
	# D_n = max(max_{i} (i/n - F_0(x_i)), max_{i} (F_0(x_i) - (i-1)/n))

	D_plus <- max(Fn_at_xi - F0_values)
	D_minus <- max(F0_values - Fn_before_xi)

	ks_statistic_manual_one_sample <- max(D_plus, D_minus)
	cat("Manually calculated One-Sample KS Statistic:", ks_statistic_manual_one_sample, "\n")
	# Manually calculated One-Sample KS Statistic: 0.0930341

	# Compare with R's built-in ks.test()
	ks_result <- ks.test(sample_data, "pnorm", mean = 0, sd = 1)
	cat("R's ks.test() One-Sample KS Statistic:", ks_result$statistic, "\n")
	# R's ks.test() One-Sample KS Statistic: 0.0930341

	# Slight differences might occur due to how floating-point numbers are handled
	# and the exact evaluation points for the supremum, but they should be very close.
	# 1. Generate two sample datasets
	set.seed(456)
	sample1 <- rnorm(50, mean = 0, sd = 1)
	sample2 <- rnorm(60, mean = 0.2, sd = 1) # Slightly shifted mean for potential difference

	# 2. Compute ECDFs for both samples
	ecdf1 <- ecdf(sample1)
	ecdf2 <- ecdf(sample2)

	# 3. Combine and sort all unique data points from both samples
	# The supremum can only occur at values present in either sample.
	all_points <- sort(unique(c(sample1, sample2)))

	# 4. Evaluate both ECDFs at these combined sorted points
	Fn_values_at_all_points <- ecdf1(all_points)
	Gn_values_at_all_points <- ecdf2(all_points)

	# 5. Calculate the absolute differences
	differences <- abs(Fn_values_at_all_points - Gn_values_at_all_points)

	# 6. Find the maximum of these differences
	ks_statistic_manual_two_sample <- max(differences)

	cat("Manually calculated Two-Sample KS Statistic:", ks_statistic_manual_two_sample, "\n")
	# Manually calculated Two-Sample KS Statistic: 0.09666667

	# Compare with R's built-in ks.test()
	ks_result_two_sample <- ks.test(sample1, sample2)
	cat("R's ks.test() Two-Sample KS Statistic:", ks_result_two_sample$statistic, "\n")
	# R's ks.test() Two-Sample KS Statistic: 0.09666667
	# --- 1. Set up the data and parameters ---
	set.seed(789) # For reproducibility

	# Sample 1: From a standard normal distribution
	sample1 <- rnorm(50, mean = 0, sd = 1)

	# Sample 2: From a slightly different normal distribution (null hypothesis is likely false)
	# This will likely result in a small p-value from both methods
	sample2 <- rnorm(60, mean = 0.5, sd = 1)

	# Number of permutations
	num_permutations <- 10000

	# --- 2. Calculate the observed KS statistic ---
	# This is the statistic for our original, unpermuted data.
	observed_ks_result <- ks.test(sample1, sample2)
	observed_D_statistic <- observed_ks_result$statistic

	cat("--- Observed Data --- \n")
	cat("Observed KS D-statistic:", observed_D_statistic, "\n")
	# Observed KS D-statistic: 0.3466667
	cat("p-value from R's ks.test():", observed_ks_result$p.value, "\n\n")
	# p-value from R's ks.test(): 0.002056568

	# --- 3. Perform the Permutation Test ---

	# Combine the two samples into one pool
	pooled_data <- c(sample1, sample2)
	n1 <- length(sample1)
	n2 <- length(sample2)
	total_n <- n1 + n2

	# Vector to store KS statistics from each permutation
	permutation_D_statistics <- numeric(num_permutations)

	cat("--- Running Permutation Test --- \n")
	for (i in 1:num_permutations) {
	# Shuffle the pooled data
	permuted_data <- sample(pooled_data)

	# Split the permuted data back into two 'samples' of the original sizes
	permuted_sample1 <- permuted_data[1:n1]
	permuted_sample2 <- permuted_data[(n1 + 1):total_n]

	# Calculate the KS statistic for the permuted samples
	# We use ks.test() directly here as requested
	perm_ks_result <- ks.test(permuted_sample1, permuted_sample2)
	permutation_D_statistics[i] <- perm_ks_result$statistic
	}

	# --- 4. Calculate the permutation p-value ---
	# The p-value is the proportion of permuted D-statistics that are
	# as extreme as or more extreme than the observed D-statistic.
	# For KS test, 'more extreme' means 'greater than or equal to'.
	permutation_p_value <- sum(permutation_D_statistics >= observed_D_statistic) / num_permutations

	cat("\n--- Permutation Test Results --- \n")
	cat("Number of permutations:", num_permutations, "\n")
	cat("Permutation p-value:", permutation_p_value, "\n")
	# Permutation p-value: 0.0016

	# --- 5. Optional: Visualize the permutation distribution ---
	hist(permutation_D_statistics,
	main = "Permutation Distribution of KS D-Statistic",
	xlab = "KS D-Statistic",
	freq = FALSE, # Use frequency = FALSE for density histogram
	col = "lightblue",
	border = "white")
	abline(v = observed_D_statistic, col = "red", lty = 2, lwd = 2)
	text(x = observed_D_statistic, y = max(hist(permutation_D_statistics, plot = FALSE)$density) * 0.9,
	labels = paste("Observed D =", round(observed_D_statistic, 3)),
	col = "red", pos = 4)