JotaRata · December 16, 2024 04:54 · JotaRata · Dec 16, 2024
diff --git a/bootstrap.py b/bootstrap.py
 import math
 import random
 from typing import Any, Callable, Concatenate, List, ParamSpec, ParamSpecKwargs, Tuple, cast
 import numpy as np
 from numpy.typing import NDArray,ArrayLike
 from pandas import DataFrame, Series

 # ---------------- Bootstrapping -------------------------------
 _ArrayLike = NDArray[np.floating] | None
 _NumFunction = Callable[..., float]

 """
 Bootstrap Module for Scientific Data Analysis.

 This module provides three utility functions—`bootstrap.sampler`, `bootstrap.sampler_mv`, and `bootstrap.split`—designed for scientific analysis of noisy data, particularly for tasks involving resampling, error estimation, and data chunking. These functions are optimized for large datasets and can be compiled with `mypyc` or other acceleration tools for performance enhancements.

 The core functionality revolves around applying bootstrap techniques to resample data, compute statistics, and estimate errors, which is useful in various scientific applications like curve fitting, noise reduction, and uncertainty quantification.

 Functions
 ---------
 - `sampler`: Performs bootstrap resampling on a 1D array of values, computing an estimator function and returning the specified statistic with optional error estimation.
 - `sampler_mv`: A multivariate extension of `sampler`, allowing for bootstrap sampling on two lists or arrays of data vectors.
 - `split`: Splits an input array into smaller chunks based on a specified window size, useful for partitioning time series or other data types.

 Example
 -------
 ```python
 # Extracting a clean curve from noisy data with corresponding errors
 import bootstrap
 from bootstrap import split
 import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt

 # Load data from file
 data = pd.read_csv("lightcurves.csv")
 time = data['time_days']
 flux = data['flux']

 # Prepare a list to store resampled data
 resampled_data = []

 # Split the data using a window size of 20 days, chunking data with the `indices` parameter
 for indices in split(time, 20, indices=data.index):
    if len(indices) == 0:
        continue
    
    # Compute the mean time value for the chunk
    mean_time = time[indices].mean()
    
    # Compute bootstrapped flux values and their errors
    flux_value, err = bootstrap.sampler(flux[indices], include_error=True, iter=1000)
    
    # Append results (mean time, bootstrapped flux, error)
    resampled_data.append([mean_time, flux_value, err])

 # Convert the list into a 2D NumPy array for easier access
 resampled_data = np.vstack(resampled_data)

 # Plot the results with error bars
 plt.errorbar(resampled_data[:, 0], resampled_data[:, 1], resampled_data[:, 2])
 plt.xlabel('Time (days)')
 plt.ylabel('Flux')
 plt.show()
 ```

 # Notes

 The functions in this module are intended for handling noisy data commonly encountered in scientific research. They are particularly useful in contexts such as:

 	* Time series analysis
 	* Bootstrapping for uncertainty quantification
 	* Handling non-parametric distributions

 For optimal performance, consider compiling the module using mypyc or other similar tools.
 """

 def sampler(sample :  List[float],
 				stat : _NumFunction = np.nanmedian,
 				est : _NumFunction = np.nanmedian,
 				iter : int = 1000,
 				include_error : bool = False,
 				print_values : bool = False) -> Tuple[float, float] | float:
 	"""
 		Bootstrap Sampler.

 		This function performs bootstrap resampling on a list of values by randomly resampling the data,
 		applying an estimator function to each sub-sample, and returning the statistic computed from these
 		estimations. Optionally, it can return the standard deviation of the estimates as an error measure.

 		Parameters
 		----------
 		sample : List[float]
 			A list of numerical values to be resampled.
 		stat : Callable, optional
 			A function that computes a statistic from the sub-sample estimates. Defaults to `np.nanmedian`.
 		est : Callable, optional
 			The estimator function applied to each sub-sample. Defaults to `np.nanmedian`.
 		iter : int, optional
 			The number of bootstrap iterations to perform. Defaults to 1000.
 		include_error : bool, optional
 			If `True`, the function will return a tuple of two floats: the computed statistic and the standard
 			deviation of the estimated values. Defaults to `False`.
 		print_values : bool, optional
 			If `True`, the function will print the estimated values at each iteration (for debugging purposes). Defaults to `False`.

 		Returns
 		-------
 		float or Tuple[float, float]
 			Returns the computed statistic as a single float. If `include_error` is `True`, it returns a tuple
 			containing the statistic and the standard deviation of the estimates.

 		Notes
 		-----
 		The function generates sub-samples by randomly selecting values from the input list with replacement.
 		The estimator function is applied to each sub-sample, and the result is appended to a list. After `iter`
 		iterations, the specified statistic is computed from the list of results.

 		Both the `est` and `stat` functions must accept a list of values and return a single float.

 		This process can be slow for a large number of iterations. For performance improvements,
 		consider compiling the module with `mypyc` or using other optimization methods.

 		For multivariate bootstrap sampling, refer to `sampler_mv`.
 	"""

 	
 	values : List[float] = list[float]()
 	_size = len(sample)
 	if _size < 3:
 		if include_error: 
 			return np.nan, np.nan
 		else: 
 			return np.nan
 	i : int 
 	for i in range(iter):
 		boot_sample : List[float] = np.random.choice(sample, replace = True, size = len(sample)).tolist()
 		boot_mean = est(boot_sample)
 		values.append(boot_mean)

 	if print_values:
 		print('values: ', values)
 	if include_error:
 		return stat(values), cast(float, np.nanstd(values))
 	else:
 		return stat(values)

 def sampler_mv(sample : Tuple[ List[float],  List[float]],
 				stat : _NumFunction = np.nanmedian,
 				est : Callable[..., float] | None = None,
 				iter : int = 1000,
 				indices : List[int]|None = None,
 				include_error : bool = False,
 				print_values : bool = False) -> Tuple[float, float] | float:
 	"""
 		Multivariate Bootstrap Sampler.

 		This function performs bootstrap sampling on two lists of vectors (or 2D arrays) by randomly resampling 
 		the data, applying an estimator function to each sub-sample, and returning the statistic computed 
 		from these estimations. Optionally, it can return the standard deviation of the estimates as an error measure.

 		Parameters
 		----------
 		sample : Tuple[List[float], List[float]]
 			A tuple containing two lists of float values representing the input data vectors.
 		stat : Callable, optional
 			A function that computes a statistic from the sub-sample estimates. Defaults to `np.nanmedian`.
 		est : Callable[..., float], optional
 			The estimator function applied to each sub-sample. It must accept two lists of float values
 			and return a single float. Required.
 		iter : int, optional
 			The number of bootstrap iterations to perform. Defaults to 1000.
 		indices : List[int], optional
 			A list of indices used to select the sub-sample from the input. If not provided, all elements
 			are used for sampling.
 		include_error : bool, optional
 			If `True`, the function will return a tuple of two floats: the computed statistic and the standard
 			deviation of the estimated values. Defaults to `False`.
 		print_values : bool, optional
 			If `True`, the function will print the estimated values at each iteration (for debugging purposes). Defaults to `False`.

 		Returns
 		-------
 		float or Tuple[float, float]
 			Returns the computed statistic as a single float. If `include_error` is `True`, it returns a tuple
 			containing the statistic and the standard deviation of the estimates.

 		Notes
 		-----
 		The function performs bootstrap sampling by generating sub-samples of the same length as the input,
 		applying the estimator to each sub-sample, and collecting the results. After `iter` iterations, 
 		the specified statistic is computed from the list of results.

 		The `est` function must be provided, and it should accept two lists of float values, returning a single float.
 		The `stat` function can be any function that takes a list of values and returns a single value (e.g., mean, median).

 		Due to the number of iterations, this process can be slow. For performance improvements, consider
 		compiling the module with `mypyc` or other optimization techniques.
 	"""

 	values : List[float] = list[float]()
 	assert stat is not None and est is not None
 	_sample0, _sample1 = sample
 	assert _sample0 is not None and _sample1 is not None
 	_size : int = len(_sample0)
 	assert _size == len(_sample1)
 	assert est is not None
 	if _size < 3:
 		if include_error: 
 			return np.nan, np.nan
 		else: 
 			return np.nan
 	if indices is None:
 		indices = list(range(_size))
 	for _ in range(iter):
 		boot_sample : List[int] = np.random.choice(indices, replace = True, size = len(indices)).tolist()
 		_s0 = [ _sample0[i] for i in boot_sample ]
 		_s1 = [ _sample1[i] for i in boot_sample ]
 		boot = est(_s0, _s1)
 		values.append(boot)

 	if print_values:
 		print('values: ', values)
 	if include_error:
 		return stat(values), cast(float, np.nanstd(values))
 	else:
 		return stat(values)


 def split(data : List[float], window : float|int, data_range : Tuple | None= None,
 			 indices : List[int]|None = None):
 	"""
 		Splits the input data into chunks based on a specified window size.

 		This function divides an array into smaller sub-arrays (chunks) where each chunk contains values
 		that fall within consecutive ranges of a defined `window` size. It can optionally apply the splitting
 		on a subset of indices while using the original data for comparison. This is typically used alongside
 		a bootstrap sampler.

 		Parameters
 		----------
 		data : List[float]
 			The input array of numerical data to be split.
 		window : float or int
 			The range or width of each chunk. The window defines the boundary for each chunk's
 			minimum and maximum values, not the number of elements.
 		data_range : Tuple[float, float], optional
 			A tuple specifying the range (min, max) to clip the input data before chunking. If not provided,
 			the range is determined by the minimum and maximum values of the input data.
 		indices : List[int], optional
 			An optional list of indices corresponding to the data. If provided, the function will
 			chunk the indices based on the data values.

 		Returns
 		-------
 		List[List[float]] or List[List[int]]
 			A list of lists where each inner list represents a chunk of data or indices depending
 			on whether `indices` is provided.
 	"""


 	if data_range is None: 
 		data_range = min(data), max(data)

 	count = math.ceil((data_range[1] - data_range[0]) / window)

 	if indices is None:
 		chunks_f = list[list[float]]()
 		for i in range(1, count + 1):
 			mask = [ (data[j] > (i - 1) * window + data_range[0]) \
 			  			& (data[j] < i * window + data_range[0]) for j in range(len(data)) ]
 			chunks_f.append([ data[i] for i, m in enumerate(mask) if m ])
 		return chunks_f
 	else:
 		chunks_i = list[list[int]]()
 		for i in range(1, count + 1):
 			mask = [ (data[j] > (i - 1) * window + data_range[0]) \
 						& (data[j] < i * window + data_range[0]) for j in range(len(data)) ]
 			chunks_i.append([ indices[i] for i, m in enumerate(mask) if m ])
 		return chunks_i
	import math
	import random
	from typing import Any, Callable, Concatenate, List, ParamSpec, ParamSpecKwargs, Tuple, cast
	import numpy as np
	from numpy.typing import NDArray,ArrayLike
	from pandas import DataFrame, Series

	# ---------------- Bootstrapping -------------------------------
	_ArrayLike = NDArray[np.floating] \| None
	_NumFunction = Callable[..., float]

	"""
	Bootstrap Module for Scientific Data Analysis.

	This module provides three utility functions—`bootstrap.sampler`, `bootstrap.sampler_mv`, and `bootstrap.split`—designed for scientific analysis of noisy data, particularly for tasks involving resampling, error estimation, and data chunking. These functions are optimized for large datasets and can be compiled with `mypyc` or other acceleration tools for performance enhancements.

	The core functionality revolves around applying bootstrap techniques to resample data, compute statistics, and estimate errors, which is useful in various scientific applications like curve fitting, noise reduction, and uncertainty quantification.

	Functions
	---------
	- `sampler`: Performs bootstrap resampling on a 1D array of values, computing an estimator function and returning the specified statistic with optional error estimation.
	- `sampler_mv`: A multivariate extension of `sampler`, allowing for bootstrap sampling on two lists or arrays of data vectors.
	- `split`: Splits an input array into smaller chunks based on a specified window size, useful for partitioning time series or other data types.

	Example
	-------
	```python
	# Extracting a clean curve from noisy data with corresponding errors
	import bootstrap
	from bootstrap import split
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt

	# Load data from file
	data = pd.read_csv("lightcurves.csv")
	time = data['time_days']
	flux = data['flux']

	# Prepare a list to store resampled data
	resampled_data = []

	# Split the data using a window size of 20 days, chunking data with the `indices` parameter
	for indices in split(time, 20, indices=data.index):
	if len(indices) == 0:
	continue

	# Compute the mean time value for the chunk
	mean_time = time[indices].mean()

	# Compute bootstrapped flux values and their errors
	flux_value, err = bootstrap.sampler(flux[indices], include_error=True, iter=1000)

	# Append results (mean time, bootstrapped flux, error)
	resampled_data.append([mean_time, flux_value, err])

	# Convert the list into a 2D NumPy array for easier access
	resampled_data = np.vstack(resampled_data)

	# Plot the results with error bars
	plt.errorbar(resampled_data[:, 0], resampled_data[:, 1], resampled_data[:, 2])
	plt.xlabel('Time (days)')
	plt.ylabel('Flux')
	plt.show()
	```

	# Notes

	The functions in this module are intended for handling noisy data commonly encountered in scientific research. They are particularly useful in contexts such as:

	* Time series analysis
	* Bootstrapping for uncertainty quantification
	* Handling non-parametric distributions

	For optimal performance, consider compiling the module using mypyc or other similar tools.
	"""

	def sampler(sample : List[float],
	stat : _NumFunction = np.nanmedian,
	est : _NumFunction = np.nanmedian,
	iter : int = 1000,
	include_error : bool = False,
	print_values : bool = False) -> Tuple[float, float] \| float:
	"""
	Bootstrap Sampler.

	This function performs bootstrap resampling on a list of values by randomly resampling the data,
	applying an estimator function to each sub-sample, and returning the statistic computed from these
	estimations. Optionally, it can return the standard deviation of the estimates as an error measure.

	Parameters
	----------
	sample : List[float]
	A list of numerical values to be resampled.
	stat : Callable, optional
	A function that computes a statistic from the sub-sample estimates. Defaults to `np.nanmedian`.
	est : Callable, optional
	The estimator function applied to each sub-sample. Defaults to `np.nanmedian`.
	iter : int, optional
	The number of bootstrap iterations to perform. Defaults to 1000.
	include_error : bool, optional
	If `True`, the function will return a tuple of two floats: the computed statistic and the standard
	deviation of the estimated values. Defaults to `False`.
	print_values : bool, optional
	If `True`, the function will print the estimated values at each iteration (for debugging purposes). Defaults to `False`.

	Returns
	-------
	float or Tuple[float, float]
	Returns the computed statistic as a single float. If `include_error` is `True`, it returns a tuple
	containing the statistic and the standard deviation of the estimates.

	Notes
	-----
	The function generates sub-samples by randomly selecting values from the input list with replacement.
	The estimator function is applied to each sub-sample, and the result is appended to a list. After `iter`
	iterations, the specified statistic is computed from the list of results.

	Both the `est` and `stat` functions must accept a list of values and return a single float.

	This process can be slow for a large number of iterations. For performance improvements,
	consider compiling the module with `mypyc` or using other optimization methods.

	For multivariate bootstrap sampling, refer to `sampler_mv`.
	"""


	values : List[float] = list[float]()
	_size = len(sample)
	if _size < 3:
	if include_error:
	return np.nan, np.nan
	else:
	return np.nan
	i : int
	for i in range(iter):
	boot_sample : List[float] = np.random.choice(sample, replace = True, size = len(sample)).tolist()
	boot_mean = est(boot_sample)
	values.append(boot_mean)

	if print_values:
	print('values: ', values)
	if include_error:
	return stat(values), cast(float, np.nanstd(values))
	else:
	return stat(values)

	def sampler_mv(sample : Tuple[ List[float], List[float]],
	stat : _NumFunction = np.nanmedian,
	est : Callable[..., float] \| None = None,
	iter : int = 1000,
	indices : List[int]\|None = None,
	include_error : bool = False,
	print_values : bool = False) -> Tuple[float, float] \| float:
	"""
	Multivariate Bootstrap Sampler.

	This function performs bootstrap sampling on two lists of vectors (or 2D arrays) by randomly resampling
	the data, applying an estimator function to each sub-sample, and returning the statistic computed
	from these estimations. Optionally, it can return the standard deviation of the estimates as an error measure.

	Parameters
	----------
	sample : Tuple[List[float], List[float]]
	A tuple containing two lists of float values representing the input data vectors.
	stat : Callable, optional
	A function that computes a statistic from the sub-sample estimates. Defaults to `np.nanmedian`.
	est : Callable[..., float], optional
	The estimator function applied to each sub-sample. It must accept two lists of float values
	and return a single float. Required.
	iter : int, optional
	The number of bootstrap iterations to perform. Defaults to 1000.
	indices : List[int], optional
	A list of indices used to select the sub-sample from the input. If not provided, all elements
	are used for sampling.
	include_error : bool, optional
	If `True`, the function will return a tuple of two floats: the computed statistic and the standard
	deviation of the estimated values. Defaults to `False`.
	print_values : bool, optional
	If `True`, the function will print the estimated values at each iteration (for debugging purposes). Defaults to `False`.

	Returns
	-------
	float or Tuple[float, float]
	Returns the computed statistic as a single float. If `include_error` is `True`, it returns a tuple
	containing the statistic and the standard deviation of the estimates.

	Notes
	-----
	The function performs bootstrap sampling by generating sub-samples of the same length as the input,
	applying the estimator to each sub-sample, and collecting the results. After `iter` iterations,
	the specified statistic is computed from the list of results.

	The `est` function must be provided, and it should accept two lists of float values, returning a single float.
	The `stat` function can be any function that takes a list of values and returns a single value (e.g., mean, median).

	Due to the number of iterations, this process can be slow. For performance improvements, consider
	compiling the module with `mypyc` or other optimization techniques.
	"""

	values : List[float] = list[float]()
	assert stat is not None and est is not None
	_sample0, _sample1 = sample
	assert _sample0 is not None and _sample1 is not None
	_size : int = len(_sample0)
	assert _size == len(_sample1)
	assert est is not None
	if _size < 3:
	if include_error:
	return np.nan, np.nan
	else:
	return np.nan
	if indices is None:
	indices = list(range(_size))
	for _ in range(iter):
	boot_sample : List[int] = np.random.choice(indices, replace = True, size = len(indices)).tolist()
	_s0 = [ _sample0[i] for i in boot_sample ]
	_s1 = [ _sample1[i] for i in boot_sample ]
	boot = est(_s0, _s1)
	values.append(boot)

	if print_values:
	print('values: ', values)
	if include_error:
	return stat(values), cast(float, np.nanstd(values))
	else:
	return stat(values)


	def split(data : List[float], window : float\|int, data_range : Tuple \| None= None,
	indices : List[int]\|None = None):
	"""
	Splits the input data into chunks based on a specified window size.

	This function divides an array into smaller sub-arrays (chunks) where each chunk contains values
	that fall within consecutive ranges of a defined `window` size. It can optionally apply the splitting
	on a subset of indices while using the original data for comparison. This is typically used alongside
	a bootstrap sampler.

	Parameters
	----------
	data : List[float]
	The input array of numerical data to be split.
	window : float or int
	The range or width of each chunk. The window defines the boundary for each chunk's
	minimum and maximum values, not the number of elements.
	data_range : Tuple[float, float], optional
	A tuple specifying the range (min, max) to clip the input data before chunking. If not provided,
	the range is determined by the minimum and maximum values of the input data.
	indices : List[int], optional
	An optional list of indices corresponding to the data. If provided, the function will
	chunk the indices based on the data values.

	Returns
	-------
	List[List[float]] or List[List[int]]
	A list of lists where each inner list represents a chunk of data or indices depending
	on whether `indices` is provided.
	"""


	if data_range is None:
	data_range = min(data), max(data)

	count = math.ceil((data_range[1] - data_range[0]) / window)

	if indices is None:
	chunks_f = list[list[float]]()
	for i in range(1, count + 1):
	mask = [ (data[j] > (i - 1) * window + data_range[0]) \
	& (data[j] < i * window + data_range[0]) for j in range(len(data)) ]
	chunks_f.append([ data[i] for i, m in enumerate(mask) if m ])
	return chunks_f
	else:
	chunks_i = list[list[int]]()
	for i in range(1, count + 1):
	mask = [ (data[j] > (i - 1) * window + data_range[0]) \
	& (data[j] < i * window + data_range[0]) for j in range(len(data)) ]
	chunks_i.append([ indices[i] for i, m in enumerate(mask) if m ])
	return chunks_i