vhaasteren · March 7, 2024 16:12
diff --git a/gistfile1.txt b/gistfile1.txt
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 # vim: tabstop=4:softtabstop=4:shiftwidth=4:expandtab

 """
 cuda_interpolation: Fast bulk interpolation using numpy, cuda, and cupy

 author:     Rutger van Haasteren
 email:      [email protected]
 date:       March, 2024
 """

 import numpy as np
 import scipy.interpolate as sint
 import scipy.stats as sstats
 import cupy as cp
 import os

 # This is the code for a CUDA kernel that does a parallel binary search
 kernel_code = '''
 extern "C"
 __global__ void binary_search_kernel(const float* rho, const float* knots, int* left_indices, int n_rhos, int n_knots) {
    int rho_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (rho_idx >= n_rhos) return;

    float rho_val = rho[rho_idx];
    const float* row_knots = knots + rho_idx * n_knots;

    int left = 0;
    int right = n_knots - 1;
    int mid = 0;
    while (left <= right) {
        mid = left + (right - left) / 2;
        if (row_knots[mid] <= rho_val) {
            left = mid + 1;
        } else {
            right = mid - 1;
        }
    }
    left_indices[rho_idx] = max(0, left - 1);
 }
 '''

 # This is the code for a CUDA kernel that does the full parallel interpolation
 bulk_interpolation_kernel_code = '''
 extern "C"
 __global__ void bulk_interpolation_kernel(const float* rho, const float* knots, const float* coeffs, float* y, int n_rhos, int n_knots, int n_order) {
    int rho_idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (rho_idx >= n_rhos) return;

    float rho_val = rho[rho_idx];
    const float* row_knots = knots + rho_idx * n_knots;
    const float* row_coeffs = coeffs + rho_idx * (n_knots-1) * n_order;

    int left = 0;
    int right = n_knots - 1;
    int mid = 0;
    int left_index = 0;
    while (left <= right) {
        mid = left + (right - left) / 2;
        if (row_knots[mid] <= rho_val) {
            left = mid + 1;
        } else {
            right = mid - 1;
        }
    }
    left_index = max(0, left - 1);

    /* We have the left-indices. Now do the interpolation */
    float delta = rho_val - row_knots[left_index];
    float multiply = 1.0;
    int order = 0;
    y[rho_idx] = 0;

    for(order=n_order-1; order>=0; order--) {
        y[rho_idx] += row_coeffs[left_index*n_order+order] * multiply;
        multiply = multiply * delta;
    }
 }
 '''

 # Compile the CUDA kernels
 binary_search_kernel = cp.RawKernel(kernel_code, 'binary_search_kernel')
 bulk_interpolation_kernel = cp.RawKernel(bulk_interpolation_kernel_code, 'bulk_interpolation_kernel')

 # The custom CUDA kernel for the bulk binary tree search

 def linear_interp_to_spline(spline_func):
    """Convert a scipy.interp1d linear representation to spline rep"""
    delta_x = spline_func.x[1:] - spline_func.x[:-1]
    delta_y = spline_func.y[1:] - spline_func.y[:-1]

    spline_func.c = np.stack([delta_y/delta_x, spline_func.y[:-1]], axis=0)

 def get_spline_knots_coeffs(spline_funcs, order=3):
    """Get the knots and coefficients of the splines"""
    coeffs = np.stack([s.c.T for s in spline_funcs], axis=0)
    knots = np.stack([s.x for s in spline_funcs], axis=0)

    return knots, coeffs[:,:,:order]

 def find_segment_indices(rho, knots):
    """Find the segment indices for each rho value in its corresponding set of knots."""
    # Expand rho to match knots shape for broadcasting
    rho_expanded = rho[:, None]

    # Find the knot indices
    is_less_equal = rho_expanded <= knots
    right_knot_indices = np.argmax(is_less_equal, axis=1)
    left_knot_indices = right_knot_indices - 1

    # Adjust indices to ensure they are within bounds
    return np.clip(left_knot_indices, 0, knots.shape[1] - 2)

 def find_segment_indices_cupy(rho, knots):
    """Find the segment indices for each rho value in its corresponding set of knots using CuPy."""
    rho_expanded = cp.asarray(rho, dtype=np.float32)[:, None]
    knots = cp.asarray(knots, dtype=np.float32)

    # Find the knot indices
    is_less_equal = rho_expanded <= knots
    right_knot_indices = cp.argmax(is_less_equal, axis=1)
    left_knot_indices = right_knot_indices - 1

    return cp.clip(left_knot_indices, 0, knots.shape[1] - 2)

 def find_segment_indices_cuda(rho, knots):
    """Find the segment indices for each rho value in its corresponding set of knots using CuPy, optimized with binary search."""
    # Ensure rho and knots are CuPy arrays, single precision
    rho_cp = cp.asarray(rho, dtype=np.float32)
    knots_cp = cp.asarray(knots, dtype=np.float32)

    # Prepare output array for indices
    left_knot_indices = cp.empty(rho_cp.size, dtype=cp.int32)

    # Launch the custom kernel to find the knot indices
    threads_per_block = 256
    blocks_per_grid = (rho_cp.size + (threads_per_block - 1)) // threads_per_block
    binary_search_kernel((blocks_per_grid,), (threads_per_block,), (rho_cp, knots_cp, left_knot_indices, knots_cp.shape[0], knots_cp.shape[1]))

    return left_knot_indices

 def polynomial_spline_eval_bulk(rho, knots, coeffs):
    """Efficient polynomial interpolation using vectorized operations.

    Parameters:
    ----------
    - rho: array of points at which to evaluate the interpolation.
    - knots: array of x-values where the interpolator changes ('knots').
    - coeffs: array of coefficients for each polynomial segment. For a linear
      segment, coeffs[..., 1] is the y-value at the start of the segment, and
      coeffs[..., 0] is the slope of the segment.

    Assumes the first dimension of knots and coeffs correspond to different interpolators,
    and that knots for each interpolator are sorted.
    """

    # Find the index of the left knot for each rho
    left_knot_indices = find_segment_indices(rho, knots)

    # Gather the coefficients for the relevant segment
    segment_coeffs = np.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1)

    # Gather the x values for the left knots
    x0 = np.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten()

    # Evaluate the polynomial at rho
    delta = rho - x0
    y = np.sum(segment_coeffs * delta[..., None] ** np.arange(segment_coeffs.shape[1])[::-1], axis=-1)

    return y

 def polynomial_spline_eval_bulk_cupy(rho, knots, coeffs):
    """Efficient polynomial interpolation using GPU operations. Make sure that the
    input arrays are already on the GPU in np.float32 for efficiency

    Parameters:
    ----------
    - rho: array of points at which to evaluate the interpolation.
    - knots: array of x-values where the interpolator changes ('knots').
    - coeffs: array of coefficients for each polynomial segment. For a linear
      segment, coeffs[..., 1] is the y-value at the start of the segment, and
      coeffs[..., 0] is the slope of the segment.

    Assumes the first dimension of knots and coeffs correspond to different interpolators,
    and that knots for each interpolator are sorted.
    """
    # Make sure these are all cupy arrays
    rho = cp.asarray(rho, dtype=np.float32)
    knots = cp.asarray(knots, dtype=np.float32)
    coeffs = cp.asarray(coeffs, dtype=np.float32)

    # Find the index of the left knot for each rho
    left_knot_indices = find_segment_indices_cupy(rho, knots)

    # Ensuring that the indexing matches the NumPy approach more closely
    segment_coeffs = cp.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1)

    # Gather the x values for the left knots
    x0 = cp.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten()

    # Do the interpolation step
    delta = rho - x0
    powers = cp.arange(segment_coeffs.shape[1])[::-1]
    delta_powers = delta[..., None] ** powers
    y = cp.sum(segment_coeffs * delta_powers, axis=-1)

    return y

 def polynomial_spline_eval_bulk_cuda(rho, knots, coeffs):
    """Efficient polynomial interpolation using custom GPU operations.
    This one is identical to polynomial_spline_eval_bulk_cupy, except
    that the binary tree search is done with a custom CUDA kernel here

    Parameters:
    ----------
    - rho: array of points at which to evaluate the interpolation.
    - knots: array of x-values where the interpolator changes ('knots').
    - coeffs: array of coefficients for each polynomial segment. For a linear
      segment, coeffs[..., 1] is the y-value at the start of the segment, and
      coeffs[..., 0] is the slope of the segment.

    Assumes the first dimension of knots and coeffs correspond to different interpolators,
    and that knots for each interpolator are sorted.
    """
    # Make sure these are all cupy arrays
    rho = cp.asarray(rho, dtype=np.float32)
    knots = cp.asarray(knots, dtype=np.float32)
    coeffs = cp.asarray(coeffs, dtype=np.float32)

    # Find the index of the left knot for each rho
    left_knot_indices = find_segment_indices_cuda(rho, knots)

    # Ensuring that the indexing matches the NumPy approach more closely
    segment_coeffs = cp.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1)

    # Gather the x values for the left knots
    x0 = cp.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten()

    # Do the interpolation step
    delta = rho - x0
    powers = cp.arange(segment_coeffs.shape[1])[::-1]
    delta_powers = delta[..., None] ** powers
    y = cp.sum(segment_coeffs * delta_powers, axis=-1)

    return y

 def polynomial_spline_eval_custom_cuda_kernel(rho, knots, coeffs):
    """Efficient polynomial interpolation using custom GPU operations.
    This one is identical to polynomial_spline_eval_bulk_cupy, except
    that the binary tree search is done with a custom CUDA kernel here

    Parameters:
    ----------
    - rho: array of points at which to evaluate the interpolation.
    - knots: array of x-values where the interpolator changes ('knots').
    - coeffs: array of coefficients for each polynomial segment. For a linear
      segment, coeffs[..., 1] is the y-value at the start of the segment, and
      coeffs[..., 0] is the slope of the segment.

    Assumes the first dimension of knots and coeffs correspond to different interpolators,
    and that knots for each interpolator are sorted.
    """
    # Make sure these are all cupy arrays
    rho_cp = cp.asarray(rho, dtype=np.float32)
    knots_cp = cp.asarray(knots, dtype=np.float32)
    coeffs_cp = cp.asarray(coeffs, dtype=np.float32)
    y_cp = cp.empty(rho_cp.size, dtype=np.float32)

    # Do it all in the CUDA kernel
    threads_per_block = 256
    blocks_per_grid = (rho_cp.size + (threads_per_block - 1)) // threads_per_block
    bulk_interpolation_kernel((blocks_per_grid,), (threads_per_block,), (rho_cp, knots_cp, coeffs_cp, y_cp, knots_cp.shape[0], knots_cp.shape[1], coeffs_cp.shape[2]))

    return y_cp


 if __name__=="__main__":
    # Set up a test problem
    N_bins = 300                    # Bins for the interpolator
    N_psr, N_freqs = 100, 15        # Pulsars and frequency modes

    N_pars = int(0.5 * N_freqs * N_psr * (N_psr+1))
    print("Optimal counting:", N_pars)              # Output number of parameters of Sigma
    x_min, x_max = -10, 10                          # Range of parameters (just for demo)
    spread = 1                                      # Spread of mean values (should get posterior from KDE)
    mu = np.random.randn(N_pars)*spread             # Mean of posterior (should get posterior from KDE)
    sigma = 1.0                                     # Standard deviation of posterior ('')
    dist = sstats.norm(loc=0, scale=spread)

    print("Generating interpolators")
    x_vals = np.linspace(x_min, x_max, N_bins, endpoint=True)           # Knots of the interpolators
    y_vals = dist.pdf(x_vals[None,:]-mu[:,None])                        # PDF values at knots
    f_ys = [sint.interp1d(x_vals, y, kind='linear') for y in y_vals]    # Create linear interpolators

    # Convert splines and get knots and coefficients (could do higher order)
    for f_y in f_ys:
        linear_interp_to_spline(f_y)

    # Because we only included the 'c' for a linear interpolator, the 'order' below here
    # will not go to order=3, but only as high as is available
    knots, coeffs = get_spline_knots_coeffs(f_ys, order=3)
    rho = np.random.rand(N_pars) * 20 - 10

    # Move all these quantities to the GPU
    rho_cp = cp.asarray(rho, dtype=np.float32)
    knots_cp = cp.asarray(knots, dtype=np.float32)
    coeffs_cp = cp.asarray(coeffs, dtype=np.float32)

    print("Calculating traditionally with scipy")
    logp_vals_traditional = [float(f_y(rho_val)) for (f_y, rho_val) in zip(f_ys, rho)]

    print("Calculating bulk")
    logp_vals_fast = polynomial_spline_eval_bulk(rho, knots, coeffs)

    print("Calculating with cupy")
    logp_vals_cupy = polynomial_spline_eval_bulk_cupy(rho_cp, knots_cp, coeffs_cp)

    print("Calculating with custom cuda kernel")
    logp_vals_cuda = polynomial_spline_eval_bulk_cuda(rho_cp, knots_cp, coeffs_cp)

    print("Calculating with full custom cuda kernel")
    logp_vals_custom = polynomial_spline_eval_custom_cuda_kernel(rho_cp, knots_cp, coeffs_cp)

    # Compare the values
    print(logp_vals_traditional[:4])
    print(logp_vals_fast[:4])
    print(logp_vals_cupy[:4])
    print(logp_vals_cuda[:4])
    print(logp_vals_custom[:4])
diff --git a/gistfile2.txt b/gistfile2.txt
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 # vim: tabstop=4:softtabstop=4:shiftwidth=4:expandtab

 """
 metal_interpolation: Fast bulk interpolation using numpy and Apple Metal

 author:     Rutger van Haasteren
 email:      [email protected]
 date:       March, 2024
 """

 import numpy as np
 import array
 import scipy.interpolate as sint
 import scipy.stats as sstats
 import metalcompute as mc

 # Metal shader code as a string
 metal_shader = """
 #include <metal_stdlib>
 using namespace metal;

 struct ScalarParams {
    uint n_rhos;
    uint n_knots;
    uint n_order;
 };

 kernel void bulk_interpolation_kernel(
    device const float* rho [[buffer(0)]],
    device const float* knots [[buffer(1)]],
    device const float* coeffs [[buffer(2)]],
    device float* y [[buffer(3)]],
    constant ScalarParams& params [[buffer(4)]],
    uint grid_position [[thread_position_in_grid]]
 ) {
    uint n_rhos = params.n_rhos;
    uint n_knots = params.n_knots;
    uint n_order = params.n_order;

    if (grid_position >= n_rhos) return;

    float rho_val = rho[grid_position];
    const device float* row_knots = knots + grid_position * n_knots;
    const device float* row_coeffs = coeffs + grid_position * (n_knots-1) * n_order;

    int left = 0;
    int right = n_knots - 1;
    while (left <= right) {
        int mid = left + (right - left) / 2;
        if (row_knots[mid] <= rho_val) {
            left = mid + 1;
        } else {
            right = mid - 1;
        }
    }
    int left_index = max(0, left - 1);

    /* Interpolation */
    float delta = rho_val - row_knots[left_index];
    float multiply = 1.0f;
    y[grid_position] = 0.0f;

    for(int order = n_order - 1; order >= 0; --order) {
        y[grid_position] += row_coeffs[left_index * n_order + order] * multiply;
        multiply *= delta;
    }
 }
 """
 metal_dev = mc.Device()
 metal_interpolation_kernel = metal_dev.kernel(metal_shader).function('bulk_interpolation_kernel')


 def linear_interp_to_spline(spline_func):
    """Convert a scipy.interp1d linear representation to spline rep"""
    delta_x = spline_func.x[1:] - spline_func.x[:-1]
    delta_y = spline_func.y[1:] - spline_func.y[:-1]

    spline_func.c = np.stack([delta_y/delta_x, spline_func.y[:-1]], axis=0)

 def get_spline_knots_coeffs(spline_funcs, order=3):
    """Get the knots and coefficients of the splines"""
    coeffs = np.stack([s.c.T for s in spline_funcs], axis=0)
    knots = np.stack([s.x for s in spline_funcs], axis=0)

    return knots, coeffs[:,:,:order]

 def find_segment_indices(rho, knots):
    """Find the segment indices for each rho value in its corresponding set of knots."""
    # Expand rho to match knots shape for broadcasting
    rho_expanded = rho[:, None]

    # Find the knot indices
    is_less_equal = rho_expanded <= knots
    right_knot_indices = np.argmax(is_less_equal, axis=1)
    left_knot_indices = right_knot_indices - 1

    # Adjust indices to ensure they are within bounds
    return np.clip(left_knot_indices, 0, knots.shape[1] - 2)

 def polynomial_spline_eval_bulk(rho, knots, coeffs):
    """Efficient polynomial interpolation using vectorized operations.

    Parameters:
    ----------
    - rho: array of points at which to evaluate the interpolation.
    - knots: array of x-values where the interpolator changes ('knots').
    - coeffs: array of coefficients for each polynomial segment. For a linear
      segment, coeffs[..., 1] is the y-value at the start of the segment, and
      coeffs[..., 0] is the slope of the segment.

    Assumes the first dimension of knots and coeffs correspond to different interpolators,
    and that knots for each interpolator are sorted.
    """

    # Find the index of the left knot for each rho
    left_knot_indices = find_segment_indices(rho, knots)

    # Gather the coefficients for the relevant segment
    segment_coeffs = np.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1)

    # Gather the x values for the left knots
    x0 = np.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten()

    # Evaluate the polynomial at rho
    delta = rho - x0
    y = np.sum(segment_coeffs * delta[..., None] ** np.arange(segment_coeffs.shape[1])[::-1], axis=-1)

    return y


 def polynomial_spline_eval_bulk_metal(rho, knots_buf, coeffs_buf, n_rhos, n_knots, n_order):
    """Efficient polynomial interpolation using custom GPU operations.
    This one is identical to polynomial_spline_eval_bulk_cupy, except
    that the binary tree search is done with a custom CUDA kernel here

    Parameters:
    ----------
    - rho: array of points at which to evaluate the interpolation.
    - knots: array of x-values where the interpolator changes ('knots').
    - coeffs: array of coefficients for each polynomial segment. For a linear
      segment, coeffs[..., 1] is the y-value at the start of the segment, and
      coeffs[..., 0] is the slope of the segment.

    Assumes the first dimension of knots and coeffs correspond to different interpolators,
    and that knots for each interpolator are sorted.
    """
    # Convert data to suitable format
    rho = rho.astype(np.float32, copy=False)

    ivals = metal_dev.buffer(n_rhos * 4) # 4 because a float32 needs 4 bytes 
    ivals_view = memoryview(ivals).cast('f')

    # Parameters for the kernel
    scalar_params = np.array([n_rhos, n_knots, n_order], dtype=np.uint32)

    # Execute the kernel
    metal_interpolation_kernel(n_rhos, rho, knots_buf, coeffs_buf, ivals, scalar_params)

    return np.frombuffer(ivals_view, dtype=np.float32)


 if __name__=="__main__":
    # Set up a test problem
    N_bins = 300                    # Bins for the interpolator
    N_psr, N_freqs = 100, 15        # Pulsars and frequency modes

    N_pars = int(0.5 * N_freqs * N_psr * (N_psr+1))
    print("Optimal counting:", N_pars)              # Output number of parameters of Sigma
    x_min, x_max = -10, 10                          # Range of parameters (just for demo)
    spread = 1                                      # Spread of mean values (should get posterior from KDE)
    mu = np.random.randn(N_pars)*spread             # Mean of posterior (should get posterior from KDE)
    sigma = 1.0                                     # Standard deviation of posterior ('')
    dist = sstats.norm(loc=0, scale=spread)

    print("Generating interpolators")
    x_vals = np.linspace(x_min, x_max, N_bins, endpoint=True)           # Knots of the interpolators
    y_vals = dist.pdf(x_vals[None,:]-mu[:,None])                        # PDF values at knots
    f_ys = [sint.interp1d(x_vals, y, kind='linear') for y in y_vals]    # Create linear interpolators

    # Convert splines and get knots and coefficients (could do higher order)
    for f_y in f_ys:
        linear_interp_to_spline(f_y)

    # Because we only included the 'c' for a linear interpolator, the 'order' below here
    # will not go to order=3, but only as high as is available
    knots, coeffs = get_spline_knots_coeffs(f_ys, order=3)
    rho = np.random.rand(N_pars) * 20 - 10

    rho = rho.astype(np.float32)
    knots = knots.astype(np.float32)
    coeffs = coeffs.astype(np.float32)

    # Move these quantities to the GPU
    knots_buf = metal_dev.buffer(knots)
    coeffs_buf = metal_dev.buffer(coeffs.flatten())

    n_rhos, n_knots, n_order = rho.shape[0], knots.shape[1], coeffs.shape[2]


    print("Calculating traditionally with scipy")
    logp_vals_traditional = [float(f_y(rho_val)) for (f_y, rho_val) in zip(f_ys, rho)]

    print("Calculating bulk")
    logp_vals_fast = polynomial_spline_eval_bulk(rho, knots, coeffs)

    print("Calculating with full custom Metal kernel")
    logp_vals_metal = polynomial_spline_eval_bulk_metal(rho, knots_buf, coeffs_buf, n_rhos, n_knots, n_order)

    # Compare the values
    print(logp_vals_traditional[:4])
    print(logp_vals_fast[:4])
    print(logp_vals_metal[:4])
	#!/usr/bin/python
	# -- coding: utf-8 --
	# vim: tabstop=4:softtabstop=4:shiftwidth=4:expandtab

	"""
	cuda_interpolation: Fast bulk interpolation using numpy, cuda, and cupy

	author: Rutger van Haasteren
	email: [email protected]
	date: March, 2024
	"""

	import numpy as np
	import scipy.interpolate as sint
	import scipy.stats as sstats
	import cupy as cp
	import os

	# This is the code for a CUDA kernel that does a parallel binary search
	kernel_code = '''
	extern "C"
	__global__ void binary_search_kernel(const float* rho, const float* knots, int* left_indices, int n_rhos, int n_knots) {
	int rho_idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (rho_idx >= n_rhos) return;

	float rho_val = rho[rho_idx];
	const float* row_knots = knots + rho_idx * n_knots;

	int left = 0;
	int right = n_knots - 1;
	int mid = 0;
	while (left <= right) {
	mid = left + (right - left) / 2;
	if (row_knots[mid] <= rho_val) {
	left = mid + 1;
	} else {
	right = mid - 1;
	}
	}
	left_indices[rho_idx] = max(0, left - 1);
	}
	'''

	# This is the code for a CUDA kernel that does the full parallel interpolation
	bulk_interpolation_kernel_code = '''
	extern "C"
	__global__ void bulk_interpolation_kernel(const float* rho, const float* knots, const float* coeffs, float* y, int n_rhos, int n_knots, int n_order) {
	int rho_idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (rho_idx >= n_rhos) return;

	float rho_val = rho[rho_idx];
	const float* row_knots = knots + rho_idx * n_knots;
	const float* row_coeffs = coeffs + rho_idx * (n_knots-1) * n_order;

	int left = 0;
	int right = n_knots - 1;
	int mid = 0;
	int left_index = 0;
	while (left <= right) {
	mid = left + (right - left) / 2;
	if (row_knots[mid] <= rho_val) {
	left = mid + 1;
	} else {
	right = mid - 1;
	}
	}
	left_index = max(0, left - 1);

	/* We have the left-indices. Now do the interpolation */
	float delta = rho_val - row_knots[left_index];
	float multiply = 1.0;
	int order = 0;
	y[rho_idx] = 0;

	for(order=n_order-1; order>=0; order--) {
	y[rho_idx] += row_coeffs[left_indexn_order+order] multiply;
	multiply = multiply * delta;
	}
	}
	'''

	# Compile the CUDA kernels
	binary_search_kernel = cp.RawKernel(kernel_code, 'binary_search_kernel')
	bulk_interpolation_kernel = cp.RawKernel(bulk_interpolation_kernel_code, 'bulk_interpolation_kernel')

	# The custom CUDA kernel for the bulk binary tree search

	def linear_interp_to_spline(spline_func):
	"""Convert a scipy.interp1d linear representation to spline rep"""
	delta_x = spline_func.x[1:] - spline_func.x[:-1]
	delta_y = spline_func.y[1:] - spline_func.y[:-1]

	spline_func.c = np.stack([delta_y/delta_x, spline_func.y[:-1]], axis=0)

	def get_spline_knots_coeffs(spline_funcs, order=3):
	"""Get the knots and coefficients of the splines"""
	coeffs = np.stack([s.c.T for s in spline_funcs], axis=0)
	knots = np.stack([s.x for s in spline_funcs], axis=0)

	return knots, coeffs[:,:,:order]

	def find_segment_indices(rho, knots):
	"""Find the segment indices for each rho value in its corresponding set of knots."""
	# Expand rho to match knots shape for broadcasting
	rho_expanded = rho[:, None]

	# Find the knot indices
	is_less_equal = rho_expanded <= knots
	right_knot_indices = np.argmax(is_less_equal, axis=1)
	left_knot_indices = right_knot_indices - 1

	# Adjust indices to ensure they are within bounds
	return np.clip(left_knot_indices, 0, knots.shape[1] - 2)

	def find_segment_indices_cupy(rho, knots):
	"""Find the segment indices for each rho value in its corresponding set of knots using CuPy."""
	rho_expanded = cp.asarray(rho, dtype=np.float32)[:, None]
	knots = cp.asarray(knots, dtype=np.float32)

	# Find the knot indices
	is_less_equal = rho_expanded <= knots
	right_knot_indices = cp.argmax(is_less_equal, axis=1)
	left_knot_indices = right_knot_indices - 1

	return cp.clip(left_knot_indices, 0, knots.shape[1] - 2)

	def find_segment_indices_cuda(rho, knots):
	"""Find the segment indices for each rho value in its corresponding set of knots using CuPy, optimized with binary search."""
	# Ensure rho and knots are CuPy arrays, single precision
	rho_cp = cp.asarray(rho, dtype=np.float32)
	knots_cp = cp.asarray(knots, dtype=np.float32)

	# Prepare output array for indices
	left_knot_indices = cp.empty(rho_cp.size, dtype=cp.int32)

	# Launch the custom kernel to find the knot indices
	threads_per_block = 256
	blocks_per_grid = (rho_cp.size + (threads_per_block - 1)) // threads_per_block
	binary_search_kernel((blocks_per_grid,), (threads_per_block,), (rho_cp, knots_cp, left_knot_indices, knots_cp.shape[0], knots_cp.shape[1]))

	return left_knot_indices

	def polynomial_spline_eval_bulk(rho, knots, coeffs):
	"""Efficient polynomial interpolation using vectorized operations.

	Parameters:
	----------
	- rho: array of points at which to evaluate the interpolation.
	- knots: array of x-values where the interpolator changes ('knots').
	- coeffs: array of coefficients for each polynomial segment. For a linear
	segment, coeffs[..., 1] is the y-value at the start of the segment, and
	coeffs[..., 0] is the slope of the segment.

	Assumes the first dimension of knots and coeffs correspond to different interpolators,
	and that knots for each interpolator are sorted.
	"""

	# Find the index of the left knot for each rho
	left_knot_indices = find_segment_indices(rho, knots)

	# Gather the coefficients for the relevant segment
	segment_coeffs = np.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1)

	# Gather the x values for the left knots
	x0 = np.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten()

	# Evaluate the polynomial at rho
	delta = rho - x0
	y = np.sum(segment_coeffs * delta[..., None] ** np.arange(segment_coeffs.shape[1])[::-1], axis=-1)

	return y

	def polynomial_spline_eval_bulk_cupy(rho, knots, coeffs):
	"""Efficient polynomial interpolation using GPU operations. Make sure that the
	input arrays are already on the GPU in np.float32 for efficiency

	Parameters:
	----------
	- rho: array of points at which to evaluate the interpolation.
	- knots: array of x-values where the interpolator changes ('knots').
	- coeffs: array of coefficients for each polynomial segment. For a linear
	segment, coeffs[..., 1] is the y-value at the start of the segment, and
	coeffs[..., 0] is the slope of the segment.

	Assumes the first dimension of knots and coeffs correspond to different interpolators,
	and that knots for each interpolator are sorted.
	"""
	# Make sure these are all cupy arrays
	rho = cp.asarray(rho, dtype=np.float32)
	knots = cp.asarray(knots, dtype=np.float32)
	coeffs = cp.asarray(coeffs, dtype=np.float32)

	# Find the index of the left knot for each rho
	left_knot_indices = find_segment_indices_cupy(rho, knots)

	# Ensuring that the indexing matches the NumPy approach more closely
	segment_coeffs = cp.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1)

	# Gather the x values for the left knots
	x0 = cp.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten()

	# Do the interpolation step
	delta = rho - x0
	powers = cp.arange(segment_coeffs.shape[1])[::-1]
	delta_powers = delta[..., None] ** powers
	y = cp.sum(segment_coeffs * delta_powers, axis=-1)

	return y

	def polynomial_spline_eval_bulk_cuda(rho, knots, coeffs):
	"""Efficient polynomial interpolation using custom GPU operations.
	This one is identical to polynomial_spline_eval_bulk_cupy, except
	that the binary tree search is done with a custom CUDA kernel here

	Parameters:
	----------
	- rho: array of points at which to evaluate the interpolation.
	- knots: array of x-values where the interpolator changes ('knots').
	- coeffs: array of coefficients for each polynomial segment. For a linear
	segment, coeffs[..., 1] is the y-value at the start of the segment, and
	coeffs[..., 0] is the slope of the segment.

	Assumes the first dimension of knots and coeffs correspond to different interpolators,
	and that knots for each interpolator are sorted.
	"""
	# Make sure these are all cupy arrays
	rho = cp.asarray(rho, dtype=np.float32)
	knots = cp.asarray(knots, dtype=np.float32)
	coeffs = cp.asarray(coeffs, dtype=np.float32)

	# Find the index of the left knot for each rho
	left_knot_indices = find_segment_indices_cuda(rho, knots)

	# Ensuring that the indexing matches the NumPy approach more closely
	segment_coeffs = cp.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1)

	# Gather the x values for the left knots
	x0 = cp.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten()

	# Do the interpolation step
	delta = rho - x0
	powers = cp.arange(segment_coeffs.shape[1])[::-1]
	delta_powers = delta[..., None] ** powers
	y = cp.sum(segment_coeffs * delta_powers, axis=-1)

	return y

	def polynomial_spline_eval_custom_cuda_kernel(rho, knots, coeffs):
	"""Efficient polynomial interpolation using custom GPU operations.
	This one is identical to polynomial_spline_eval_bulk_cupy, except
	that the binary tree search is done with a custom CUDA kernel here

	Parameters:
	----------
	- rho: array of points at which to evaluate the interpolation.
	- knots: array of x-values where the interpolator changes ('knots').
	- coeffs: array of coefficients for each polynomial segment. For a linear
	segment, coeffs[..., 1] is the y-value at the start of the segment, and
	coeffs[..., 0] is the slope of the segment.

	Assumes the first dimension of knots and coeffs correspond to different interpolators,
	and that knots for each interpolator are sorted.
	"""
	# Make sure these are all cupy arrays
	rho_cp = cp.asarray(rho, dtype=np.float32)
	knots_cp = cp.asarray(knots, dtype=np.float32)
	coeffs_cp = cp.asarray(coeffs, dtype=np.float32)
	y_cp = cp.empty(rho_cp.size, dtype=np.float32)

	# Do it all in the CUDA kernel
	threads_per_block = 256
	blocks_per_grid = (rho_cp.size + (threads_per_block - 1)) // threads_per_block
	bulk_interpolation_kernel((blocks_per_grid,), (threads_per_block,), (rho_cp, knots_cp, coeffs_cp, y_cp, knots_cp.shape[0], knots_cp.shape[1], coeffs_cp.shape[2]))

	return y_cp


	if __name__=="__main__":
	# Set up a test problem
	N_bins = 300 # Bins for the interpolator
	N_psr, N_freqs = 100, 15 # Pulsars and frequency modes

	N_pars = int(0.5 * N_freqs * N_psr * (N_psr+1))
	print("Optimal counting:", N_pars) # Output number of parameters of Sigma
	x_min, x_max = -10, 10 # Range of parameters (just for demo)
	spread = 1 # Spread of mean values (should get posterior from KDE)
	mu = np.random.randn(N_pars)*spread # Mean of posterior (should get posterior from KDE)
	sigma = 1.0 # Standard deviation of posterior ('')
	dist = sstats.norm(loc=0, scale=spread)

	print("Generating interpolators")
	x_vals = np.linspace(x_min, x_max, N_bins, endpoint=True) # Knots of the interpolators
	y_vals = dist.pdf(x_vals[None,:]-mu[:,None]) # PDF values at knots
	f_ys = [sint.interp1d(x_vals, y, kind='linear') for y in y_vals] # Create linear interpolators

	# Convert splines and get knots and coefficients (could do higher order)
	for f_y in f_ys:
	linear_interp_to_spline(f_y)

	# Because we only included the 'c' for a linear interpolator, the 'order' below here
	# will not go to order=3, but only as high as is available
	knots, coeffs = get_spline_knots_coeffs(f_ys, order=3)
	rho = np.random.rand(N_pars) * 20 - 10

	# Move all these quantities to the GPU
	rho_cp = cp.asarray(rho, dtype=np.float32)
	knots_cp = cp.asarray(knots, dtype=np.float32)
	coeffs_cp = cp.asarray(coeffs, dtype=np.float32)

	print("Calculating traditionally with scipy")
	logp_vals_traditional = [float(f_y(rho_val)) for (f_y, rho_val) in zip(f_ys, rho)]

	print("Calculating bulk")
	logp_vals_fast = polynomial_spline_eval_bulk(rho, knots, coeffs)

	print("Calculating with cupy")
	logp_vals_cupy = polynomial_spline_eval_bulk_cupy(rho_cp, knots_cp, coeffs_cp)

	print("Calculating with custom cuda kernel")
	logp_vals_cuda = polynomial_spline_eval_bulk_cuda(rho_cp, knots_cp, coeffs_cp)

	print("Calculating with full custom cuda kernel")
	logp_vals_custom = polynomial_spline_eval_custom_cuda_kernel(rho_cp, knots_cp, coeffs_cp)

	# Compare the values
	print(logp_vals_traditional[:4])
	print(logp_vals_fast[:4])
	print(logp_vals_cupy[:4])
	print(logp_vals_cuda[:4])
	print(logp_vals_custom[:4])