Last active
March 7, 2024 16:12
-
-
Save vhaasteren/1fb63796fcd3cc813a9a00b05af80bf4 to your computer and use it in GitHub Desktop.
Fast bulk interpolation using a CUDA and Apple Metal
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# vim: tabstop=4:softtabstop=4:shiftwidth=4:expandtab | |
""" | |
cuda_interpolation: Fast bulk interpolation using numpy, cuda, and cupy | |
author: Rutger van Haasteren | |
email: [email protected] | |
date: March, 2024 | |
""" | |
import numpy as np | |
import scipy.interpolate as sint | |
import scipy.stats as sstats | |
import cupy as cp | |
import os | |
# This is the code for a CUDA kernel that does a parallel binary search | |
kernel_code = ''' | |
extern "C" | |
__global__ void binary_search_kernel(const float* rho, const float* knots, int* left_indices, int n_rhos, int n_knots) { | |
int rho_idx = blockIdx.x * blockDim.x + threadIdx.x; | |
if (rho_idx >= n_rhos) return; | |
float rho_val = rho[rho_idx]; | |
const float* row_knots = knots + rho_idx * n_knots; | |
int left = 0; | |
int right = n_knots - 1; | |
int mid = 0; | |
while (left <= right) { | |
mid = left + (right - left) / 2; | |
if (row_knots[mid] <= rho_val) { | |
left = mid + 1; | |
} else { | |
right = mid - 1; | |
} | |
} | |
left_indices[rho_idx] = max(0, left - 1); | |
} | |
''' | |
# This is the code for a CUDA kernel that does the full parallel interpolation | |
bulk_interpolation_kernel_code = ''' | |
extern "C" | |
__global__ void bulk_interpolation_kernel(const float* rho, const float* knots, const float* coeffs, float* y, int n_rhos, int n_knots, int n_order) { | |
int rho_idx = blockIdx.x * blockDim.x + threadIdx.x; | |
if (rho_idx >= n_rhos) return; | |
float rho_val = rho[rho_idx]; | |
const float* row_knots = knots + rho_idx * n_knots; | |
const float* row_coeffs = coeffs + rho_idx * (n_knots-1) * n_order; | |
int left = 0; | |
int right = n_knots - 1; | |
int mid = 0; | |
int left_index = 0; | |
while (left <= right) { | |
mid = left + (right - left) / 2; | |
if (row_knots[mid] <= rho_val) { | |
left = mid + 1; | |
} else { | |
right = mid - 1; | |
} | |
} | |
left_index = max(0, left - 1); | |
/* We have the left-indices. Now do the interpolation */ | |
float delta = rho_val - row_knots[left_index]; | |
float multiply = 1.0; | |
int order = 0; | |
y[rho_idx] = 0; | |
for(order=n_order-1; order>=0; order--) { | |
y[rho_idx] += row_coeffs[left_index*n_order+order] * multiply; | |
multiply = multiply * delta; | |
} | |
} | |
''' | |
# Compile the CUDA kernels | |
binary_search_kernel = cp.RawKernel(kernel_code, 'binary_search_kernel') | |
bulk_interpolation_kernel = cp.RawKernel(bulk_interpolation_kernel_code, 'bulk_interpolation_kernel') | |
# The custom CUDA kernel for the bulk binary tree search | |
def linear_interp_to_spline(spline_func): | |
"""Convert a scipy.interp1d linear representation to spline rep""" | |
delta_x = spline_func.x[1:] - spline_func.x[:-1] | |
delta_y = spline_func.y[1:] - spline_func.y[:-1] | |
spline_func.c = np.stack([delta_y/delta_x, spline_func.y[:-1]], axis=0) | |
def get_spline_knots_coeffs(spline_funcs, order=3): | |
"""Get the knots and coefficients of the splines""" | |
coeffs = np.stack([s.c.T for s in spline_funcs], axis=0) | |
knots = np.stack([s.x for s in spline_funcs], axis=0) | |
return knots, coeffs[:,:,:order] | |
def find_segment_indices(rho, knots): | |
"""Find the segment indices for each rho value in its corresponding set of knots.""" | |
# Expand rho to match knots shape for broadcasting | |
rho_expanded = rho[:, None] | |
# Find the knot indices | |
is_less_equal = rho_expanded <= knots | |
right_knot_indices = np.argmax(is_less_equal, axis=1) | |
left_knot_indices = right_knot_indices - 1 | |
# Adjust indices to ensure they are within bounds | |
return np.clip(left_knot_indices, 0, knots.shape[1] - 2) | |
def find_segment_indices_cupy(rho, knots): | |
"""Find the segment indices for each rho value in its corresponding set of knots using CuPy.""" | |
rho_expanded = cp.asarray(rho, dtype=np.float32)[:, None] | |
knots = cp.asarray(knots, dtype=np.float32) | |
# Find the knot indices | |
is_less_equal = rho_expanded <= knots | |
right_knot_indices = cp.argmax(is_less_equal, axis=1) | |
left_knot_indices = right_knot_indices - 1 | |
return cp.clip(left_knot_indices, 0, knots.shape[1] - 2) | |
def find_segment_indices_cuda(rho, knots): | |
"""Find the segment indices for each rho value in its corresponding set of knots using CuPy, optimized with binary search.""" | |
# Ensure rho and knots are CuPy arrays, single precision | |
rho_cp = cp.asarray(rho, dtype=np.float32) | |
knots_cp = cp.asarray(knots, dtype=np.float32) | |
# Prepare output array for indices | |
left_knot_indices = cp.empty(rho_cp.size, dtype=cp.int32) | |
# Launch the custom kernel to find the knot indices | |
threads_per_block = 256 | |
blocks_per_grid = (rho_cp.size + (threads_per_block - 1)) // threads_per_block | |
binary_search_kernel((blocks_per_grid,), (threads_per_block,), (rho_cp, knots_cp, left_knot_indices, knots_cp.shape[0], knots_cp.shape[1])) | |
return left_knot_indices | |
def polynomial_spline_eval_bulk(rho, knots, coeffs): | |
"""Efficient polynomial interpolation using vectorized operations. | |
Parameters: | |
---------- | |
- rho: array of points at which to evaluate the interpolation. | |
- knots: array of x-values where the interpolator changes ('knots'). | |
- coeffs: array of coefficients for each polynomial segment. For a linear | |
segment, coeffs[..., 1] is the y-value at the start of the segment, and | |
coeffs[..., 0] is the slope of the segment. | |
Assumes the first dimension of knots and coeffs correspond to different interpolators, | |
and that knots for each interpolator are sorted. | |
""" | |
# Find the index of the left knot for each rho | |
left_knot_indices = find_segment_indices(rho, knots) | |
# Gather the coefficients for the relevant segment | |
segment_coeffs = np.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1) | |
# Gather the x values for the left knots | |
x0 = np.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten() | |
# Evaluate the polynomial at rho | |
delta = rho - x0 | |
y = np.sum(segment_coeffs * delta[..., None] ** np.arange(segment_coeffs.shape[1])[::-1], axis=-1) | |
return y | |
def polynomial_spline_eval_bulk_cupy(rho, knots, coeffs): | |
"""Efficient polynomial interpolation using GPU operations. Make sure that the | |
input arrays are already on the GPU in np.float32 for efficiency | |
Parameters: | |
---------- | |
- rho: array of points at which to evaluate the interpolation. | |
- knots: array of x-values where the interpolator changes ('knots'). | |
- coeffs: array of coefficients for each polynomial segment. For a linear | |
segment, coeffs[..., 1] is the y-value at the start of the segment, and | |
coeffs[..., 0] is the slope of the segment. | |
Assumes the first dimension of knots and coeffs correspond to different interpolators, | |
and that knots for each interpolator are sorted. | |
""" | |
# Make sure these are all cupy arrays | |
rho = cp.asarray(rho, dtype=np.float32) | |
knots = cp.asarray(knots, dtype=np.float32) | |
coeffs = cp.asarray(coeffs, dtype=np.float32) | |
# Find the index of the left knot for each rho | |
left_knot_indices = find_segment_indices_cupy(rho, knots) | |
# Ensuring that the indexing matches the NumPy approach more closely | |
segment_coeffs = cp.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1) | |
# Gather the x values for the left knots | |
x0 = cp.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten() | |
# Do the interpolation step | |
delta = rho - x0 | |
powers = cp.arange(segment_coeffs.shape[1])[::-1] | |
delta_powers = delta[..., None] ** powers | |
y = cp.sum(segment_coeffs * delta_powers, axis=-1) | |
return y | |
def polynomial_spline_eval_bulk_cuda(rho, knots, coeffs): | |
"""Efficient polynomial interpolation using custom GPU operations. | |
This one is identical to polynomial_spline_eval_bulk_cupy, except | |
that the binary tree search is done with a custom CUDA kernel here | |
Parameters: | |
---------- | |
- rho: array of points at which to evaluate the interpolation. | |
- knots: array of x-values where the interpolator changes ('knots'). | |
- coeffs: array of coefficients for each polynomial segment. For a linear | |
segment, coeffs[..., 1] is the y-value at the start of the segment, and | |
coeffs[..., 0] is the slope of the segment. | |
Assumes the first dimension of knots and coeffs correspond to different interpolators, | |
and that knots for each interpolator are sorted. | |
""" | |
# Make sure these are all cupy arrays | |
rho = cp.asarray(rho, dtype=np.float32) | |
knots = cp.asarray(knots, dtype=np.float32) | |
coeffs = cp.asarray(coeffs, dtype=np.float32) | |
# Find the index of the left knot for each rho | |
left_knot_indices = find_segment_indices_cuda(rho, knots) | |
# Ensuring that the indexing matches the NumPy approach more closely | |
segment_coeffs = cp.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1) | |
# Gather the x values for the left knots | |
x0 = cp.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten() | |
# Do the interpolation step | |
delta = rho - x0 | |
powers = cp.arange(segment_coeffs.shape[1])[::-1] | |
delta_powers = delta[..., None] ** powers | |
y = cp.sum(segment_coeffs * delta_powers, axis=-1) | |
return y | |
def polynomial_spline_eval_custom_cuda_kernel(rho, knots, coeffs): | |
"""Efficient polynomial interpolation using custom GPU operations. | |
This one is identical to polynomial_spline_eval_bulk_cupy, except | |
that the binary tree search is done with a custom CUDA kernel here | |
Parameters: | |
---------- | |
- rho: array of points at which to evaluate the interpolation. | |
- knots: array of x-values where the interpolator changes ('knots'). | |
- coeffs: array of coefficients for each polynomial segment. For a linear | |
segment, coeffs[..., 1] is the y-value at the start of the segment, and | |
coeffs[..., 0] is the slope of the segment. | |
Assumes the first dimension of knots and coeffs correspond to different interpolators, | |
and that knots for each interpolator are sorted. | |
""" | |
# Make sure these are all cupy arrays | |
rho_cp = cp.asarray(rho, dtype=np.float32) | |
knots_cp = cp.asarray(knots, dtype=np.float32) | |
coeffs_cp = cp.asarray(coeffs, dtype=np.float32) | |
y_cp = cp.empty(rho_cp.size, dtype=np.float32) | |
# Do it all in the CUDA kernel | |
threads_per_block = 256 | |
blocks_per_grid = (rho_cp.size + (threads_per_block - 1)) // threads_per_block | |
bulk_interpolation_kernel((blocks_per_grid,), (threads_per_block,), (rho_cp, knots_cp, coeffs_cp, y_cp, knots_cp.shape[0], knots_cp.shape[1], coeffs_cp.shape[2])) | |
return y_cp | |
if __name__=="__main__": | |
# Set up a test problem | |
N_bins = 300 # Bins for the interpolator | |
N_psr, N_freqs = 100, 15 # Pulsars and frequency modes | |
N_pars = int(0.5 * N_freqs * N_psr * (N_psr+1)) | |
print("Optimal counting:", N_pars) # Output number of parameters of Sigma | |
x_min, x_max = -10, 10 # Range of parameters (just for demo) | |
spread = 1 # Spread of mean values (should get posterior from KDE) | |
mu = np.random.randn(N_pars)*spread # Mean of posterior (should get posterior from KDE) | |
sigma = 1.0 # Standard deviation of posterior ('') | |
dist = sstats.norm(loc=0, scale=spread) | |
print("Generating interpolators") | |
x_vals = np.linspace(x_min, x_max, N_bins, endpoint=True) # Knots of the interpolators | |
y_vals = dist.pdf(x_vals[None,:]-mu[:,None]) # PDF values at knots | |
f_ys = [sint.interp1d(x_vals, y, kind='linear') for y in y_vals] # Create linear interpolators | |
# Convert splines and get knots and coefficients (could do higher order) | |
for f_y in f_ys: | |
linear_interp_to_spline(f_y) | |
# Because we only included the 'c' for a linear interpolator, the 'order' below here | |
# will not go to order=3, but only as high as is available | |
knots, coeffs = get_spline_knots_coeffs(f_ys, order=3) | |
rho = np.random.rand(N_pars) * 20 - 10 | |
# Move all these quantities to the GPU | |
rho_cp = cp.asarray(rho, dtype=np.float32) | |
knots_cp = cp.asarray(knots, dtype=np.float32) | |
coeffs_cp = cp.asarray(coeffs, dtype=np.float32) | |
print("Calculating traditionally with scipy") | |
logp_vals_traditional = [float(f_y(rho_val)) for (f_y, rho_val) in zip(f_ys, rho)] | |
print("Calculating bulk") | |
logp_vals_fast = polynomial_spline_eval_bulk(rho, knots, coeffs) | |
print("Calculating with cupy") | |
logp_vals_cupy = polynomial_spline_eval_bulk_cupy(rho_cp, knots_cp, coeffs_cp) | |
print("Calculating with custom cuda kernel") | |
logp_vals_cuda = polynomial_spline_eval_bulk_cuda(rho_cp, knots_cp, coeffs_cp) | |
print("Calculating with full custom cuda kernel") | |
logp_vals_custom = polynomial_spline_eval_custom_cuda_kernel(rho_cp, knots_cp, coeffs_cp) | |
# Compare the values | |
print(logp_vals_traditional[:4]) | |
print(logp_vals_fast[:4]) | |
print(logp_vals_cupy[:4]) | |
print(logp_vals_cuda[:4]) | |
print(logp_vals_custom[:4]) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# vim: tabstop=4:softtabstop=4:shiftwidth=4:expandtab | |
""" | |
metal_interpolation: Fast bulk interpolation using numpy and Apple Metal | |
author: Rutger van Haasteren | |
email: [email protected] | |
date: March, 2024 | |
""" | |
import numpy as np | |
import array | |
import scipy.interpolate as sint | |
import scipy.stats as sstats | |
import metalcompute as mc | |
# Metal shader code as a string | |
metal_shader = """ | |
#include <metal_stdlib> | |
using namespace metal; | |
struct ScalarParams { | |
uint n_rhos; | |
uint n_knots; | |
uint n_order; | |
}; | |
kernel void bulk_interpolation_kernel( | |
device const float* rho [[buffer(0)]], | |
device const float* knots [[buffer(1)]], | |
device const float* coeffs [[buffer(2)]], | |
device float* y [[buffer(3)]], | |
constant ScalarParams& params [[buffer(4)]], | |
uint grid_position [[thread_position_in_grid]] | |
) { | |
uint n_rhos = params.n_rhos; | |
uint n_knots = params.n_knots; | |
uint n_order = params.n_order; | |
if (grid_position >= n_rhos) return; | |
float rho_val = rho[grid_position]; | |
const device float* row_knots = knots + grid_position * n_knots; | |
const device float* row_coeffs = coeffs + grid_position * (n_knots-1) * n_order; | |
int left = 0; | |
int right = n_knots - 1; | |
while (left <= right) { | |
int mid = left + (right - left) / 2; | |
if (row_knots[mid] <= rho_val) { | |
left = mid + 1; | |
} else { | |
right = mid - 1; | |
} | |
} | |
int left_index = max(0, left - 1); | |
/* Interpolation */ | |
float delta = rho_val - row_knots[left_index]; | |
float multiply = 1.0f; | |
y[grid_position] = 0.0f; | |
for(int order = n_order - 1; order >= 0; --order) { | |
y[grid_position] += row_coeffs[left_index * n_order + order] * multiply; | |
multiply *= delta; | |
} | |
} | |
""" | |
metal_dev = mc.Device() | |
metal_interpolation_kernel = metal_dev.kernel(metal_shader).function('bulk_interpolation_kernel') | |
def linear_interp_to_spline(spline_func): | |
"""Convert a scipy.interp1d linear representation to spline rep""" | |
delta_x = spline_func.x[1:] - spline_func.x[:-1] | |
delta_y = spline_func.y[1:] - spline_func.y[:-1] | |
spline_func.c = np.stack([delta_y/delta_x, spline_func.y[:-1]], axis=0) | |
def get_spline_knots_coeffs(spline_funcs, order=3): | |
"""Get the knots and coefficients of the splines""" | |
coeffs = np.stack([s.c.T for s in spline_funcs], axis=0) | |
knots = np.stack([s.x for s in spline_funcs], axis=0) | |
return knots, coeffs[:,:,:order] | |
def find_segment_indices(rho, knots): | |
"""Find the segment indices for each rho value in its corresponding set of knots.""" | |
# Expand rho to match knots shape for broadcasting | |
rho_expanded = rho[:, None] | |
# Find the knot indices | |
is_less_equal = rho_expanded <= knots | |
right_knot_indices = np.argmax(is_less_equal, axis=1) | |
left_knot_indices = right_knot_indices - 1 | |
# Adjust indices to ensure they are within bounds | |
return np.clip(left_knot_indices, 0, knots.shape[1] - 2) | |
def polynomial_spline_eval_bulk(rho, knots, coeffs): | |
"""Efficient polynomial interpolation using vectorized operations. | |
Parameters: | |
---------- | |
- rho: array of points at which to evaluate the interpolation. | |
- knots: array of x-values where the interpolator changes ('knots'). | |
- coeffs: array of coefficients for each polynomial segment. For a linear | |
segment, coeffs[..., 1] is the y-value at the start of the segment, and | |
coeffs[..., 0] is the slope of the segment. | |
Assumes the first dimension of knots and coeffs correspond to different interpolators, | |
and that knots for each interpolator are sorted. | |
""" | |
# Find the index of the left knot for each rho | |
left_knot_indices = find_segment_indices(rho, knots) | |
# Gather the coefficients for the relevant segment | |
segment_coeffs = np.take_along_axis(coeffs, left_knot_indices[:, None, None], axis=1).squeeze(axis=1) | |
# Gather the x values for the left knots | |
x0 = np.take_along_axis(knots, left_knot_indices[:, None], axis=1).flatten() | |
# Evaluate the polynomial at rho | |
delta = rho - x0 | |
y = np.sum(segment_coeffs * delta[..., None] ** np.arange(segment_coeffs.shape[1])[::-1], axis=-1) | |
return y | |
def polynomial_spline_eval_bulk_metal(rho, knots_buf, coeffs_buf, n_rhos, n_knots, n_order): | |
"""Efficient polynomial interpolation using custom GPU operations. | |
This one is identical to polynomial_spline_eval_bulk_cupy, except | |
that the binary tree search is done with a custom CUDA kernel here | |
Parameters: | |
---------- | |
- rho: array of points at which to evaluate the interpolation. | |
- knots: array of x-values where the interpolator changes ('knots'). | |
- coeffs: array of coefficients for each polynomial segment. For a linear | |
segment, coeffs[..., 1] is the y-value at the start of the segment, and | |
coeffs[..., 0] is the slope of the segment. | |
Assumes the first dimension of knots and coeffs correspond to different interpolators, | |
and that knots for each interpolator are sorted. | |
""" | |
# Convert data to suitable format | |
rho = rho.astype(np.float32, copy=False) | |
ivals = metal_dev.buffer(n_rhos * 4) # 4 because a float32 needs 4 bytes | |
ivals_view = memoryview(ivals).cast('f') | |
# Parameters for the kernel | |
scalar_params = np.array([n_rhos, n_knots, n_order], dtype=np.uint32) | |
# Execute the kernel | |
metal_interpolation_kernel(n_rhos, rho, knots_buf, coeffs_buf, ivals, scalar_params) | |
return np.frombuffer(ivals_view, dtype=np.float32) | |
if __name__=="__main__": | |
# Set up a test problem | |
N_bins = 300 # Bins for the interpolator | |
N_psr, N_freqs = 100, 15 # Pulsars and frequency modes | |
N_pars = int(0.5 * N_freqs * N_psr * (N_psr+1)) | |
print("Optimal counting:", N_pars) # Output number of parameters of Sigma | |
x_min, x_max = -10, 10 # Range of parameters (just for demo) | |
spread = 1 # Spread of mean values (should get posterior from KDE) | |
mu = np.random.randn(N_pars)*spread # Mean of posterior (should get posterior from KDE) | |
sigma = 1.0 # Standard deviation of posterior ('') | |
dist = sstats.norm(loc=0, scale=spread) | |
print("Generating interpolators") | |
x_vals = np.linspace(x_min, x_max, N_bins, endpoint=True) # Knots of the interpolators | |
y_vals = dist.pdf(x_vals[None,:]-mu[:,None]) # PDF values at knots | |
f_ys = [sint.interp1d(x_vals, y, kind='linear') for y in y_vals] # Create linear interpolators | |
# Convert splines and get knots and coefficients (could do higher order) | |
for f_y in f_ys: | |
linear_interp_to_spline(f_y) | |
# Because we only included the 'c' for a linear interpolator, the 'order' below here | |
# will not go to order=3, but only as high as is available | |
knots, coeffs = get_spline_knots_coeffs(f_ys, order=3) | |
rho = np.random.rand(N_pars) * 20 - 10 | |
rho = rho.astype(np.float32) | |
knots = knots.astype(np.float32) | |
coeffs = coeffs.astype(np.float32) | |
# Move these quantities to the GPU | |
knots_buf = metal_dev.buffer(knots) | |
coeffs_buf = metal_dev.buffer(coeffs.flatten()) | |
n_rhos, n_knots, n_order = rho.shape[0], knots.shape[1], coeffs.shape[2] | |
print("Calculating traditionally with scipy") | |
logp_vals_traditional = [float(f_y(rho_val)) for (f_y, rho_val) in zip(f_ys, rho)] | |
print("Calculating bulk") | |
logp_vals_fast = polynomial_spline_eval_bulk(rho, knots, coeffs) | |
print("Calculating with full custom Metal kernel") | |
logp_vals_metal = polynomial_spline_eval_bulk_metal(rho, knots_buf, coeffs_buf, n_rhos, n_knots, n_order) | |
# Compare the values | |
print(logp_vals_traditional[:4]) | |
print(logp_vals_fast[:4]) | |
print(logp_vals_metal[:4]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment