Last active
April 28, 2025 17:45
-
-
Save vurtun/f8e6742d9b5edc00a55219a08d539e7d to your computer and use it in GitHub Desktop.
chain simulation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h> | |
#include <pthread.h> | |
#include <assert.h> | |
#include <string.h> | |
enum { | |
MAX_CHAINS = 256, | |
PARTICLES_PER_CHAIN = 16, | |
SPHERES_PER_CHAIN = 8, | |
MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN), | |
MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN) , | |
MAX_REST_LENGTHS = (MAX_PARTICLES), | |
CONSTRAINTS_PER_CHAIN = 16, | |
MAX_ITERATIONS = 16, | |
}; | |
struct chain_cfg { | |
float gravity_x, gravity_y, gravity_z; | |
float wind_x, wind_y, wind_z; | |
float damping, drag, stiffness, stretch_factor, tether_max_length; | |
}; | |
struct chain_sim { | |
unsigned short free_idx_cnt; | |
unsigned short free_idx[MAX_CHAINS]; | |
struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32))); | |
float particles_x[MAX_PARTICLES] __attribute__((aligned(32))); | |
float particles_y[MAX_PARTICLES] __attribute__((aligned(32))); | |
float particles_z[MAX_PARTICLES] __attribute__((aligned(32))); | |
float inv_mass[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_particles_x[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_particles_y[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_particles_z[MAX_PARTICLES] __attribute__((aligned(32))); | |
float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32))); | |
float sphere_x[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_y[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_z[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_radius[MAX_SPHERES] __attribute__((aligned(32))); | |
}; | |
void simulate_chains(struct chain_sim *cs, float dt) { | |
const __m256 dt2_vec = _mm256_set1_ps(dt * dt); | |
const __m256 two_vec = _mm256_set1_ps(2.0f); | |
const __m256 one_vec = _mm256_set1_ps(1.0f); | |
const __m256 eps_vec = _mm256_set1_ps(1e-6f); | |
const __m256 half = _mm256_set1_ps(0.5f); | |
const __m256 three_halves = _mm256_set1_ps(1.5f); | |
// Step 2 arrays moved up, 17 elements, only 16th zeroed | |
float px[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float py[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float pz[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float pm[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
// zero dummy elements | |
px[PARTICLES_PER_CHAIN] = 0.0f; | |
py[PARTICLES_PER_CHAIN] = 0.0f; | |
pz[PARTICLES_PER_CHAIN] = 0.0f; | |
pm[PARTICLES_PER_CHAIN] = 0.0f; | |
// Step 1: Verlet Integration | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int base_idx = c * PARTICLES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 gravity_x = _mm256_set1_ps(cfg->gravity_x); | |
__m256 gravity_y = _mm256_set1_ps(cfg->gravity_y); | |
__m256 gravity_z = _mm256_set1_ps(cfg->gravity_z); | |
__m256 wind_x = _mm256_set1_ps(cfg->wind_x); | |
__m256 wind_y = _mm256_set1_ps(cfg->wind_y); | |
__m256 wind_z = _mm256_set1_ps(cfg->wind_z); | |
__m256 damping = _mm256_set1_ps(cfg->damping); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&cs->particles_x[base_idx + i]); | |
__m256 py = _mm256_load_ps(&cs->particles_y[base_idx + i]); | |
__m256 pz = _mm256_load_ps(&cs->particles_z[base_idx + i]); | |
__m256 pm = _mm256_load_ps(&cs->inv_mass[base_idx + i]); | |
__m256 ppx = _mm256_load_ps(&cs->prev_particles_x[base_idx + i]); | |
__m256 ppy = _mm256_load_ps(&cs->prev_particles_y[base_idx + i]); | |
__m256 ppz = _mm256_load_ps(&cs->prev_particles_z[base_idx + i]); | |
__m256 fx = _mm256_add_ps(gravity_x, wind_x); | |
__m256 fy = _mm256_add_ps(gravity_y, wind_y); | |
__m256 fz = _mm256_add_ps(gravity_z, wind_z); | |
__m256 ax = _mm256_mul_ps(fx, pm); | |
__m256 ay = _mm256_mul_ps(fy, pm); | |
__m256 az = _mm256_mul_ps(fz, pm); | |
__m256 new_px = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(two_vec, px), ppx), _mm256_mul_ps(ax, dt2_vec)); | |
__m256 new_py = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(two_vec, py), ppy), _mm256_mul_ps(ay, dt2_vec)); | |
__m256 new_pz = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(two_vec, pz), ppz), _mm256_mul_ps(az, dt2_vec)); | |
new_px = _mm256_mul_ps(new_px, damping); | |
new_py = _mm256_mul_ps(new_py, damping); | |
new_pz = _mm256_mul_ps(new_pz, damping); | |
_mm256_store_ps(&cs->particles_x[base_idx + i], new_px); | |
_mm256_store_ps(&cs->particles_y[base_idx + i], new_py); | |
_mm256_store_ps(&cs->particles_z[base_idx + i], new_pz); | |
_mm256_store_ps(&cs->prev_particles_x[base_idx + i], px); | |
_mm256_store_ps(&cs->prev_particles_y[base_idx + i], py); | |
_mm256_store_ps(&cs->prev_particles_z[base_idx + i], pz); | |
} | |
} | |
// Step 2: Constraints | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int r_base = c * CONSTRAINTS_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 stiffness_vec = _mm256_set1_ps(cfg->stiffness); | |
__m256 stretch_vec = _mm256_set1_ps(cfg->stretch_factor); | |
// Copy 0-15, 16 remains 0 | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
_mm256_store_ps(&px[i], _mm256_load_ps(&cs->particles_x[p_base + i])); | |
_mm256_store_ps(&py[i], _mm256_load_ps(&cs->particles_y[p_base + i])); | |
_mm256_store_ps(&pz[i], _mm256_load_ps(&cs->particles_z[p_base + i])); | |
_mm256_store_ps(&pm[i], _mm256_load_ps(&cs->inv_mass[p_base + i])); | |
} | |
__m256 rest_vec0 = _mm256_load_ps(&cs->rest_lengths[r_base]); | |
__m256 rest_vec1 = _mm256_load_ps(&cs->rest_lengths[r_base + 8]); | |
for (int iter = 0; iter < MAX_ITERATIONS; iter++) { | |
// Block 0-7 | |
__m256 p0x = _mm256_load_ps(&px[0]); | |
__m256 p0y = _mm256_load_ps(&py[0]); | |
__m256 p0z = _mm256_load_ps(&pz[0]); | |
__m256 p0m = _mm256_load_ps(&pm[0]); | |
__m256 p1x = _mm256_load_ps(&px[8]); | |
__m256 p1y = _mm256_load_ps(&py[8]); | |
__m256 p1z = _mm256_load_ps(&pz[8]); | |
__m256 p1m = _mm256_load_ps(&pm[8]); | |
__m256 dx = _mm256_sub_ps(p1x, p0x); | |
__m256 dy = _mm256_sub_ps(p1y, p0y); | |
__m256 dz = _mm256_sub_ps(p1z, p0z); | |
__m256 dist_sq = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz))); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 temp = _mm256_mul_ps(dist_sq, _mm256_mul_ps(inv_dist, inv_dist)); | |
inv_dist = _mm256_mul_ps(inv_dist, _mm256_sub_ps(three_halves, _mm256_mul_ps(half, temp))); | |
__m256 dist = _mm256_mul_ps(one_vec, inv_dist); | |
__m256 w_sum = _mm256_add_ps(p0m, p1m); | |
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); | |
__m256 rcp_w = _mm256_rcp_ps(w_sum_clamped); | |
rcp_w = _mm256_mul_ps(rcp_w, _mm256_sub_ps(two_vec, _mm256_mul_ps(w_sum_clamped, rcp_w))); | |
__m256 diff = _mm256_sub_ps(dist, rest_vec0); | |
__m256 corr_factor = _mm256_mul_ps(diff, _mm256_mul_ps(stiffness_vec, stretch_vec)); | |
corr_factor = _mm256_mul_ps(corr_factor, rcp_w); | |
__m256 delta_x = _mm256_mul_ps(_mm256_mul_ps(dx, inv_dist), corr_factor); | |
__m256 delta_y = _mm256_mul_ps(_mm256_mul_ps(dy, inv_dist), corr_factor); | |
__m256 delta_z = _mm256_mul_ps(_mm256_mul_ps(dz, inv_dist), corr_factor); | |
_mm256_store_ps(&px[0], _mm256_add_ps(p0x, delta_x)); | |
_mm256_store_ps(&py[0], _mm256_add_ps(p0y, delta_y)); | |
_mm256_store_ps(&pz[0], _mm256_add_ps(p0z, delta_z)); | |
_mm256_store_ps(&px[8], _mm256_sub_ps(p1x, delta_x)); | |
_mm256_store_ps(&py[8], _mm256_sub_ps(p1y, delta_y)); | |
_mm256_store_ps(&pz[8], _mm256_sub_ps(p1z, delta_z)); | |
// Block 8-15 | |
p0x = _mm256_load_ps(&px[8]); | |
p0y = _mm256_load_ps(&py[8]); | |
p0z = _mm256_load_ps(&pz[8]); | |
p0m = _mm256_load_ps(&pm[8]); | |
p1x = _mm256_loadu_ps(&px[9]); | |
p1y = _mm256_loadu_ps(&py[9]); | |
p1z = _mm256_loadu_ps(&pz[9]); | |
p1m = _mm256_loadu_ps(&pm[9]); | |
dx = _mm256_sub_ps(p1x, p0x); | |
dy = _mm256_sub_ps(p1y, p0y); | |
dz = _mm256_sub_ps(p1z, p0z); | |
dist_sq = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz))); | |
inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
temp = _mm256_mul_ps(dist_sq, _mm256_mul_ps(inv_dist, inv_dist)); | |
inv_dist = _mm256_mul_ps(inv_dist, _mm256_sub_ps(three_halves, _mm256_mul_ps(half, temp))); | |
dist = _mm256_mul_ps(one_vec, inv_dist); | |
w_sum = _mm256_add_ps(p0m, p1m); | |
w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); | |
rcp_w = _mm256_rcp_ps(w_sum_clamped); | |
rcp_w = _mm256_mul_ps(rcp_w, _mm256_sub_ps(two_vec, _mm256_mul_ps(w_sum_clamped, rcp_w))); | |
diff = _mm256_sub_ps(dist, rest_vec1); | |
corr_factor = _mm256_mul_ps(diff, _mm256_mul_ps(stiffness_vec, stretch_vec)); | |
corr_factor = _mm256_mul_ps(corr_factor, rcp_w); | |
delta_x = _mm256_mul_ps(_mm256_mul_ps(dx, inv_dist), corr_factor); | |
delta_y = _mm256_mul_ps(_mm256_mul_ps(dy, inv_dist), corr_factor); | |
delta_z = _mm256_mul_ps(_mm256_mul_ps(dz, inv_dist), corr_factor); | |
_mm256_store_ps(&px[8], _mm256_add_ps(p0x, delta_x)); | |
_mm256_store_ps(&py[8], _mm256_add_ps(p0y, delta_y)); | |
_mm256_store_ps(&pz[8], _mm256_add_ps(p0z, delta_z)); | |
} | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
_mm256_store_ps(&cs->particles_x[p_base + i], _mm256_load_ps(&px[i])); | |
_mm256_store_ps(&cs->particles_y[p_base + i], _mm256_load_ps(&py[i])); | |
_mm256_store_ps(&cs->particles_z[p_base + i], _mm256_load_ps(&pz[i])); | |
} | |
} | |
// Step 3: Sphere Collisions | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int s_base = c * SPHERES_PER_CHAIN; | |
__m256 sx = _mm256_load_ps(&cs->sphere_x[s_base]); | |
__m256 sy = _mm256_load_ps(&cs->sphere_y[s_base]); | |
__m256 sz = _mm256_load_ps(&cs->sphere_z[s_base]); | |
__m256 sr = _mm256_load_ps(&cs->sphere_radius[s_base]); | |
__m256 px0 = _mm256_load_ps(&cs->particles_x[p_base]); | |
__m256 py0 = _mm256_load_ps(&cs->particles_y[p_base]); | |
__m256 pz0 = _mm256_load_ps(&cs->particles_z[p_base]); | |
__m256 delta_x0 = _mm256_setzero_ps(); | |
__m256 delta_y0 = _mm256_setzero_ps(); | |
__m256 delta_z0 = _mm256_setzero_ps(); | |
for (int s = 0; s < 8; s++) { | |
__m256i idx = _mm256_set1_epi32(s); | |
__m256 px_s = _mm256_permutevar8x32_ps(px0, idx); | |
__m256 py_s = _mm256_permutevar8x32_ps(py0, idx); | |
__m256 pz_s = _mm256_permutevar8x32_ps(pz0, idx); | |
__m256 dx = _mm256_sub_ps(px_s, sx); | |
__m256 dy = _mm256_sub_ps(py_s, sy); | |
__m256 dz = _mm256_sub_ps(pz_s, sz); | |
__m256 dist_sq = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz))); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_mul_ps(one_vec, inv_dist); | |
__m256 penetration = _mm256_sub_ps(sr, dist); | |
__m256 mask = _mm256_cmp_ps(penetration, _mm256_setzero_ps(), _CMP_GT_OQ); | |
delta_x0 = _mm256_add_ps(delta_x0, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dx, inv_dist), penetration), mask)); | |
delta_y0 = _mm256_add_ps(delta_y0, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dy, inv_dist), penetration), mask)); | |
delta_z0 = _mm256_add_ps(delta_z0, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dz, inv_dist), penetration), mask)); | |
} | |
px0 = _mm256_add_ps(px0, delta_x0); | |
py0 = _mm256_add_ps(py0, delta_y0); | |
pz0 = _mm256_add_ps(pz0, delta_z0); | |
__m256 px1 = _mm256_load_ps(&cs->particles_x[p_base + 8]); | |
__m256 py1 = _mm256_load_ps(&cs->particles_y[p_base + 8]); | |
__m256 pz1 = _mm256_load_ps(&cs->particles_z[p_base + 8]); | |
__m256 delta_x1 = _mm256_setzero_ps(); | |
__m256 delta_y1 = _mm256_setzero_ps(); | |
__m256 delta_z1 = _mm256_setzero_ps(); | |
for (int s = 0; s < 8; s++) { | |
__m256i idx = _mm256_set1_epi32(s); | |
__m256 px_s = _mm256_permutevar8x32_ps(px1, idx); | |
__m256 py_s = _mm256_permutevar8x32_ps(py1, idx); | |
__m256 pz_s = _mm256_permutevar8x32_ps(pz1, idx); | |
__m256 dx = _mm256_sub_ps(px_s, sx); | |
__m256 dy = _mm256_sub_ps(py_s, sy); | |
__m256 dz = _mm256_sub_ps(pz_s, sz); | |
__m256 dist_sq = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz))); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_mul_ps(one_vec, inv_dist); | |
__m256 penetration = _mm256_sub_ps(sr, dist); | |
__m256 mask = _mm256_cmp_ps(penetration, _mm256_setzero_ps(), _CMP_GT_OQ); | |
delta_x1 = _mm256_add_ps(delta_x1, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dx, inv_dist), penetration), mask)); | |
delta_y1 = _mm256_add_ps(delta_y1, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dy, inv_dist), penetration), mask)); | |
delta_z1 = _mm256_add_ps(delta_z1, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dz, inv_dist), penetration), mask)); | |
} | |
px1 = _mm256_add_ps(px1, delta_x1); | |
py1 = _mm256_add_ps(py1, delta_y1); | |
pz1 = _mm256_add_ps(pz1, delta_z1); | |
_mm256_store_ps(&cs->particles_x[p_base], px0); | |
_mm256_store_ps(&cs->particles_y[p_base], py0); | |
_mm256_store_ps(&cs->particles_z[p_base], pz0); | |
_mm256_store_ps(&cs->particles_x[p_base + 8], px1); | |
_mm256_store_ps(&cs->particles_y[p_base + 8], py1); | |
_mm256_store_ps(&cs->particles_z[p_base + 8], pz1); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment