Skip to content

Instantly share code, notes, and snippets.

@vurtun
Last active April 28, 2025 17:45
Show Gist options
  • Save vurtun/f8e6742d9b5edc00a55219a08d539e7d to your computer and use it in GitHub Desktop.
Save vurtun/f8e6742d9b5edc00a55219a08d539e7d to your computer and use it in GitHub Desktop.
chain simulation
#include <immintrin.h>
#include <pthread.h>
#include <assert.h>
#include <string.h>
enum {
MAX_CHAINS = 256,
PARTICLES_PER_CHAIN = 16,
SPHERES_PER_CHAIN = 8,
MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN),
MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN) ,
MAX_REST_LENGTHS = (MAX_PARTICLES),
CONSTRAINTS_PER_CHAIN = 16,
MAX_ITERATIONS = 16,
};
struct chain_cfg {
float gravity_x, gravity_y, gravity_z;
float wind_x, wind_y, wind_z;
float damping, drag, stiffness, stretch_factor, tether_max_length;
};
struct chain_sim {
unsigned short free_idx_cnt;
unsigned short free_idx[MAX_CHAINS];
struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32)));
float particles_x[MAX_PARTICLES] __attribute__((aligned(32)));
float particles_y[MAX_PARTICLES] __attribute__((aligned(32)));
float particles_z[MAX_PARTICLES] __attribute__((aligned(32)));
float inv_mass[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_particles_x[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_particles_y[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_particles_z[MAX_PARTICLES] __attribute__((aligned(32)));
float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32)));
float sphere_x[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_y[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_z[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_radius[MAX_SPHERES] __attribute__((aligned(32)));
};
void simulate_chains(struct chain_sim *cs, float dt) {
const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
const __m256 two_vec = _mm256_set1_ps(2.0f);
const __m256 one_vec = _mm256_set1_ps(1.0f);
const __m256 eps_vec = _mm256_set1_ps(1e-6f);
const __m256 half = _mm256_set1_ps(0.5f);
const __m256 three_halves = _mm256_set1_ps(1.5f);
// Step 2 arrays moved up, 17 elements, only 16th zeroed
float px[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float py[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float pz[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float pm[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
// zero dummy elements
px[PARTICLES_PER_CHAIN] = 0.0f;
py[PARTICLES_PER_CHAIN] = 0.0f;
pz[PARTICLES_PER_CHAIN] = 0.0f;
pm[PARTICLES_PER_CHAIN] = 0.0f;
// Step 1: Verlet Integration
for (int c = 0; c < MAX_CHAINS; c++) {
int base_idx = c * PARTICLES_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
__m256 gravity_x = _mm256_set1_ps(cfg->gravity_x);
__m256 gravity_y = _mm256_set1_ps(cfg->gravity_y);
__m256 gravity_z = _mm256_set1_ps(cfg->gravity_z);
__m256 wind_x = _mm256_set1_ps(cfg->wind_x);
__m256 wind_y = _mm256_set1_ps(cfg->wind_y);
__m256 wind_z = _mm256_set1_ps(cfg->wind_z);
__m256 damping = _mm256_set1_ps(cfg->damping);
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 px = _mm256_load_ps(&cs->particles_x[base_idx + i]);
__m256 py = _mm256_load_ps(&cs->particles_y[base_idx + i]);
__m256 pz = _mm256_load_ps(&cs->particles_z[base_idx + i]);
__m256 pm = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
__m256 ppx = _mm256_load_ps(&cs->prev_particles_x[base_idx + i]);
__m256 ppy = _mm256_load_ps(&cs->prev_particles_y[base_idx + i]);
__m256 ppz = _mm256_load_ps(&cs->prev_particles_z[base_idx + i]);
__m256 fx = _mm256_add_ps(gravity_x, wind_x);
__m256 fy = _mm256_add_ps(gravity_y, wind_y);
__m256 fz = _mm256_add_ps(gravity_z, wind_z);
__m256 ax = _mm256_mul_ps(fx, pm);
__m256 ay = _mm256_mul_ps(fy, pm);
__m256 az = _mm256_mul_ps(fz, pm);
__m256 new_px = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(two_vec, px), ppx), _mm256_mul_ps(ax, dt2_vec));
__m256 new_py = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(two_vec, py), ppy), _mm256_mul_ps(ay, dt2_vec));
__m256 new_pz = _mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(two_vec, pz), ppz), _mm256_mul_ps(az, dt2_vec));
new_px = _mm256_mul_ps(new_px, damping);
new_py = _mm256_mul_ps(new_py, damping);
new_pz = _mm256_mul_ps(new_pz, damping);
_mm256_store_ps(&cs->particles_x[base_idx + i], new_px);
_mm256_store_ps(&cs->particles_y[base_idx + i], new_py);
_mm256_store_ps(&cs->particles_z[base_idx + i], new_pz);
_mm256_store_ps(&cs->prev_particles_x[base_idx + i], px);
_mm256_store_ps(&cs->prev_particles_y[base_idx + i], py);
_mm256_store_ps(&cs->prev_particles_z[base_idx + i], pz);
}
}
// Step 2: Constraints
for (int c = 0; c < MAX_CHAINS; c++) {
int p_base = c * PARTICLES_PER_CHAIN;
int r_base = c * CONSTRAINTS_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
__m256 stiffness_vec = _mm256_set1_ps(cfg->stiffness);
__m256 stretch_vec = _mm256_set1_ps(cfg->stretch_factor);
// Copy 0-15, 16 remains 0
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
_mm256_store_ps(&px[i], _mm256_load_ps(&cs->particles_x[p_base + i]));
_mm256_store_ps(&py[i], _mm256_load_ps(&cs->particles_y[p_base + i]));
_mm256_store_ps(&pz[i], _mm256_load_ps(&cs->particles_z[p_base + i]));
_mm256_store_ps(&pm[i], _mm256_load_ps(&cs->inv_mass[p_base + i]));
}
__m256 rest_vec0 = _mm256_load_ps(&cs->rest_lengths[r_base]);
__m256 rest_vec1 = _mm256_load_ps(&cs->rest_lengths[r_base + 8]);
for (int iter = 0; iter < MAX_ITERATIONS; iter++) {
// Block 0-7
__m256 p0x = _mm256_load_ps(&px[0]);
__m256 p0y = _mm256_load_ps(&py[0]);
__m256 p0z = _mm256_load_ps(&pz[0]);
__m256 p0m = _mm256_load_ps(&pm[0]);
__m256 p1x = _mm256_load_ps(&px[8]);
__m256 p1y = _mm256_load_ps(&py[8]);
__m256 p1z = _mm256_load_ps(&pz[8]);
__m256 p1m = _mm256_load_ps(&pm[8]);
__m256 dx = _mm256_sub_ps(p1x, p0x);
__m256 dy = _mm256_sub_ps(p1y, p0y);
__m256 dz = _mm256_sub_ps(p1z, p0z);
__m256 dist_sq = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz)));
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 temp = _mm256_mul_ps(dist_sq, _mm256_mul_ps(inv_dist, inv_dist));
inv_dist = _mm256_mul_ps(inv_dist, _mm256_sub_ps(three_halves, _mm256_mul_ps(half, temp)));
__m256 dist = _mm256_mul_ps(one_vec, inv_dist);
__m256 w_sum = _mm256_add_ps(p0m, p1m);
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec);
__m256 rcp_w = _mm256_rcp_ps(w_sum_clamped);
rcp_w = _mm256_mul_ps(rcp_w, _mm256_sub_ps(two_vec, _mm256_mul_ps(w_sum_clamped, rcp_w)));
__m256 diff = _mm256_sub_ps(dist, rest_vec0);
__m256 corr_factor = _mm256_mul_ps(diff, _mm256_mul_ps(stiffness_vec, stretch_vec));
corr_factor = _mm256_mul_ps(corr_factor, rcp_w);
__m256 delta_x = _mm256_mul_ps(_mm256_mul_ps(dx, inv_dist), corr_factor);
__m256 delta_y = _mm256_mul_ps(_mm256_mul_ps(dy, inv_dist), corr_factor);
__m256 delta_z = _mm256_mul_ps(_mm256_mul_ps(dz, inv_dist), corr_factor);
_mm256_store_ps(&px[0], _mm256_add_ps(p0x, delta_x));
_mm256_store_ps(&py[0], _mm256_add_ps(p0y, delta_y));
_mm256_store_ps(&pz[0], _mm256_add_ps(p0z, delta_z));
_mm256_store_ps(&px[8], _mm256_sub_ps(p1x, delta_x));
_mm256_store_ps(&py[8], _mm256_sub_ps(p1y, delta_y));
_mm256_store_ps(&pz[8], _mm256_sub_ps(p1z, delta_z));
// Block 8-15
p0x = _mm256_load_ps(&px[8]);
p0y = _mm256_load_ps(&py[8]);
p0z = _mm256_load_ps(&pz[8]);
p0m = _mm256_load_ps(&pm[8]);
p1x = _mm256_loadu_ps(&px[9]);
p1y = _mm256_loadu_ps(&py[9]);
p1z = _mm256_loadu_ps(&pz[9]);
p1m = _mm256_loadu_ps(&pm[9]);
dx = _mm256_sub_ps(p1x, p0x);
dy = _mm256_sub_ps(p1y, p0y);
dz = _mm256_sub_ps(p1z, p0z);
dist_sq = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz)));
inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
temp = _mm256_mul_ps(dist_sq, _mm256_mul_ps(inv_dist, inv_dist));
inv_dist = _mm256_mul_ps(inv_dist, _mm256_sub_ps(three_halves, _mm256_mul_ps(half, temp)));
dist = _mm256_mul_ps(one_vec, inv_dist);
w_sum = _mm256_add_ps(p0m, p1m);
w_sum_clamped = _mm256_max_ps(w_sum, eps_vec);
rcp_w = _mm256_rcp_ps(w_sum_clamped);
rcp_w = _mm256_mul_ps(rcp_w, _mm256_sub_ps(two_vec, _mm256_mul_ps(w_sum_clamped, rcp_w)));
diff = _mm256_sub_ps(dist, rest_vec1);
corr_factor = _mm256_mul_ps(diff, _mm256_mul_ps(stiffness_vec, stretch_vec));
corr_factor = _mm256_mul_ps(corr_factor, rcp_w);
delta_x = _mm256_mul_ps(_mm256_mul_ps(dx, inv_dist), corr_factor);
delta_y = _mm256_mul_ps(_mm256_mul_ps(dy, inv_dist), corr_factor);
delta_z = _mm256_mul_ps(_mm256_mul_ps(dz, inv_dist), corr_factor);
_mm256_store_ps(&px[8], _mm256_add_ps(p0x, delta_x));
_mm256_store_ps(&py[8], _mm256_add_ps(p0y, delta_y));
_mm256_store_ps(&pz[8], _mm256_add_ps(p0z, delta_z));
}
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
_mm256_store_ps(&cs->particles_x[p_base + i], _mm256_load_ps(&px[i]));
_mm256_store_ps(&cs->particles_y[p_base + i], _mm256_load_ps(&py[i]));
_mm256_store_ps(&cs->particles_z[p_base + i], _mm256_load_ps(&pz[i]));
}
}
// Step 3: Sphere Collisions
for (int c = 0; c < MAX_CHAINS; c++) {
int p_base = c * PARTICLES_PER_CHAIN;
int s_base = c * SPHERES_PER_CHAIN;
__m256 sx = _mm256_load_ps(&cs->sphere_x[s_base]);
__m256 sy = _mm256_load_ps(&cs->sphere_y[s_base]);
__m256 sz = _mm256_load_ps(&cs->sphere_z[s_base]);
__m256 sr = _mm256_load_ps(&cs->sphere_radius[s_base]);
__m256 px0 = _mm256_load_ps(&cs->particles_x[p_base]);
__m256 py0 = _mm256_load_ps(&cs->particles_y[p_base]);
__m256 pz0 = _mm256_load_ps(&cs->particles_z[p_base]);
__m256 delta_x0 = _mm256_setzero_ps();
__m256 delta_y0 = _mm256_setzero_ps();
__m256 delta_z0 = _mm256_setzero_ps();
for (int s = 0; s < 8; s++) {
__m256i idx = _mm256_set1_epi32(s);
__m256 px_s = _mm256_permutevar8x32_ps(px0, idx);
__m256 py_s = _mm256_permutevar8x32_ps(py0, idx);
__m256 pz_s = _mm256_permutevar8x32_ps(pz0, idx);
__m256 dx = _mm256_sub_ps(px_s, sx);
__m256 dy = _mm256_sub_ps(py_s, sy);
__m256 dz = _mm256_sub_ps(pz_s, sz);
__m256 dist_sq = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz)));
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(one_vec, inv_dist);
__m256 penetration = _mm256_sub_ps(sr, dist);
__m256 mask = _mm256_cmp_ps(penetration, _mm256_setzero_ps(), _CMP_GT_OQ);
delta_x0 = _mm256_add_ps(delta_x0, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dx, inv_dist), penetration), mask));
delta_y0 = _mm256_add_ps(delta_y0, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dy, inv_dist), penetration), mask));
delta_z0 = _mm256_add_ps(delta_z0, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dz, inv_dist), penetration), mask));
}
px0 = _mm256_add_ps(px0, delta_x0);
py0 = _mm256_add_ps(py0, delta_y0);
pz0 = _mm256_add_ps(pz0, delta_z0);
__m256 px1 = _mm256_load_ps(&cs->particles_x[p_base + 8]);
__m256 py1 = _mm256_load_ps(&cs->particles_y[p_base + 8]);
__m256 pz1 = _mm256_load_ps(&cs->particles_z[p_base + 8]);
__m256 delta_x1 = _mm256_setzero_ps();
__m256 delta_y1 = _mm256_setzero_ps();
__m256 delta_z1 = _mm256_setzero_ps();
for (int s = 0; s < 8; s++) {
__m256i idx = _mm256_set1_epi32(s);
__m256 px_s = _mm256_permutevar8x32_ps(px1, idx);
__m256 py_s = _mm256_permutevar8x32_ps(py1, idx);
__m256 pz_s = _mm256_permutevar8x32_ps(pz1, idx);
__m256 dx = _mm256_sub_ps(px_s, sx);
__m256 dy = _mm256_sub_ps(py_s, sy);
__m256 dz = _mm256_sub_ps(pz_s, sz);
__m256 dist_sq = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz)));
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_mul_ps(one_vec, inv_dist);
__m256 penetration = _mm256_sub_ps(sr, dist);
__m256 mask = _mm256_cmp_ps(penetration, _mm256_setzero_ps(), _CMP_GT_OQ);
delta_x1 = _mm256_add_ps(delta_x1, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dx, inv_dist), penetration), mask));
delta_y1 = _mm256_add_ps(delta_y1, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dy, inv_dist), penetration), mask));
delta_z1 = _mm256_add_ps(delta_z1, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(dz, inv_dist), penetration), mask));
}
px1 = _mm256_add_ps(px1, delta_x1);
py1 = _mm256_add_ps(py1, delta_y1);
pz1 = _mm256_add_ps(pz1, delta_z1);
_mm256_store_ps(&cs->particles_x[p_base], px0);
_mm256_store_ps(&cs->particles_y[p_base], py0);
_mm256_store_ps(&cs->particles_z[p_base], pz0);
_mm256_store_ps(&cs->particles_x[p_base + 8], px1);
_mm256_store_ps(&cs->particles_y[p_base + 8], py1);
_mm256_store_ps(&cs->particles_z[p_base + 8], pz1);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment