Skip to content

Instantly share code, notes, and snippets.

@vurtun
Last active March 5, 2026 13:17
Show Gist options
  • Select an option

  • Save vurtun/f8e6742d9b5edc00a55219a08d539e7d to your computer and use it in GitHub Desktop.

Select an option

Save vurtun/f8e6742d9b5edc00a55219a08d539e7d to your computer and use it in GitHub Desktop.
3D XPBD CHAIN SIMULATION
#include <immintrin.h>
#include <math.h>
#include <assert.h>
#include <string.h>
/* --- Compiler Feature Macros --- */
#if defined(_MSC_VER)
#define ALWAYS_INLINE __forceinline
#define RESTRICT __restrict
#define ALIGN_VAR(x) __declspec(align(x))
#define UNROLL_HINT __pragma(loop(unroll))
#elif defined(__GNUC__) || defined(__clang__)
#define ALWAYS_INLINE __attribute__((always_inline)) inline
#define RESTRICT __restrict__
#define ALIGN_VAR(x) __attribute__((aligned(x)))
#define UNROLL_HINT _Pragma("GCC unroll 2")
#else
#define ALWAYS_INLINE inline
#define RESTRICT
#define ALIGN_VAR(x)
#define UNROLL_HINT
#endif
/* --- Constants --- */
#define MAX_MOTION_RADIUS 10000.0f
enum {
MAX_CHNS = 1024,
PTC_PER_CHN = 16,
SPH_PER_CHN = 8,
MAX_PTC = (MAX_CHNS * PTC_PER_CHN),
MAX_SPH = (MAX_CHNS * SPH_PER_CHN),
CON_PER_CHN = PTC_PER_CHN,
MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN),
MAX_ITR = 16,
};
struct chn_cfg {
float damping;
float stiffness;
float friction;
float linear_inertia;
float angular_inertia;
float centrifugal_inertia;
float bend_stiffness;
float rest_curvature;
float _pad[8];
} ALIGN_VAR(32);
struct chn_storage_slot {
uint16_t config_id; // 0:2 - Physics Preset ID
uint16_t local_extent; // 2:4 - Max radius in Millimeters (Fixed Point)
uint32_t ptc_packed[15]; // 4:64 - 15 particles, 10-10-10 format
}; // Exactly 64 Bytes
struct chn_sim {
unsigned char free_idx_cnt;
unsigned char free_idx[MAX_CHNS];
// World Transforms (Anchor Point)
float chn_pos_x[MAX_CHNS] ALIGN_VAR(32);
float chn_pos_y[MAX_CHNS] ALIGN_VAR(32);
float chn_pos_z[MAX_CHNS] ALIGN_VAR(32);
float chn_quat_x[MAX_CHNS] ALIGN_VAR(32);
float chn_quat_y[MAX_CHNS] ALIGN_VAR(32);
float chn_quat_z[MAX_CHNS] ALIGN_VAR(32);
float chn_quat_w[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_pos_x[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_pos_y[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_pos_z[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_quat_x[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_quat_y[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_quat_z[MAX_CHNS] ALIGN_VAR(32);
float chn_prev_quat_w[MAX_CHNS] ALIGN_VAR(32);
float chn_grav_x[MAX_CHNS] ALIGN_VAR(32);
float chn_grav_y[MAX_CHNS] ALIGN_VAR(32);
float chn_grav_z[MAX_CHNS] ALIGN_VAR(32);
float chn_wind_x[MAX_CHNS] ALIGN_VAR(32);
float chn_wind_y[MAX_CHNS] ALIGN_VAR(32);
float chn_wind_z[MAX_CHNS] ALIGN_VAR(32);
struct chn_cfg chn_cfg[MAX_CHNS] ALIGN_VAR(32);
// Local Particles
float ptc_pos_x[MAX_PTC] ALIGN_VAR(32);
float ptc_pos_y[MAX_PTC] ALIGN_VAR(32);
float ptc_pos_z[MAX_PTC] ALIGN_VAR(32);
float ptc_prev_pos_x[MAX_PTC] ALIGN_VAR(32);
float ptc_prev_pos_y[MAX_PTC] ALIGN_VAR(32);
float ptc_prev_pos_z[MAX_PTC] ALIGN_VAR(32);
float ptc_inv_mass[MAX_PTC] ALIGN_VAR(32);
// Motion Radius (Hard Constraints)
float rest_pos_x[MAX_PTC] ALIGN_VAR(32);
float rest_pos_y[MAX_PTC] ALIGN_VAR(32);
float rest_pos_z[MAX_PTC] ALIGN_VAR(32);
float motion_radius[MAX_PTC] ALIGN_VAR(32);
// Constraints
float ptc_rest_len[MAX_REST_LEN] ALIGN_VAR(32);
float ptc_lambda[MAX_REST_LEN] ALIGN_VAR(32);
float ptc_bend_lambda[MAX_REST_LEN] ALIGN_VAR(32);
// Colliders
float sph_x[MAX_SPH] ALIGN_VAR(32);
float sph_y[MAX_SPH] ALIGN_VAR(32);
float sph_z[MAX_SPH] ALIGN_VAR(32);
float sph_r[MAX_SPH] ALIGN_VAR(32);
};
static ALWAYS_INLINE unsigned
chn_sys__enq_val(float val, unsigned ext) {
// Convert extent from mm to meters
float scale_meters = (float)ext / 1000.0f;
// Normalized coordinate in range [-1, 1] (roughly)
// Protect against div-by-zero if extent is 0
float inv_scale = (ext > 0) ? (1.0f / scale_meters) : 0.0f;
float normalized = val * inv_scale;
// Map to 10-bit signed integer range [-512, 511]
int q = (int)roundf(normalized * 511.0f);
// Hard clamp to prevent bit-bleeding into other channels
if (q > 511) q = 511;
if (q < -512) q = -512;
// Mask to 10 bits (bit-level representation of the signed int)
return (unsigned)q & 0x3FFu;
}
static ALWAYS_INLINE unsigned
chn_sys__enq_ptc(const float *RESTRICT pos3, unsigned ext) {
unsigned x = chn_sys__enq_val(pos3[0], ext);
unsigned y = chn_sys__enq_val(pos3[1], ext);
unsigned z = chn_sys__enq_val(pos3[2], ext);
// Pack into 10-10-10-2
return x | (y << 10u) | (z << 20u);
}
static ALWAYS_INLINE float
chn_sys__deq_val(unsigned val_paq, unsigned ext) {
// 1. Sign-extend the 10-bit value to a 32-bit signed integer
// Shift left 22 bits to put the 10th bit at the sign position,
// then arithmetic shift right 22 bits to fill the top with the sign bit.
int s_val = (int)(val_paq << 22u) >> 22;
// 2. Map back to float
float normalized = (float)s_val / 511.0f;
float scale_meters = (float)ext / 1000.0f;
return normalized * scale_meters;
}
static ALWAYS_INLINE void
chn_sys__deq_ptc(float *RESTRICT pos3, unsigned ptc, unsigned ext) {
// Extract and dequantize each component with the correct sign extension
pos3[0] = chn_sys__deq_val(ptc & 0x3FFu, ext);
pos3[1] = chn_sys__deq_val((ptc >> 10u) & 0x3FFu, ext);
pos3[2] = chn_sys__deq_val((ptc >> 20u) & 0x3FFu, ext);
}
extern void
chn_sim_init(struct chn_sim *cns) {
assert(cns);
memset(cns, 0, sizeof(*cns));
cns->free_idx_cnt = MAX_CHNS;
for (int i = 0; i < MAX_CHNS; ++i) {
cns->free_idx[i] = MAX_CHNS - i - 1;
cns->chn_quat_w[i] = 1.0f;
cns->chn_prev_quat_w[i] = 1.0f;
}
// 2. Initialize Quaternions using AVX2
__m256 one_v = _mm256_set1_ps(1.0f);
for (int i = 0; i < MAX_CHNS; i += 8) {
_mm256_store_ps(&cns->chn_quat_w[i], one_v);
_mm256_store_ps(&cns->chn_prev_quat_w[i], one_v);
}
// 3. Initialize Motion Radius using AVX2
__m256 rad_v = _mm256_set1_ps(MAX_MOTION_RADIUS);
for (int i = 0; i < MAX_PTC; i += 8) {
_mm256_store_ps(&cns->motion_radius[i], rad_v);
}
for (int i = 0; i < MAX_PTC; ++i) {
cns->motion_radius[i] = MAX_MOTION_RADIUS;
}
}
extern int
chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *RESTRICT rest_pos, int cnt) {
assert(cns);
assert(cfg);
assert(rest_pos);
assert(cns->free_idx_cnt > 0);
int chn = cns->free_idx[--cns->free_idx_cnt];
cns->chn_cfg[chn] = *cfg;
int p_base = (chn * PTC_PER_CHN);
int r_base = (chn * CON_PER_CHN);
for (int i = 0; i < PTC_PER_CHN; ++i) {
int logical_i = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1);
int physical_i = p_base + i;
if (logical_i < cnt) {
cns->ptc_pos_x[physical_i] = rest_pos[logical_i*4+0];
cns->ptc_pos_y[physical_i] = rest_pos[logical_i*4+1];
cns->ptc_pos_z[physical_i] = rest_pos[logical_i*4+2];
cns->ptc_inv_mass[physical_i] = rest_pos[logical_i*4+3];
cns->motion_radius[physical_i] = MAX_MOTION_RADIUS;
} else {
cns->ptc_pos_x[physical_i] = 0.0f;
cns->ptc_pos_y[physical_i] = 0.0f;
cns->ptc_pos_z[physical_i] = 0.0f;
cns->ptc_inv_mass[physical_i] = 0.0f;
cns->motion_radius[physical_i] = MAX_MOTION_RADIUS;
}
cns->rest_pos_x[physical_i] = cns->ptc_pos_x[physical_i];
cns->rest_pos_y[physical_i] = cns->ptc_pos_y[physical_i];
cns->rest_pos_z[physical_i] = cns->ptc_pos_z[physical_i];
cns->ptc_prev_pos_x[physical_i] = cns->ptc_pos_x[physical_i];
cns->ptc_prev_pos_y[physical_i] = cns->ptc_pos_y[physical_i];
cns->ptc_prev_pos_z[physical_i] = cns->ptc_pos_z[physical_i];
}
for (int i = 0; i < PTC_PER_CHN; ++i) {
int physical_r = r_base + i;
cns->ptc_rest_len[physical_r] = 0.0f;
cns->ptc_lambda[physical_r] = 0.0f;
cns->ptc_bend_lambda[physical_r] = 0.0f;
int logical_con_start = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1);
if (logical_con_start + 1 < cnt) {
float dx = rest_pos[(logical_con_start+1)*4+0] - rest_pos[logical_con_start*4+0];
float dy = rest_pos[(logical_con_start+1)*4+1] - rest_pos[logical_con_start*4+1];
float dz = rest_pos[(logical_con_start+1)*4+2] - rest_pos[logical_con_start*4+2];
float len = sqrtf(dx*dx + dy*dy + dz*dz);
cns->ptc_rest_len[physical_r] = (len > 1e-6f) ? len : 1e-6f;
}
}
return chn;
}
extern void
chn_sim_del(struct chn_sim *cns, int chn) {
assert(cns);
assert(chn >= 0);
assert(chn < MAX_CHNS);
int p_base = (chn * PTC_PER_CHN);
int r_base = (chn * CON_PER_CHN);
int s_base = (chn * SPH_PER_CHN);
cns->free_idx[cns->free_idx_cnt++] = chn;
memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));
cns->chn_quat_x[chn] = 0.0f;
cns->chn_quat_y[chn] = 0.0f;
cns->chn_quat_z[chn] = 0.0f;
cns->chn_quat_w[chn] = 1.0f;
cns->chn_prev_quat_x[chn] = 0.0f;
cns->chn_prev_quat_y[chn] = 0.0f;
cns->chn_prev_quat_z[chn] = 0.0f;
cns->chn_prev_quat_w[chn] = 1.0f;
memset(&cns->ptc_pos_x[p_base], 0, PTC_PER_CHN * sizeof(float));
memset(&cns->ptc_pos_y[p_base], 0, PTC_PER_CHN * sizeof(float));
memset(&cns->ptc_pos_z[p_base], 0, PTC_PER_CHN * sizeof(float));
memset(&cns->ptc_inv_mass[p_base], 0, PTC_PER_CHN * sizeof(float));
memset(&cns->ptc_rest_len[r_base], 0, CON_PER_CHN * sizeof(float));
for (int i = 0; i < PTC_PER_CHN; ++i) {
cns->motion_radius[p_base + i] = MAX_MOTION_RADIUS;
}
memset(&cns->sph_r[s_base], 0, SPH_PER_CHN * sizeof(float));
}
extern void
chn_sim_mov(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *restrict rot4) {
assert(cns);
assert(pos3);
assert(rot4);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_pos_x[chn] = pos3[0];
cns->chn_pos_y[chn] = pos3[1];
cns->chn_pos_z[chn] = pos3[2];
cns->chn_quat_x[chn] = rot4[0];
cns->chn_quat_y[chn] = rot4[1];
cns->chn_quat_z[chn] = rot4[2];
cns->chn_quat_w[chn] = rot4[3];
}
extern void
chn_sim_tel(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *restrict rot4) {
assert(cns);
assert(pos3);
assert(rot4);
assert(chn >= 0);
assert(chn < MAX_CHNS);
chn_sim_mov(cns, chn, pos3, rot4);
cns->chn_prev_pos_x[chn] = pos3[0];
cns->chn_prev_pos_y[chn] = pos3[1];
cns->chn_prev_pos_z[chn] = pos3[2];
cns->chn_prev_quat_x[chn] = rot4[0];
cns->chn_prev_quat_y[chn] = rot4[1];
cns->chn_prev_quat_z[chn] = rot4[2];
cns->chn_prev_quat_w[chn] = rot4[3];
}
extern void
chn_sim_grav(struct chn_sim *cns, int chn, const float *RESTRICT grav3) {
assert(cns);
assert(grav3);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_grav_x[chn] = grav3[0];
cns->chn_grav_y[chn] = grav3[1];
cns->chn_grav_z[chn] = grav3[2];
}
extern void
chn_sim_wind(struct chn_sim *cns, int chn, const float *RESTRICT wind3) {
assert(cns);
assert(wind3);
assert(chn >= 0);
assert(chn < MAX_CHNS);
cns->chn_wind_x[chn] = wind3[0];
cns->chn_wind_y[chn] = wind3[1];
cns->chn_wind_z[chn] = wind3[2];
}
extern void
chn_sim_set_sph(struct chn_sim *cns, int chn, const float *RESTRICT spheres, int cnt) {
assert(cns);
assert(spheres);
assert(chn >= 0);
assert(chn < MAX_CHNS);
assert(spheres);
assert(cnt >= 0);
assert(cnt < MAX_SPH);
int s_base = chn * SPH_PER_CHN;
for (int i = 0; i < cnt && i < SPH_PER_CHN; ++i) {
cns->sph_x[s_base + i] = spheres[i*4+0];
cns->sph_y[s_base + i] = spheres[i*4+1];
cns->sph_z[s_base + i] = spheres[i*4+2];
cns->sph_r[s_base + i] = spheres[i*4+3];
}
for (int i = cnt; i < SPH_PER_CHN; ++i) {
cns->sph_r[s_base + i] = 0.0f;
}
}
extern void
chn_sim_con_motion(struct chn_sim *cns, int chn, const float *RESTRICT radius, int cnt) {
assert(cns);
assert(radius);
assert(chn >= 0);
assert(chn < MAX_CHNS);
assert(cnt >= 0);
assert(cnt < PTC_PER_CHN);
int p_base = chn * PTC_PER_CHN;
for (int i = 0; i < cnt && i < PTC_PER_CHN; ++i) {
int physical_i = (i < 8) ? (p_base + i) : (p_base + 8 + (i-8));
cns->motion_radius[physical_i] = radius[i];
}
}
static ALWAYS_INLINE void
quat_mul_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry,
__m256 *RESTRICT rz, __m256 *RESTRICT rw,
__m256 x1, __m256 y1, __m256 z1, __m256 w1,
__m256 x2, __m256 y2, __m256 z2, __m256 w2) {
// 1. Compute Cross Product terms (Q1.xyz x Q2.xyz)
__m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(y1, z2), _mm256_mul_ps(z1, y2));
__m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(z1, x2), _mm256_mul_ps(x1, z2));
__m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(x1, y2), _mm256_mul_ps(y1, x2));
// 2. Compute Dot Product term (Q1.xyz . Q2.xyz)
__m256 dot_x = _mm256_mul_ps(x1, x2);
__m256 dot_y = _mm256_mul_ps(y1, y2);
__m256 dot_z = _mm256_mul_ps(z1, z2);
__m256 dot_sum = _mm256_add_ps(dot_x, _mm256_add_ps(dot_y, dot_z));
// 3. Compute W scale terms
__m256 w1x2 = _mm256_mul_ps(w1, x2);
__m256 x1w2 = _mm256_mul_ps(x1, w2);
__m256 w1y2 = _mm256_mul_ps(w1, y2);
__m256 y1w2 = _mm256_mul_ps(y1, w2);
__m256 w1z2 = _mm256_mul_ps(w1, z2);
__m256 z1w2 = _mm256_mul_ps(z1, w2);
// 4. Final Assembly
// X = (w1*x2 + x1*w2) - cross_x
*rx = _mm256_sub_ps(_mm256_add_ps(w1x2, x1w2), cross_x);
// Y = (w1*y2 + y1*w2) - cross_y
*ry = _mm256_sub_ps(_mm256_add_ps(w1y2, y1w2), cross_y);
// Z = (w1*z2 + z1*w2) - cross_z
*rz = _mm256_sub_ps(_mm256_add_ps(w1z2, z1w2), cross_z);
// W = (w1*w2) - dot_sum
*rw = _mm256_sub_ps(_mm256_mul_ps(w1, w2), dot_sum);
}
static ALWAYS_INLINE void
rot_vec_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, __m256 *RESTRICT rz,
__m256 qx, __m256 qy, __m256 qz, __m256 qw,
__m256 vx, __m256 vy, __m256 vz) {
__m256 two = _mm256_set1_ps(2.0f);
// 1. Calculate Q.xyz x V
__m256 q_cross_v_x = _mm256_sub_ps(_mm256_mul_ps(qy, vz), _mm256_mul_ps(qz, vy));
__m256 q_cross_v_y = _mm256_sub_ps(_mm256_mul_ps(qz, vx), _mm256_mul_ps(qx, vz));
__m256 q_cross_v_z = _mm256_sub_ps(_mm256_mul_ps(qx, vy), _mm256_mul_ps(qy, vx));
// 2. Calculate T = 2 * (Q.xyz x V)
__m256 tx = _mm256_mul_ps(two, q_cross_v_x);
__m256 ty = _mm256_mul_ps(two, q_cross_v_y);
__m256 tz = _mm256_mul_ps(two, q_cross_v_z);
// 3. Calculate Q.xyz x T
__m256 q_cross_t_x = _mm256_sub_ps(_mm256_mul_ps(qy, tz), _mm256_mul_ps(qz, ty));
__m256 q_cross_t_y = _mm256_sub_ps(_mm256_mul_ps(qz, tx), _mm256_mul_ps(qx, tz));
__m256 q_cross_t_z = _mm256_sub_ps(_mm256_mul_ps(qx, ty), _mm256_mul_ps(qy, tx));
// 4. Calculate Q.w * T
__m256 w_times_t_x = _mm256_mul_ps(qw, tx);
__m256 w_times_t_y = _mm256_mul_ps(qw, ty);
__m256 w_times_t_z = _mm256_mul_ps(qw, tz);
// 5. Final Result: V + (Q.w * T) + (Q.xyz x T)
*rx = _mm256_add_ps(vx, _mm256_add_ps(w_times_t_x, q_cross_t_x));
*ry = _mm256_add_ps(vy, _mm256_add_ps(w_times_t_y, q_cross_t_y));
*rz = _mm256_add_ps(vz, _mm256_add_ps(w_times_t_z, q_cross_t_z));
}
static ALWAYS_INLINE void
rot_vec_inv_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, __m256 *RESTRICT rz,
__m256 qx, __m256 qy, __m256 qz, __m256 qw,
__m256 vx, __m256 vy, __m256 vz) {
// Conjugate the quaternion (-x, -y, -z, w)
__m256 neg_qx = _mm256_sub_ps(_mm256_setzero_ps(), qx);
__m256 neg_qy = _mm256_sub_ps(_mm256_setzero_ps(), qy);
__m256 neg_qz = _mm256_sub_ps(_mm256_setzero_ps(), qz);
// Perform standard rotation using the conjugate
rot_vec_batch8(rx, ry, rz, neg_qx, neg_qy, neg_qz, qw, vx, vy, vz);
}
static ALWAYS_INLINE void
solve_dist(__m256 *RESTRICT p1x, __m256 *RESTRICT p1y, __m256 *RESTRICT p1z,
__m256 invm1, __m256 *RESTRICT p2x,
__m256 *RESTRICT p2y, __m256 *RESTRICT p2z,
__m256 invm2, __m256 rest_len, __m256 compliance,
__m256 *RESTRICT lambda, __m256 eps) {
__m256 dx = _mm256_sub_ps(*p1x, *p2x);
__m256 dy = _mm256_sub_ps(*p1y, *p2y);
__m256 dz = _mm256_sub_ps(*p1z, *p2z);
__m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz)));
__m256 inv_d = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps));
__m256 dL = _mm256_div_ps(_mm256_sub_ps(_mm256_setzero_ps(),
_mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(d2, inv_d), rest_len),
_mm256_mul_ps(compliance, *lambda))),
_mm256_add_ps(_mm256_add_ps(invm1, invm2), compliance));
dL = _mm256_and_ps(dL, _mm256_cmp_ps(rest_len, _mm256_setzero_ps(), _CMP_GT_OQ));
*lambda = _mm256_add_ps(*lambda, dL);
__m256 px = _mm256_mul_ps(_mm256_mul_ps(dx, inv_d), dL);
__m256 py = _mm256_mul_ps(_mm256_mul_ps(dy, inv_d), dL);
__m256 pz = _mm256_mul_ps(_mm256_mul_ps(dz, inv_d), dL);
*p1x = _mm256_fmadd_ps(px, invm1, *p1x);
*p1y = _mm256_fmadd_ps(py, invm1, *p1y);
*p1z = _mm256_fmadd_ps(pz, invm1, *p1z);
*p2x = _mm256_fnmadd_ps(px, invm2, *p2x);
*p2y = _mm256_fnmadd_ps(py, invm2, *p2y);
*p2z = _mm256_fnmadd_ps(pz, invm2, *p2z);
}
static ALWAYS_INLINE void
solve_angle(__m256* RESTRICT p0x, __m256* RESTRICT p0y, __m256 *RESTRICT p0z,
__m256 invm0, __m256 *RESTRICT p1x, __m256 *RESTRICT p1y,
__m256 *RESTRICT p1z, __m256 invm1, __m256 *RESTRICT p2x,
__m256 *RESTRICT p2y, __m256 *RESTRICT p2z, __m256 invm2,
__m256 compliance, __m256 *RESTRICT lambda, __m256 target_cos,
__m256 mask, __m256 eps) {
__m256 e1x = _mm256_sub_ps(*p1x, *p0x);
__m256 e1y = _mm256_sub_ps(*p1y, *p0y);
__m256 e1z = _mm256_sub_ps(*p1z, *p0z);
__m256 l1_sq = _mm256_fmadd_ps(e1x, e1x, _mm256_fmadd_ps(e1y, e1y, _mm256_mul_ps(e1z, e1z)));
__m256 i1 = _mm256_rsqrt_ps(_mm256_max_ps(l1_sq, eps));
__m256 e2x = _mm256_sub_ps(*p2x, *p1x);
__m256 e2y = _mm256_sub_ps(*p2y, *p1y);
__m256 e2z = _mm256_sub_ps(*p2z, *p1z);
__m256 l2_sq = _mm256_fmadd_ps(e2x, e2x, _mm256_fmadd_ps(e2y, e2y, _mm256_mul_ps(e2z, e2z)));
__m256 i2 = _mm256_rsqrt_ps(_mm256_max_ps(l2_sq, eps));
__m256 n1x = _mm256_mul_ps(e1x, i1);
__m256 n1y = _mm256_mul_ps(e1y, i1);
__m256 n1z = _mm256_mul_ps(e1z, i1);
__m256 n2x = _mm256_mul_ps(e2x, i2);
__m256 n2y = _mm256_mul_ps(e2y, i2);
__m256 n2z = _mm256_mul_ps(e2z, i2);
__m256 dot = _mm256_fmadd_ps(n1x, n2x, _mm256_fmadd_ps(n1y, n2y, _mm256_mul_ps(n1z, n2z)));
__m256 C = _mm256_sub_ps(dot, target_cos);
__m256 gp0x = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1x, n2x), i1);
__m256 gp0y = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1y, n2y), i1);
__m256 gp0z = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1z, n2z), i1);
__m256 gp2x = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2x, n1x), _mm256_sub_ps(_mm256_setzero_ps(), i2));
__m256 gp2y = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2y, n1y), _mm256_sub_ps(_mm256_setzero_ps(), i2));
__m256 gp2z = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2z, n1z), _mm256_sub_ps(_mm256_setzero_ps(), i2));
__m256 gp1x = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0x, gp2x));
__m256 gp1y = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0y, gp2y));
__m256 gp1z = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0z, gp2z));
__m256 w = _mm256_add_ps(_mm256_mul_ps(invm0, _mm256_fmadd_ps(gp0x, gp0x, _mm256_fmadd_ps(gp0y, gp0y, _mm256_mul_ps(gp0z, gp0z)))),
_mm256_add_ps(_mm256_mul_ps(invm1, _mm256_fmadd_ps(gp1x, gp1x, _mm256_fmadd_ps(gp1y, gp1y, _mm256_mul_ps(gp1z, gp1z)))),
_mm256_mul_ps(invm2, _mm256_fmadd_ps(gp2x, gp2x, _mm256_fmadd_ps(gp2y, gp2y, _mm256_mul_ps(gp2z, gp2z))))));
__m256 dL = _mm256_div_ps(_mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(C, _mm256_mul_ps(compliance, *lambda))), _mm256_add_ps(w, compliance));
dL = _mm256_and_ps(dL, mask); // CRITICAL: Mask out the delta lambda for the wrap-around lane
*lambda = _mm256_add_ps(*lambda, dL);
*p0x = _mm256_fmadd_ps(gp0x, _mm256_mul_ps(dL, invm0), *p0x);
*p0y = _mm256_fmadd_ps(gp0y, _mm256_mul_ps(dL, invm0), *p0y);
*p0z = _mm256_fmadd_ps(gp0z, _mm256_mul_ps(dL, invm0), *p0z);
*p1x = _mm256_fmadd_ps(gp1x, _mm256_mul_ps(dL, invm1), *p1x);
*p1y = _mm256_fmadd_ps(gp1y, _mm256_mul_ps(dL, invm1), *p1y);
*p1z = _mm256_fmadd_ps(gp1z, _mm256_mul_ps(dL, invm1), *p1z);
*p2x = _mm256_fmadd_ps(gp2x, _mm256_mul_ps(dL, invm2), *p2x);
*p2y = _mm256_fmadd_ps(gp2y, _mm256_mul_ps(dL, invm2), *p2y);
*p2z = _mm256_fmadd_ps(gp2z, _mm256_mul_ps(dL, invm2), *p2z);
}
/* --- Simulation Loop --- */
extern void
chn_sim_run(struct chn_sim *cns, float dt) {
const __m256 dt2_v = _mm256_set1_ps(dt * dt);
const __m256 eps_v = _mm256_set1_ps(1e-7f);
const __m256 zero_v = _mm256_setzero_ps();
const __m256i sl = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
const __m256i sr = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);
ALIGN_VAR(32) float b_fx[8], b_fy[8], b_fz[8];
ALIGN_VAR(32) float b_vx[8], b_vy[8], b_vz[8];
ALIGN_VAR(32) float b_qrx[8], b_qry[8], b_qrz[8], b_qrw[8];
for (int c_blk = 0; c_blk < MAX_CHNS; c_blk += 8) {
/* --- PHASE 1: Batched Anchor Updates --- */
{
__m256 qx = _mm256_load_ps(&cns->chn_quat_x[c_blk]);
__m256 qy = _mm256_load_ps(&cns->chn_quat_y[c_blk]);
__m256 qz = _mm256_load_ps(&cns->chn_quat_z[c_blk]);
__m256 qw = _mm256_load_ps(&cns->chn_quat_w[c_blk]);
__m256 pqx = _mm256_load_ps(&cns->chn_prev_quat_x[c_blk]);
__m256 pqy = _mm256_load_ps(&cns->chn_prev_quat_y[c_blk]);
__m256 pqz = _mm256_load_ps(&cns->chn_prev_quat_z[c_blk]);
__m256 pqw = _mm256_load_ps(&cns->chn_prev_quat_w[c_blk]);
__m256 cx = _mm256_load_ps(&cns->chn_pos_x[c_blk]);
__m256 cy = _mm256_load_ps(&cns->chn_pos_y[c_blk]);
__m256 cz = _mm256_load_ps(&cns->chn_pos_z[c_blk]);
__m256 px = _mm256_load_ps(&cns->chn_prev_pos_x[c_blk]);
__m256 py = _mm256_load_ps(&cns->chn_prev_pos_y[c_blk]);
__m256 pz = _mm256_load_ps(&cns->chn_prev_pos_z[c_blk]);
__m256 fx = _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_x[c_blk]),_mm256_load_ps(&cns->chn_wind_x[c_blk]));
__m256 fy = _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_y[c_blk]),_mm256_load_ps(&cns->chn_wind_y[c_blk]));
__m256 fz = _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_z[c_blk]),_mm256_load_ps(&cns->chn_wind_z[c_blk]));
__m256 wfx, wfy, wfz;
rot_vec_inv_batch8(&wfx, &wfy, &wfz, qx, qy, qz, qw, fx, fy, fz);
_mm256_store_ps(b_fx, wfx);
_mm256_store_ps(b_fy, wfy);
_mm256_store_ps(b_fz, wfz);
__m256 lvx, lvy, lvz;
rot_vec_inv_batch8(&lvx, &lvy, &lvz, qx, qy, qz, qw,
_mm256_sub_ps(cx, px), _mm256_sub_ps(cy, py), _mm256_sub_ps(cz, pz));
_mm256_store_ps(b_vx, lvx);
_mm256_store_ps(b_vy, lvy);
_mm256_store_ps(b_vz, lvz);
__m256 qrx, qry, qrz, qrw;
quat_mul_batch8(&qrx, &qry, &qrz, &qrw,
_mm256_sub_ps(zero_v, qx), _mm256_sub_ps(zero_v, qy), _mm256_sub_ps(zero_v, qz),
qw, pqx, pqy, pqz, pqw);
_mm256_store_ps(b_qrx, qrx);
_mm256_store_ps(b_qry, qry);
_mm256_store_ps(b_qrz, qrz);
_mm256_store_ps(b_qrw, qrw);
_mm256_store_ps(&cns->chn_prev_pos_x[c_blk], cx);
_mm256_store_ps(&cns->chn_prev_pos_y[c_blk], cy);
_mm256_store_ps(&cns->chn_prev_pos_z[c_blk], cz);
_mm256_store_ps(&cns->chn_prev_quat_x[c_blk], qx);
_mm256_store_ps(&cns->chn_prev_quat_y[c_blk], qy);
_mm256_store_ps(&cns->chn_prev_quat_z[c_blk], qz);
_mm256_store_ps(&cns->chn_prev_quat_w[c_blk], qw);
}
/* --- PHASE 2: Local Chain Simulation --- */
for (int i = 0; i < 8; ++i) {
int c = c_blk + i;
int p_base = c * PTC_PER_CHN, r_base = c * CON_PER_CHN;
struct chn_cfg *cfg = &cns->chn_cfg[c];
__m256 damp = _mm256_set1_ps(cfg->damping);
__m256 compl = _mm256_div_ps(dt2_v, _mm256_max_ps(_mm256_set1_ps(cfg->stiffness), eps_v));
__m256 b_compl = _mm256_div_ps(dt2_v, _mm256_max_ps(_mm256_set1_ps(cfg->bend_stiffness), eps_v));
__m256 t_cos = _mm256_set1_ps(1.0f - (cfg->rest_curvature * 0.5f));
__m256 qrx = _mm256_set1_ps(b_qrx[i]);
__m256 qry = _mm256_set1_ps(b_qry[i]);
__m256 qrz = _mm256_set1_ps(b_qrz[i]);
__m256 qrw = _mm256_set1_ps(b_qrw[i]);
__m256 o_sq = _mm256_add_ps(_mm256_mul_ps(qrx, qrx), _mm256_add_ps(_mm256_mul_ps(qry, qry), _mm256_mul_ps(qrz, qrz)));
__m256 lv_x = _mm256_set1_ps(b_vx[i]);
__m256 lv_y = _mm256_set1_ps(b_vy[i]);
__m256 lv_z = _mm256_set1_ps(b_vz[i]);
__m256 i_l = _mm256_set1_ps(cfg->linear_inertia);
__m256 i_a = _mm256_set1_ps(cfg->angular_inertia);
__m256 i_c = _mm256_set1_ps(cfg->centrifugal_inertia);
__m256 p_pos[3][2], p_prev[3][2];
__m256 inv_m[2], rl[2], lm[2], blm[2], r_pos[3][2], m_rad[2];
UNROLL_HINT
for (int k = 0; k < 2; ++k) {
int off = p_base + k*8;
int r_off = r_base + k*8;
p_pos[0][k] = _mm256_load_ps(&cns->ptc_pos_x[off]);
p_pos[1][k] = _mm256_load_ps(&cns->ptc_pos_y[off]);
p_pos[2][k] = _mm256_load_ps(&cns->ptc_pos_z[off]);
p_prev[0][k] = _mm256_load_ps(&cns->ptc_prev_pos_x[off]);
p_prev[1][k] = _mm256_load_ps(&cns->ptc_prev_pos_y[off]);
p_prev[2][k] = _mm256_load_ps(&cns->ptc_prev_pos_z[off]);
inv_m[k] = _mm256_load_ps(&cns->ptc_inv_mass[off]);
rl[k] = _mm256_load_ps(&cns->ptc_rest_len[r_off]);
lm[k] = _mm256_load_ps(&cns->ptc_lambda[r_off]);
blm[k] = _mm256_load_ps(&cns->ptc_bend_lambda[r_off]);
r_pos[0][k] = _mm256_load_ps(&cns->rest_pos_x[off]);
r_pos[1][k] = _mm256_load_ps(&cns->rest_pos_y[off]);
r_pos[2][k] = _mm256_load_ps(&cns->rest_pos_z[off]);
m_rad[k] = _mm256_load_ps(&cns->motion_radius[off]);
}
/* --- Predict + Inertia --- */
UNROLL_HINT
for (int k = 0; k < 2; ++k) {
__m256 dyn = _mm256_cmp_ps(inv_m[k], zero_v, _CMP_GT_OQ);
__m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]);
__m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]);
__m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]);
vx = _mm256_sub_ps(vx, _mm256_and_ps(_mm256_mul_ps(lv_x, i_l), dyn));
vy = _mm256_sub_ps(vy, _mm256_and_ps(_mm256_mul_ps(lv_y, i_l), dyn));
vz = _mm256_sub_ps(vz, _mm256_and_ps(_mm256_mul_ps(lv_z, i_l), dyn));
__m256 rax, ray, raz;
rot_vec_batch8(&rax, &ray, &raz, qrx, qry, qrz, qrw, p_pos[0][k], p_pos[1][k], p_pos[2][k]);
vx = _mm256_sub_ps(vx, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[0][k], rax), i_a), dyn));
vy = _mm256_sub_ps(vy, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[1][k], ray), i_a), dyn));
vz = _mm256_sub_ps(vz, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[2][k], raz), i_a), dyn));
vx = _mm256_add_ps(vx, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[0][k], o_sq), i_c), dyn));
vy = _mm256_add_ps(vy, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[1][k], o_sq), i_c), dyn));
vz = _mm256_add_ps(vz, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[2][k], o_sq), i_c), dyn));
p_prev[0][k] = p_pos[0][k];
p_prev[1][k] = p_pos[1][k];
p_prev[2][k] = p_pos[2][k];
__m256 fx = _mm256_set1_ps(b_fx[i]);
__m256 fy = _mm256_set1_ps(b_fy[i]);
__m256 fz = _mm256_set1_ps(b_fz[i]);
p_pos[0][k] = _mm256_fmadd_ps(_mm256_mul_ps(fx, inv_m[k]), dt2_v, _mm256_fmadd_ps(vx, damp, p_pos[0][k]));
p_pos[1][k] = _mm256_fmadd_ps(_mm256_mul_ps(fy, inv_m[k]), dt2_v, _mm256_fmadd_ps(vy, damp, p_pos[1][k]));
p_pos[2][k] = _mm256_fmadd_ps(_mm256_mul_ps(fz, inv_m[k]), dt2_v, _mm256_fmadd_ps(vz, damp, p_pos[2][k]));
}
/* --- XPBD Solver Iterations --- */
for (int it = 0; it < MAX_ITR; ++it) {
{
// Even(i) <-> Odd(i)
__m256 p2x = p_pos[0][1];
__m256 p2y = p_pos[1][1];
__m256 p2z = p_pos[2][1];
__m256 p2m = inv_m[1];
solve_dist(&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0],
&p2x, &p2y, &p2z, p2m, rl[0], compl, &lm[0], eps_v);
// Direct write-back for k=0
p_pos[0][1] = p2x;
p_pos[1][1] = p2y;
p_pos[2][1] = p2z;
}
{
// Odd(i) <-> Shifted Even(i+1)
__m256 p2x = _mm256_permutevar8x32_ps(p_pos[0][0], sl);
__m256 p2y = _mm256_permutevar8x32_ps(p_pos[1][0], sl);
__m256 p2z = _mm256_permutevar8x32_ps(p_pos[2][0], sl);
__m256 p2m = _mm256_permutevar8x32_ps(inv_m[0], sl);
__m256 o2x = p2x;
__m256 o2y = p2y;
__m256 o2z = p2z;
solve_dist(&p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1],
&p2x, &p2y, &p2z, p2m, rl[1], compl, &lm[1], eps_v);
// Shifted accumulation for k=1
p_pos[0][0] = _mm256_add_ps(p_pos[0][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2x, o2x), sr));
p_pos[1][0] = _mm256_add_ps(p_pos[1][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2y, o2y), sr));
p_pos[2][0] = _mm256_add_ps(p_pos[2][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2z, o2z), sr));
}
// Bending (Group 1: Odd Center)
{
__m256 v_b = _mm256_and_ps(_mm256_cmp_ps(rl[0], zero_v, _CMP_GT_OQ), _mm256_cmp_ps(rl[1], zero_v, _CMP_GT_OQ));
__m256 rx = _mm256_permutevar8x32_ps(p_pos[0][0], sl);
__m256 ry = _mm256_permutevar8x32_ps(p_pos[1][0], sl);
__m256 rz = _mm256_permutevar8x32_ps(p_pos[2][0], sl);
__m256 rm = _mm256_permutevar8x32_ps(inv_m[0], sl);
solve_angle(&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0],
&p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1], &rx, &ry, &rz, rm,
b_compl, &blm[1], t_cos, v_b, eps_v);
p_pos[0][0] = _mm256_blendv_ps(p_pos[0][0], _mm256_permutevar8x32_ps(rx, sr), v_b);
p_pos[1][0] = _mm256_blendv_ps(p_pos[1][0], _mm256_permutevar8x32_ps(ry, sr), v_b);
p_pos[2][0] = _mm256_blendv_ps(p_pos[2][0], _mm256_permutevar8x32_ps(rz, sr), v_b);
}
// Bending (Group 2: Even Center)
{
__m256 v_b = _mm256_and_ps(_mm256_cmp_ps(_mm256_permutevar8x32_ps(rl[1], sr), zero_v, _CMP_GT_OQ), _mm256_cmp_ps(rl[0], zero_v, _CMP_GT_OQ));
__m256 lx = _mm256_permutevar8x32_ps(p_pos[0][1], sr);
__m256 ly = _mm256_permutevar8x32_ps(p_pos[1][1], sr);
__m256 lz = _mm256_permutevar8x32_ps(p_pos[2][1], sr);
__m256 lmm = _mm256_permutevar8x32_ps(inv_m[1], sr);
solve_angle(&lx, &ly, &lz, lmm,
&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0],
&p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1],
b_compl, &blm[0], t_cos, v_b, eps_v);
p_pos[0][1] = _mm256_blendv_ps(p_pos[0][1], _mm256_permutevar8x32_ps(lx, sl), v_b);
p_pos[1][1] = _mm256_blendv_ps(p_pos[1][1], _mm256_permutevar8x32_ps(ly, sl), v_b);
p_pos[2][1] = _mm256_blendv_ps(p_pos[2][1], _mm256_permutevar8x32_ps(lz, sl), v_b);
}
}
/* --- Collisions --- */
int s_base = c * SPH_PER_CHN;
const __m256 f_coef = _mm256_set1_ps(cfg->friction);
for (int s = 0; s < SPH_PER_CHN; ++s) {
__m256 sr = _mm256_broadcast_ss(&cns->sph_r[s_base + s]);
__m256 sx = _mm256_broadcast_ss(&cns->sph_x[s_base + s]);
__m256 sy = _mm256_broadcast_ss(&cns->sph_y[s_base + s]);
__m256 sz = _mm256_broadcast_ss(&cns->sph_z[s_base + s]);
UNROLL_HINT
for (int k = 0; k < 2; ++k) {
__m256 dx = _mm256_sub_ps(p_pos[0][k], sx);
__m256 dy = _mm256_sub_ps(p_pos[1][k], sy);
__m256 dz = _mm256_sub_ps(p_pos[2][k], sz);
__m256 d2 = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz)));
// 1. Calculate safe distance
__m256 id = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps_v));
__m256 dist = _mm256_mul_ps(d2, id);
// 2. Only calculate push if there is actual penetration and sr > 0
__m256 active_sph = _mm256_cmp_ps(sr, eps_v, _CMP_GT_OQ);
__m256 colliding = _mm256_cmp_ps(dist, sr, _CMP_LT_OQ);
__m256 mask = _mm256_and_ps(active_sph, colliding);
__m256 pen = _mm256_sub_ps(sr, dist);
__m256 push = _mm256_and_ps(pen, mask);
// 3. Normal vector (zeroed if not colliding to prevent NaN propagation)
__m256 nx = _mm256_mul_ps(dx, id);
__m256 ny = _mm256_mul_ps(dy, id);
__m256 nz = _mm256_mul_ps(dz, id);
p_pos[0][k] = _mm256_add_ps(p_pos[0][k], _mm256_mul_ps(nx, push));
p_pos[1][k] = _mm256_add_ps(p_pos[1][k], _mm256_mul_ps(ny, push));
p_pos[2][k] = _mm256_add_ps(p_pos[2][k], _mm256_mul_ps(nz, push));
// 4. Friction (Stable Step-by-Step)
__m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]);
__m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]);
__m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]);
__m256 dot = _mm256_add_ps(_mm256_mul_ps(vx, nx), _mm256_add_ps(_mm256_mul_ps(vy, ny), _mm256_mul_ps(vz, nz)));
// Tangent Velocity = Total Velocity - Normal Velocity
__m256 tx = _mm256_sub_ps(vx, _mm256_mul_ps(dot, nx));
__m256 ty = _mm256_sub_ps(vy, _mm256_mul_ps(dot, ny));
__m256 tz = _mm256_sub_ps(vz, _mm256_mul_ps(dot, nz));
__m256 f_scale = _mm256_mul_ps(push, f_coef);
// Apply friction only where colliding
p_pos[0][k] = _mm256_sub_ps(p_pos[0][k], _mm256_and_ps(_mm256_mul_ps(tx, f_scale), mask));
p_pos[1][k] = _mm256_sub_ps(p_pos[1][k], _mm256_and_ps(_mm256_mul_ps(ty, f_scale), mask));
p_pos[2][k] = _mm256_sub_ps(p_pos[2][k], _mm256_and_ps(_mm256_mul_ps(tz, f_scale), mask));
}
}
/* --- Motion Radius Guardrail --- */
UNROLL_HINT
for (int k = 0; k < 2; ++k) {
__m256 dx = _mm256_sub_ps(p_pos[0][k], r_pos[0][k]);
__m256 dy = _mm256_sub_ps(p_pos[1][k], r_pos[1][k]);
__m256 dz = _mm256_sub_ps(p_pos[2][k], r_pos[2][k]);
__m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz)));
__m256 dist = _mm256_sqrt_ps(d2);
__m256 mask = _mm256_cmp_ps(dist, m_rad[k], _CMP_GT_OQ);
__m256 scale = _mm256_div_ps(m_rad[k], _mm256_max_ps(dist, eps_v));
p_pos[0][k] = _mm256_blendv_ps(p_pos[0][k], _mm256_fmadd_ps(dx, scale, r_pos[0][k]), mask);
p_pos[1][k] = _mm256_blendv_ps(p_pos[1][k], _mm256_fmadd_ps(dy, scale, r_pos[1][k]), mask);
p_pos[2][k] = _mm256_blendv_ps(p_pos[2][k], _mm256_fmadd_ps(dz, scale, r_pos[2][k]), mask);
}
/* --- Final Store --- */
UNROLL_HINT
for (int k = 0; k < 2; ++k) {
int off = p_base + k*8, r_off = r_base + k*8;
_mm256_store_ps(&cns->ptc_pos_x[off], p_pos[0][k]);
_mm256_store_ps(&cns->ptc_pos_y[off], p_pos[1][k]);
_mm256_store_ps(&cns->ptc_pos_z[off], p_pos[2][k]);
_mm256_store_ps(&cns->ptc_prev_pos_x[off], p_prev[0][k]);
_mm256_store_ps(&cns->ptc_prev_pos_y[off], p_prev[1][k]);
_mm256_store_ps(&cns->ptc_prev_pos_z[off], p_prev[2][k]);
_mm256_store_ps(&cns->ptc_lambda[r_off], lm[k]);
_mm256_store_ps(&cns->ptc_bend_lambda[r_off], blm[k]);
}
}
}
}

Yes! You are in luck. Springiness is the exact reason XPBD (Extended Position Based Dynamics) was invented.

Standard PBD (Position Based Dynamics) creates infinitely rigid constraints. XPBD introduces "Compliance" ($\alpha$), which mathematically transforms your rigid links into perfect, frame-rate-independent Hookean springs (like rubber bands or bungee cords).

However, looking closely at your code, you have a mathematical bug in your compliance calculation that is currently preventing true springiness.

The Bug: Your Compliance Math is Upside Down

In your current code, you calculate compliance like this:

// Current Code: compliance = dt^2 / stiffness
__m256 compl = _mm256_div_ps(dt2_v, _mm256_max_ps(_mm256_set1_ps(cfg->stiffness), eps_v));

According to the XPBD whitepaper by Macklin et al., physical compliance ($\alpha$) is the inverse of stiffness ($k$). The solver requires $\tilde{\alpha}$ (tilde alpha), which is: $$ \tilde{\alpha} = \frac{\alpha}{\Delta t^2} = \frac{1}{k \cdot \Delta t^2} $$

Because your code is multiplying by $dt^2$ instead of dividing by it, your simulation is currently frame-rate dependent, and your stiffness value isn't behaving like a true physical spring.

How to Add Perfect Springiness

Step 1: Fix the Compliance Formula

Change your compl and b_compl (bending) variables in the local chain simulation phase to match the true XPBD equation:

// NEW CODE: compliance = 1.0f / (stiffness * dt^2)
__m256 stiff_dt2 = _mm256_mul_ps(_mm256_set1_ps(cfg->stiffness), dt2_v);
__m256 compl = _mm256_div_ps(one_vec, _mm256_max_ps(stiff_dt2, eps_v));

__m256 b_stiff_dt2 = _mm256_mul_ps(_mm256_set1_ps(cfg->bend_stiffness), dt2_v);
__m256 b_compl = _mm256_div_ps(one_vec, _mm256_max_ps(b_stiff_dt2, eps_v));

(Note: You will need to define const __m256 one_vec = _mm256_set1_ps(1.0f); at the top of your chn_sim_run function).

Step 2: Change Your Config Values

Right now, your cfg->stiffness is scaled between [0, 1]. Because we fixed the math to be physically accurate, stiffness is now a real-world spring constant (Newtons per meter).

  • For a rigid iron chain: Set stiffness to 1000000.0f
  • For a bouncy rubber band: Set stiffness to 100.0f
  • For soft hair/cloth: Set stiffness to 10.0f

By just tweaking that single cfg->stiffness float, your chains will natively stretch, bounce, and oscillate exactly like real-world springs, and they will do it with absolute mathematical stability regardless of your frame rate.


Did you mean "Collision Bounciness"? (Restitution)

If by "springiness" you meant you want the chain to bounce off colliders (like a rubber ball hitting the floor), that is called Restitution.

Currently, your collision solver is perfectly inelastic (restitution = 0). When a particle hits a sphere, its velocity along the normal is killed, causing it to slide smoothly.

If you want it to bounce off spheres, you would inject a Restitution calculation right after your friction logic inside the collision loop:

// Pseudo-logic for bouncing:
// If the particle is moving TOWARD the sphere (vel_dot_norm < 0)
// reflect the velocity and multiply by a bounciness factor (e.g., 0.5)

__m256 restitution = _mm256_set1_ps(cfg->bounciness); // Add this to your config

// Only bounce if velocity is into the surface
__m256 bounce_mask = _mm256_cmp_ps(vel_dot_norm, zero_v, _CMP_LT_OQ);

// Calculate bounce impulse: -(1 + restitution) * vel_dot_norm * norm
__m256 bounce_mag = _mm256_mul_ps(_mm256_sub_ps(zero_v, _mm256_add_ps(one_vec, restitution)), vel_dot_norm);
__m256 bounce_x = _mm256_mul_ps(norm_x, bounce_mag);
// ... y and z

// Add to new_p_x (masked by collision and direction)
// ...

Summary:

  1. If you want a bungee cord / rubber band, fix the 1 line of compliance math, and XPBD will give it to you for free.
  2. If you want it to ricochet off objects, add a restitution vector to your collision solver!

Transitioning from NvCloth to a custom XPBD solver is a very common path in AAA development right now. NvCloth is powerful but acts as a "black box" that is notoriously heavy on CPU cache and hard to multi-thread alongside other engine tasks.

Your old NvCloth parameters are completely valid and mapping them to XPBD will actually unlock the exact visual behavior you had before.

Here is the breakdown of how to translate these specific NvCloth concepts into your AVX2 XPBD solver, and why they are incredibly valuable.


1. Drag and Lift (Aerodynamics)

Currently, your solver treats wind as a flat, constant acceleration: force = gravity + wind. This looks "fake" because a chain moving with the wind shouldn't feel wind force, and a chain moving against the wind should feel immense resistance.

Drag opposes the Relative Velocity between the particle and the wind. Lift acts perpendicular to the wind and the chain segment (like an airplane wing).

Should you add them?

  • Drag: YES. This is critical for cloth/chain realism. It acts as a natural, physically correct damping mechanism.
  • Lift: Maybe. Lift calculations require finding the tangent of the chain segment and doing cross-products. For 1D chains (ropes/necklaces), lift is often visually imperceptible compared to drag. I highly recommend starting with just Drag to save AVX2 cycles.

How to implement Aerodynamic Drag in your Predict Step: Instead of a flat wind force, you calculate the relative velocity ($V_{rel} = V_{particle} - V_{wind}$).

// Inside your Phase 2 (Predict + Inertia) block:

__m256 drag_coeff = _mm256_set1_ps(cfg->dragCoefficient); // e.g. 0.5f

// 1. Particle Velocity
__m256 vel_x = _mm256_mul_ps(_mm256_sub_ps(p_curr_x, p_prev_x), inv_dt_vec);
__m256 vel_y = _mm256_mul_ps(_mm256_sub_ps(p_curr_y, p_prev_y), inv_dt_vec);
__m256 vel_z = _mm256_mul_ps(_mm256_sub_ps(p_curr_z, p_prev_z), inv_dt_vec);

// 2. Relative Velocity (Particle Vel - Wind Vel)
__m256 vrel_x = _mm256_sub_ps(vel_x, chn_wnd_x_vec);
__m256 vrel_y = _mm256_sub_ps(vel_y, chn_wnd_y_vec);
__m256 vrel_z = _mm256_sub_ps(vel_z, chn_wnd_z_vec);

// 3. Magnitude of Vrel
__m256 vrel_mag_sq = _mm256_fmadd_ps(vrel_z, vrel_z, _mm256_fmadd_ps(vrel_y, vrel_y, _mm256_mul_ps(vrel_x, vrel_x)));
__m256 vrel_mag = _mm256_sqrt_ps(vrel_mag_sq);

// 4. Drag Force = -0.5 * dragCoeff * Vrel_mag * Vrel
__m256 drag_mag = _mm256_mul_ps(_mm256_set1_ps(-0.5f), _mm256_mul_ps(drag_coeff, vrel_mag));
__m256 force_x = _mm256_fmadd_ps(vrel_x, drag_mag, chn_grav_x_vec); // Add gravity
__m256 force_y = _mm256_fmadd_ps(vrel_y, drag_mag, chn_grav_y_vec);
__m256 force_z = _mm256_fmadd_ps(vrel_z, drag_mag, chn_grav_z_vec);

// Apply force_x/y/z to acceleration as normal...

2. NvCloth Phase Constraints (The Secret to Ropes)

In NvCloth, a "Phase" is just a group of constraints. Let's look at your variables:

  • phasesStiffness: This maps exactly to your current cfg->stiffness.
  • phasesStiffnessMultiplier: Just pre-multiply this on the CPU (cfg.stiffness *= cfg.multiplier). Do not waste SIMD cycles on this.

The "Genius" Variables: phasesCompressionLimit & phasesStretchLimit These are called Strain Limits, and they are the reason NvCloth feels so good for things like ropes and capes.

  • Standard XPBD: Always tries to force the distance to exactly rest_length. If you compress a chain, it acts like a stiff spring and pushes back.
  • Strain Limits: You define a "safe zone".
    • If stretchLimit = 1.0 and compressionLimit = 0.0, the chain can bend and collapse inward completely without pushing back, but it will physically halt if stretched beyond its original length. This perfectly mimics a metal chain or a loose rope.

How to implement Strain Limits in AVX2 (Branchless): This is incredibly elegant to add to XPBD. You just clamp the target_distance to your limits before calculating the constraint C.

Add this to your chn_cfg struct:

float compression_limit; // e.g., 0.0f for chains, 0.9f for stiff cables
float stretch_limit;     // e.g., 1.0f for chains, 1.1f for bungee cords

Modify your solve_dist logic inside the inner solver loop:

__m256 comp_limit_vec = _mm256_set1_ps(cfg->compression_limit);
__m256 strc_limit_vec = _mm256_set1_ps(cfg->stretch_limit);

// Calculate the allowed bounds for this specific constraint
__m256 lower_bound = _mm256_mul_ps(rest_even, comp_limit_vec);
__m256 upper_bound = _mm256_mul_ps(rest_even, strc_limit_vec);

// Clamp the CURRENT distance to the bounds to find the TARGET distance
__m256 target_dist = _mm256_max_ps(lower_bound, _mm256_min_ps(dist, upper_bound));

// Calculate Constraint (C)
// If dist is within limits, dist == target_dist, so C = 0 (no force applied!)
__m256 C = _mm256_sub_ps(dist, target_dist); 

// Proceed with standard XPBD lambda math...
__m256 temp = _mm256_add_ps(C, _mm256_mul_ps(compliance_factor, lambda_even)); 
// ...

Summary of the Translation

NvCloth Variable XPBD Equivalent / Action
mass Maps directly to ptc_inv_mass (inverse is stored).
dragCoefficient Must Implement. Huge visual upgrade. Replaces static wind.
liftCoefficient Ignore for now. Expensive for 1D lines; drag does 90% of the work.
damping You already have this implemented perfectly as a velocity multiplier.
phasesStiffness Maps directly to XPBD Compliance (remember: $1.0 / (k \cdot dt^2)$).
phasesMultiplier Multiply on CPU during chn_sim_add.
Compression/Stretch Limits Must Implement. This is what makes a rope act like a rope instead of a rigid stick. Done via max/min clamping of the rest length.

By implementing the Drag and the Strain Limiting, your 0.5ms AVX2 solver will mathematically perfectly replicate the visual feel of NvCloth, but it will execute an order of magnitude faster and without the cache thrashing of the PhysX middleware.

// NEW CODE: compliance = 1.0f / (stiffness * dt^2)
__m256 stiff_dt2 = _mm256_mul_ps(_mm256_set1_ps(cfg->stiffness), dt2_v);
__m256 compl = _mm256_div_ps(one_vec, _mm256_max_ps(stiff_dt2, eps_v));
__m256 b_stiff_dt2 = _mm256_mul_ps(_mm256_set1_ps(cfg->bend_stiffness), dt2_v);
__m256 b_compl = _mm256_div_ps(one_vec, _mm256_max_ps(b_stiff_dt2, eps_v));
#include "raylib.h"
#include "raymath.h"
#include <stdio.h>
#include <math.h>
#if 0
struct ClothChainInstanceParameter
{
ArrayView< const float3 > joints;
float32 mass = 1.0f;
// wind
float32 dragCoefficient = 0.5f; // air drag coefficient
float32 liftCoefficient = 0.6f; // air lift coefficient
// damping
float32 stiffnessFrequencey = 8.0f; // number of iteration per frame the solver will run over constraints
float32 damping = 0.0f; // damping of local particle velocity (1/stiffnessFrequency). 0 (default): velocity is unaffected, 1: velocity is zeroed
float32 motionConstraintsRadius = 10.0f;
// acceleration
float32 linearInertia = 1.0f; // Set the portion of local frame acceleration applied to particles. 0: particles are unaffected, 1 ( default ) : physically correct.
float32 angularInertia = 1.0f;
float32 centrifugalInertia = 4.0f;
// phase
float32 phasesStiffness = 1.0f;
float32 phasesStiffnessMultiplier = 1.0f;
float32 phasesCompressionLimit = 1.0f;
float32 phasesStretchLimit = 1.0f;
// collision
float32 collisionFriction = 0.0f; // chain collision shape friction coefficient
};
#endif
static Vector3
GetParticleWorldPos(struct chn_sim* sim, int chain_id, int particle_idx) {
int p_base = chain_id * PTC_PER_CHN;
int phys_idx = (particle_idx % 2 == 0) ?
(p_base + particle_idx/2) :
(p_base + 8 + particle_idx/2);
float lx = sim->ptc_pos_x[phys_idx];
float ly = sim->ptc_pos_y[phys_idx];
float lz = sim->ptc_pos_z[phys_idx];
// Manual Vector3 x Quaternion rotation (v' = v + 2 * cross(q.xyz, cross(q.xyz, v) + q.w * v))
float qx = sim->chn_quat_x[chain_id];
float qy = sim->chn_quat_y[chain_id];
float qz = sim->chn_quat_z[chain_id];
float qw = sim->chn_quat_w[chain_id];
float tx = 2.0f * (qy * lz - qz * ly);
float ty = 2.0f * (qz * lx - qx * lz);
float tz = 2.0f * (qx * ly - qy * lx);
float rx = lx + qw * tx + (qy * tz - qz * ty);
float ry = ly + qw * ty + (qz * tx - qx * tz);
float rz = lz + qw * tz + (qx * ty - qy * tx);
return (Vector3){
sim->chn_pos_x[chain_id] + rx,
sim->chn_pos_y[chain_id] + ry,
sim->chn_pos_z[chain_id] + rz
};
}
extern int
main(void) {
const int screenWidth = 1280;
const int screenHeight = 720;
InitWindow(screenWidth, screenHeight, "Enshrouded Chain Physics - Diagnostic");
SetTargetFPS(60);
Camera3D camera = { 0 };
camera.position = (Vector3){ 0.0f, 2.0f, 5.0f };
camera.target = (Vector3){ 0.0f, 0.5f, 0.0f };
camera.up = (Vector3){ 0.0f, 1.0f, 0.0f };
camera.fovy = 45.0f;
camera.projection = CAMERA_PERSPECTIVE;
struct chn_sim sim;
chn_sim_init(&sim);
struct chn_cfg cfg = {
.damping = 0.99f,
.stiffness = 0.5f,
.bend_stiffness = 10.0f,
.friction = 0.5f,
.linear_inertia = 0.1f,
.rest_curvature = 0.0f
};
// Create chain
float local_rest_pos[16 * 4];
for (int i = 0; i < 16; ++i) {
local_rest_pos[i*4 + 0] = 0.0f;
local_rest_pos[i*4 + 1] = -(0.15f * i); // Offset downwards from root
local_rest_pos[i*4 + 2] = 0.0f;
local_rest_pos[i*4 + 3] = (i==0) ? 0.0f : 1.0f; // Root fixed at 0,0,0 local
}
chn_sim_add(&sim, &cfg, local_rest_pos, 16);
float grav[] = { 0.0f, -9.81f, 0.0f };
chn_sim_grav(&sim, 0, grav);
// Set initial world location
float p_init[] = { 0.0f, 2.0f, 0.0f }; // Start at height 2.0
float r_init[] = { 0, 0, 0, 1 };
chn_sim_tel(&sim, 0, p_init, r_init);
Vector3 rootPos = {0.0f, 2.0f, 0.0f};
Vector3 spherePos = { 0.0f, 0.8f, 0.2f };
float sphereRadius = 0.7f;
while (!WindowShouldClose()) {
if (IsKeyPressed(KEY_Q)) {
break;
}
float dt = 0.016f; // Fixed DT for stable debugging
if (IsKeyDown(KEY_UP)) {
rootPos.z -= 4.0f * dt;
}
if (IsKeyDown(KEY_DOWN)) {
rootPos.z += 4.0f * dt;
}
if (IsKeyDown(KEY_LEFT)) {
rootPos.x -= 4.0f * dt;
}
if (IsKeyDown(KEY_RIGHT)) {
rootPos.x += 4.0f * dt;
}
// Move Sphere
if (IsKeyDown(KEY_W)) {
spherePos.z -= 2.0f * dt;
}
if (IsKeyDown(KEY_S)) {
spherePos.z += 2.0f * dt;
}
if (IsKeyDown(KEY_A)) {
spherePos.x -= 2.0f * dt;
}
if (IsKeyDown(KEY_D)){
spherePos.x += 2.0f * dt;
}
UpdateCamera(&camera, CAMERA_ORBITAL);
// 1. Move the World Space Root and pass movement to the simulator
rootPos.x += sinf(GetTime() * 2.0f) * 0.005f;
float p[] = { rootPos.x, rootPos.y, rootPos.z };
float r[] = { 0, 0, 0, 1 }; // Identity rotation for now
chn_sim_mov(&sim, 0, p, r);
// 3. Update Colliders (MUST be in Local Space relative to rootPos)
float local_sph[] = {
spherePos.x - rootPos.x,
spherePos.y - rootPos.y,
spherePos.z - rootPos.z,
sphereRadius
};
chn_sim_set_sph(&sim, 0, local_sph, 1);
chn_sim_run(&sim, dt);
BeginDrawing();
ClearBackground(DARKGRAY);
BeginMode3D(camera);
DrawGrid(10, 1.0f);
DrawSphere(spherePos, sphereRadius, Fade(RED, 0.3f));
for(int i=0; i<15; i++) {
Vector3 p1 = GetParticleWorldPos(&sim, 0, i);
Vector3 p2 = GetParticleWorldPos(&sim, 0, i+1);
DrawCapsule(p1, p2, 0.02f, 4, 4, GOLD);
}
EndMode3D();
DrawFPS(10, 10);
EndDrawing();
}
CloseWindow();
return 0;
}
@vurtun
Copy link
Author

vurtun commented Dec 19, 2025

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment