chain simulation: SoA AVX2/FMA particle-chain solver in C (gist by @vurtun). Two variants follow: a PBD-style solver (Verlet integration, distance constraints, sphere collisions with friction, motion constraints) and an AVBD-style variant that adds warm-started augmented-Lagrangian constraint state.
#include <immintrin.h>
#include <math.h>
enum {
MAX_CHAINS = 64,
PARTICLES_PER_CHAIN = 16,
SPHERES_PER_CHAIN = 8,
MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN),
MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN),
CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN,
MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN),
MAX_ITERATIONS = 4,
};
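// Derived sizes: 64 chains x 16 particles = 1024 particles and 64 x 8 = 512 spheres.
// Every hot array below carries a 32-byte alignment attribute so a chain's data can
// be walked with aligned 8-wide AVX loads (all per-chain counts are multiples of 8).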
struct chain_cfg {
float damping;
float stiffness;
float stretch_factor;
float max_strain;
float friction;
float collision_radius;
float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
} __attribute__((aligned(32)));
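// A minimal sketch of filling in a config. The values are illustrative, not tuned
// constants from this gist; only the inertia fields have suggested values above (1.0f).
static inline void chain_cfg_example_init(struct chain_cfg *cfg) {
cfg->damping = 0.98f;            // fraction of velocity kept per step
cfg->stiffness = 1.0f;           // base distance-constraint stiffness
cfg->stretch_factor = 1.0f;      // scales the constraint correction
cfg->max_strain = 0.5f;          // clamp on (dist - rest) / rest
cfg->friction = 0.3f;            // tangential correction scale on contact
cfg->collision_radius = 0.05f;   // particle collision radius (unused in the code shown)
cfg->linear_inertia = 1.0f;      // physically correct linear motion
cfg->angular_inertia = 1.0f;     // physically correct angular rotation
cfg->centrifugal_inertia = 1.0f; // physically correct centrifugal force
}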
struct chain_sim {
unsigned short free_idx_cnt;
unsigned short free_idx[MAX_CHAINS];
struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32)));
// Chain global forces (world space gravity and wind)
float gravity_x[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_y[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_z[MAX_CHAINS] __attribute__((aligned(32)));
float wind_x[MAX_CHAINS] __attribute__((aligned(32)));
float wind_y[MAX_CHAINS] __attribute__((aligned(32)));
float wind_z[MAX_CHAINS] __attribute__((aligned(32)));
// Chain transformations (world space)
float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));
// Particle positions (model space)
float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
float inv_mass[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32)));
// Sphere positions (model space)
float sphere_x[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_y[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_z[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_radius[MAX_SPHERES] __attribute__((aligned(32)));
// Rest positions (model space)
float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
float motion_radius[MAX_PARTICLES] __attribute__((aligned(32)));
};
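// Note: the solver below uses the approximate intrinsics _mm256_rcp_ps and
// _mm256_rsqrt_ps (~12 bits of precision). If that proves too coarse, one
// Newton-Raphson step roughly doubles the precision; a sketch (an assumption,
// not part of the original gist):
static inline __m256 mm256_rsqrt_nr_ps(__m256 x) {
__m256 r = _mm256_rsqrt_ps(x); // ~12-bit estimate
// One refinement: r' = r * (1.5 - 0.5 * x * r * r)
__m256 hxr2 = _mm256_mul_ps(_mm256_mul_ps(_mm256_set1_ps(0.5f), x), _mm256_mul_ps(r, r));
return _mm256_mul_ps(r, _mm256_sub_ps(_mm256_set1_ps(1.5f), hxr2));
}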
void simulate_chains(struct chain_sim *cs, float dt) {
// Common SIMD constants
const __m256 dt_vec = _mm256_set1_ps(dt);
const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
const __m256 one_vec = _mm256_set1_ps(1.0f);
const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
const __m256 eps_vec = _mm256_set1_ps(1e-6f);
const __m256 inv_mass_clamp_min = _mm256_set1_ps(0.0f);
const __m256 inv_mass_clamp_max = _mm256_set1_ps(1e6f);
const __m256 zero_vec = _mm256_setzero_ps();
const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
const __m256 two_vec = _mm256_set1_ps(2.0f);
const __m256 half_vec = _mm256_set1_ps(0.5f);
const __m256 pi_over_2_vec = _mm256_set1_ps(1.5707963267948966f);
const __m256 acos_c1_vec = _mm256_set1_ps(-0.166666666666f);
const __m256 acos_c2_vec = _mm256_set1_ps(0.075000000000f);
const __m256 acos_c3_vec = _mm256_set1_ps(-0.044642857143f);
const __m256 sin_c1_vec = _mm256_set1_ps(-0.166666666666f);
const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f);
const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
// Local arrays for PBD solver
float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
// Initialize dummy elements
px_local[PARTICLES_PER_CHAIN] = 0.0f;
py_local[PARTICLES_PER_CHAIN] = 0.0f;
pz_local[PARTICLES_PER_CHAIN] = 0.0f;
pm_local[PARTICLES_PER_CHAIN] = 0.0f;
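// The +1 dummy lane exists so the constraint solver's unaligned 8-wide loads at
// particle index 9 (reading indices 9..16) stay inside the buffer; its inv_mass of
// 0.0f keeps the padded constraint from moving anything (it is also masked off below).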
// Stack arrays for precomputed data
float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));
float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
float vel_z[MAX_CHAINS] __attribute__((aligned(32)));
float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));
// Step 0: Precomputation (SIMD, 8 chains at once)
for (int c = 0; c < MAX_CHAINS; c += 8) {
// Load chain quaternions
__m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
__m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
__m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
__m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);
// Compute local-space wind
{
__m256 vx = _mm256_load_ps(&cs->wind_x[c]);
__m256 vy = _mm256_load_ps(&cs->wind_y[c]);
__m256 vz = _mm256_load_ps(&cs->wind_z[c]);
// Create q_conjugate components (qw remains, qx, qy, qz are negated)
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
// Step 1: t = q_conj * v_world_as_quat
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// Step 2: result_vec = (t * q)_vec
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
// result_x = tw*qx + tx*qw + ty*qz - tz*qy
// result_y = tw*qy + ty*qw + tz*qx - tx*qz
// result_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));
_mm256_store_ps(&wind_local_x[c], result_x);
_mm256_store_ps(&wind_local_y[c], result_y);
_mm256_store_ps(&wind_local_z[c], result_z);
}
// Compute local-space gravity
{
__m256 vx = _mm256_load_ps(&cs->gravity_x[c]);
__m256 vy = _mm256_load_ps(&cs->gravity_y[c]);
__m256 vz = _mm256_load_ps(&cs->gravity_z[c]);
// Create q_conjugate components (qw remains, qx, qy, qz are negated)
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
// Step 1: t = q_conj * v_world_as_quat
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// Step 2: result_vec = (t * q)_vec
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
// result_x = tw*qx + tx*qw + ty*qz - tz*qy
// result_y = tw*qy + ty*qw + tz*qx - tx*qz
// result_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));
_mm256_store_ps(&gravity_local_x[c], result_x);
_mm256_store_ps(&gravity_local_y[c], result_y);
_mm256_store_ps(&gravity_local_z[c], result_z);
}
// Compute linear velocity
{
__m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]);
__m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]);
__m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]);
__m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]);
__m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]);
__m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]);
__m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
__m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
__m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec);
_mm256_store_ps(&vel_x[c], vel_x_vec);
_mm256_store_ps(&vel_y[c], vel_y_vec);
_mm256_store_ps(&vel_z[c], vel_z_vec);
}
// Compute angular velocity (SIMD with polynomial approximations)
{
__m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]);
__m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]);
__m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]);
__m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]);
// Quaternion inverse
__m256 norm_sq = _mm256_fmadd_ps(prev_qx, prev_qx, _mm256_mul_ps(prev_qy, prev_qy));
norm_sq = _mm256_fmadd_ps(prev_qz, prev_qz, norm_sq);
norm_sq = _mm256_fmadd_ps(prev_qw, prev_qw, norm_sq);
__m256 inv_norm_sq = _mm256_rcp_ps(_mm256_max_ps(norm_sq, eps_vec));
__m256 inv_qx = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qx), inv_norm_sq);
__m256 inv_qy = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qy), inv_norm_sq);
__m256 inv_qz = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qz), inv_norm_sq);
__m256 inv_qw = _mm256_mul_ps(prev_qw, inv_norm_sq);
// Delta quaternion: q_current * q_prev^-1
__m256 delta_qx = _mm256_fmadd_ps(qy, inv_qz, _mm256_mul_ps(qw, inv_qx));
delta_qx = _mm256_fnmadd_ps(qz, inv_qy, _mm256_fmadd_ps(qx, inv_qw, delta_qx));
__m256 delta_qy = _mm256_fmadd_ps(qy, inv_qw, _mm256_mul_ps(qw, inv_qy));
delta_qy = _mm256_fmadd_ps(qz, inv_qx, _mm256_fnmadd_ps(qx, inv_qz, delta_qy));
__m256 delta_qz = _mm256_fmadd_ps(qx, inv_qy, _mm256_mul_ps(qw, inv_qz));
delta_qz = _mm256_fnmadd_ps(qy, inv_qx, _mm256_fmadd_ps(qz, inv_qw, delta_qz));
__m256 delta_qw = _mm256_fnmadd_ps(qx, inv_qx, _mm256_mul_ps(qw, inv_qw));
delta_qw = _mm256_fnmadd_ps(qy, inv_qy, delta_qw);
delta_qw = _mm256_fnmadd_ps(qz, inv_qz, delta_qw);
// Angular velocity with polynomial approximations
__m256 qw_clamped = _mm256_max_ps(_mm256_min_ps(delta_qw, one_vec), neg_one_vec);
// Acos approximation: acos(x) ≈ π/2 - x * (1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3)))
__m256 x = qw_clamped;
__m256 x2 = _mm256_mul_ps(x, x);
__m256 x4 = _mm256_mul_ps(x2, x2);
__m256 poly = _mm256_fmadd_ps(x4, acos_c3_vec, acos_c2_vec);
poly = _mm256_fmadd_ps(x2, poly, acos_c1_vec);
poly = _mm256_fmadd_ps(x2, poly, one_vec);
poly = _mm256_mul_ps(x, poly);
__m256 angle = _mm256_mul_ps(two_vec, _mm256_sub_ps(pi_over_2_vec, poly));
// Sin approximation for sin(angle/2): sin(x) ≈ x * (1 + x^2 * (c1 + x^2 * c2))
__m256 half_angle = _mm256_mul_ps(angle, half_vec);
__m256 ha2 = _mm256_mul_ps(half_angle, half_angle);
__m256 sin_poly = _mm256_fmadd_ps(ha2, sin_c2_vec, sin_c1_vec);
sin_poly = _mm256_fmadd_ps(ha2, sin_poly, one_vec);
__m256 sin_half_angle = _mm256_mul_ps(half_angle, sin_poly);
__m256 inv_sin_half_angle = _mm256_rcp_ps(_mm256_max_ps(sin_half_angle, eps_vec));
// Sign: Use qw > 0 instead of angle < π
__m256 sign = _mm256_cmp_ps(qw_clamped, zero_vec, _CMP_GT_OQ);
sign = _mm256_or_ps(_mm256_and_ps(sign, one_vec), _mm256_andnot_ps(sign, neg_one_vec));
// Angular velocity: omega = axis * angle / dt, with axis = sign * delta_q_vec / sin(angle/2).
// The original omitted the angle factor, which would yield a unit-magnitude axis/dt.
__m256 scale = _mm256_mul_ps(sign, _mm256_mul_ps(angle, _mm256_mul_ps(inv_sin_half_angle, inv_dt_vec)));
__m256 ang_vel_x_vec = _mm256_mul_ps(delta_qx, scale);
__m256 ang_vel_y_vec = _mm256_mul_ps(delta_qy, scale);
__m256 ang_vel_z_vec = _mm256_mul_ps(delta_qz, scale);
_mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
_mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
_mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
}
}
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
for (int c = 0; c < MAX_CHAINS; c++) {
int base_idx = c * PARTICLES_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
// Load precomputed velocities and inertia parameters
__m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
__m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
__m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);
__m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
__m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
__m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);
__m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
__m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
__m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);
// Clamp inertia parameters to [0, 1]
linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
__m256 py = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
__m256 pz = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
// Linear inertia: v * dt * linear_inertia
__m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
__m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
__m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);
// Angular inertia: (ω × r) * dt * angular_inertia
__m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec);
ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec);
ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec);
// Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
// Calculate displacement: (ω × (ω × r)) * dt^2
__m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec);
__m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec);
__m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec);
// Apply the centrifugal_inertia factor
__m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec);
__m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec);
__m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec);
// Total delta
__m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x);
__m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y);
__m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z);
// Update positions
_mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x));
_mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y));
_mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z));
}
// Update previous transformation
cs->prev_chain_pos_x[c] = cs->chain_pos_x[c];
cs->prev_chain_pos_y[c] = cs->chain_pos_y[c];
cs->prev_chain_pos_z[c] = cs->chain_pos_z[c];
cs->prev_chain_quat_x[c] = cs->chain_quat_x[c];
cs->prev_chain_quat_y[c] = cs->chain_quat_y[c];
cs->prev_chain_quat_z[c] = cs->chain_quat_z[c];
cs->prev_chain_quat_w[c] = cs->chain_quat_w[c];
}
// Step 2: Verlet Integration
for (int c = 0; c < MAX_CHAINS; c++) {
int base_idx = c * PARTICLES_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
__m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]);
__m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]);
__m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]);
__m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]);
__m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]);
__m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]);
__m256 damping_vec = _mm256_set1_ps(cfg->damping);
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[base_idx + i]);
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[base_idx + i]);
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[base_idx + i]);
__m256 force_x = _mm256_add_ps(gravity_x_vec, wind_x_vec);
__m256 force_y = _mm256_add_ps(gravity_y_vec, wind_y_vec);
__m256 force_z = _mm256_add_ps(gravity_z_vec, wind_z_vec);
__m256 acc_x = _mm256_mul_ps(force_x, p_inv_mass);
__m256 acc_y = _mm256_mul_ps(force_y, p_inv_mass);
__m256 acc_z = _mm256_mul_ps(force_z, p_inv_mass);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));
_mm256_store_ps(&cs->p_pos_x[base_idx + i], new_p_x);
_mm256_store_ps(&cs->p_pos_y[base_idx + i], new_p_y);
_mm256_store_ps(&cs->p_pos_z[base_idx + i], new_p_z);
_mm256_store_ps(&cs->prev_p_pos_x[base_idx + i], p_curr_x);
_mm256_store_ps(&cs->prev_p_pos_y[base_idx + i], p_curr_y);
_mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z);
}
}
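// Scalar reference for the Verlet step above, per particle:
//   v      = x - x_prev                            (implicit velocity)
//   x_new  = x + damping * v + (g + wind) * inv_mass * dt^2
//   x_prev = x
// Note gravity is treated as a force here (scaled by inv_mass), not an acceleration.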
// Step 3: Distance Constraints
for (int c = 0; c < MAX_CHAINS; c++) {
int p_base = c * PARTICLES_PER_CHAIN;
int r_base = c * CONSTRAINTS_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
__m256 stiffness_vec = _mm256_set1_ps(cfg->stiffness);
__m256 stretch_factor_vec = _mm256_set1_ps(cfg->stretch_factor);
__m256 max_strain_vec_abs = _mm256_set1_ps(cfg->max_strain);
__m256 neg_max_strain_vec_abs = _mm256_mul_ps(max_strain_vec_abs, neg_one_vec);
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
__m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
__m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
__m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]);
pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min);
_mm256_store_ps(&px_local[i], px);
_mm256_store_ps(&py_local[i], py);
_mm256_store_ps(&pz_local[i], pz);
_mm256_store_ps(&pm_local[i], pm);
}
for (int iter = 0; iter < MAX_ITERATIONS; iter++) {
// First block (i=0 to 7)
{
__m256 p0x = _mm256_load_ps(&px_local[0]);
__m256 p0y = _mm256_load_ps(&py_local[0]);
__m256 p0z = _mm256_load_ps(&pz_local[0]);
__m256 p0m = _mm256_load_ps(&pm_local[0]);
__m256 p1x = _mm256_loadu_ps(&px_local[1]);
__m256 p1y = _mm256_loadu_ps(&py_local[1]);
__m256 p1z = _mm256_loadu_ps(&pz_local[1]);
__m256 p1m = _mm256_loadu_ps(&pm_local[1]);
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base]);
__m256 dx = _mm256_sub_ps(p1x, p0x);
__m256 dy = _mm256_sub_ps(p1y, p0y);
__m256 dz = _mm256_sub_ps(p1z, p0z);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 w_sum = _mm256_add_ps(p0m, p1m);
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec);
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped);
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 diff = _mm256_sub_ps(dist, rest_len_vec);
__m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec);
__m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps));
__m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs));
__m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
__m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom));
__m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec);
corr_scalar_part = _mm256_mul_ps(corr_scalar_part, rcp_w_sum);
__m256 corr_magnitude = _mm256_mul_ps(diff, corr_scalar_part);
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
__m256 delta_unit_x = _mm256_mul_ps(dx, inv_dist);
__m256 delta_unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 delta_unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 delta_x = _mm256_mul_ps(delta_unit_x, corr_magnitude);
__m256 delta_y = _mm256_mul_ps(delta_unit_y, corr_magnitude);
__m256 delta_z = _mm256_mul_ps(delta_unit_z, corr_magnitude);
delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
delta_z = _mm256_and_ps(delta_z, apply_corr_mask);
_mm256_store_ps(&px_local[0], _mm256_fmadd_ps(delta_x, p0m, p0x));
_mm256_store_ps(&py_local[0], _mm256_fmadd_ps(delta_y, p0m, p0y));
_mm256_store_ps(&pz_local[0], _mm256_fmadd_ps(delta_z, p0m, p0z));
_mm256_storeu_ps(&px_local[1], _mm256_fnmadd_ps(delta_x, p1m, p1x));
_mm256_storeu_ps(&py_local[1], _mm256_fnmadd_ps(delta_y, p1m, p1y));
_mm256_storeu_ps(&pz_local[1], _mm256_fnmadd_ps(delta_z, p1m, p1z));
}
// Second block (constraints 8..14; lane 15 pairs particle 15 with the dummy and is masked off)
{
__m256 p0x = _mm256_load_ps(&px_local[8]);
__m256 p0y = _mm256_load_ps(&py_local[8]);
__m256 p0z = _mm256_load_ps(&pz_local[8]);
__m256 p0m = _mm256_load_ps(&pm_local[8]);
__m256 p1x = _mm256_loadu_ps(&px_local[9]);
__m256 p1y = _mm256_loadu_ps(&py_local[9]);
__m256 p1z = _mm256_loadu_ps(&pz_local[9]);
__m256 p1m = _mm256_loadu_ps(&pm_local[9]);
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + 8]);
__m256 dx = _mm256_sub_ps(p1x, p0x);
__m256 dy = _mm256_sub_ps(p1y, p0y);
__m256 dz = _mm256_sub_ps(p1z, p0z);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 w_sum = _mm256_add_ps(p0m, p1m);
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec);
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped);
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 diff = _mm256_sub_ps(dist, rest_len_vec);
__m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec);
__m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps));
__m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs));
__m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps));
__m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom));
__m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec);
corr_scalar_part = _mm256_mul_ps(corr_scalar_part, rcp_w_sum);
__m256 corr_magnitude = _mm256_mul_ps(diff, corr_scalar_part);
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
// Mask off the dummy constraint in lane 7. The mask must be all-ones bits per
// valid lane; the original's -1.0f floats (bit pattern 0xBF800000) would corrupt
// the correction values when ANDed below.
__m256 valid_mask = _mm256_castsi256_ps(_mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1));
apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask);
__m256 delta_unit_x = _mm256_mul_ps(dx, inv_dist);
__m256 delta_unit_y = _mm256_mul_ps(dy, inv_dist);
__m256 delta_unit_z = _mm256_mul_ps(dz, inv_dist);
__m256 delta_x = _mm256_mul_ps(delta_unit_x, corr_magnitude);
__m256 delta_y = _mm256_mul_ps(delta_unit_y, corr_magnitude);
__m256 delta_z = _mm256_mul_ps(delta_unit_z, corr_magnitude);
delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
delta_z = _mm256_and_ps(delta_z, apply_corr_mask);
_mm256_store_ps(&px_local[8], _mm256_fmadd_ps(delta_x, p0m, p0x));
_mm256_store_ps(&py_local[8], _mm256_fmadd_ps(delta_y, p0m, p0y));
_mm256_store_ps(&pz_local[8], _mm256_fmadd_ps(delta_z, p0m, p0z));
_mm256_storeu_ps(&px_local[9], _mm256_fnmadd_ps(delta_x, p1m, p1x));
_mm256_storeu_ps(&py_local[9], _mm256_fnmadd_ps(delta_y, p1m, p1y));
_mm256_storeu_ps(&pz_local[9], _mm256_fnmadd_ps(delta_z, p1m, p1z));
}
}
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 px = _mm256_load_ps(&px_local[i]);
__m256 py = _mm256_load_ps(&py_local[i]);
__m256 pz = _mm256_load_ps(&pz_local[i]);
_mm256_store_ps(&cs->p_pos_x[p_base + i], px);
_mm256_store_ps(&cs->p_pos_y[p_base + i], py);
_mm256_store_ps(&cs->p_pos_z[p_base + i], pz);
}
}
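// Scalar reference for one distance constraint between particles i and i+1:
//   d      = x1 - x0,  dist = |d|,  C = dist - rest
//   strain = clamp(C / rest, -max_strain, +max_strain)
//   k      = stiffness / (1 + |strain|)            (softens as strain grows)
//   corr   = (d / dist) * C * k * stretch_factor / (w0 + w1)
//   x0 += w0 * corr;  x1 -= w1 * corr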
// Step 4: Sphere Collisions with Friction
for (int c = 0; c < MAX_CHAINS; c++) {
int p_base = c * PARTICLES_PER_CHAIN;
int s_base = c * SPHERES_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
__m256 friction_vec = _mm256_set1_ps(cfg->friction);
for (int s = 0; s < SPHERES_PER_CHAIN; s++) {
__m256 sphere_x_vec = _mm256_set1_ps(cs->sphere_x[s_base + s]);
__m256 sphere_y_vec = _mm256_set1_ps(cs->sphere_y[s_base + s]);
__m256 sphere_z_vec = _mm256_set1_ps(cs->sphere_z[s_base + s]);
__m256 sphere_r_vec = _mm256_set1_ps(cs->sphere_radius[s_base + s]);
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]);
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]);
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]);
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[p_base + i]);
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
__m256 dx = _mm256_sub_ps(p_curr_x, sphere_x_vec);
__m256 dy = _mm256_sub_ps(p_curr_y, sphere_y_vec);
__m256 dz = _mm256_sub_ps(p_curr_z, sphere_z_vec);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 penetration = _mm256_sub_ps(sphere_r_vec, dist);
__m256 collision_mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);
__m256 normal_x = _mm256_mul_ps(dx, inv_dist);
__m256 normal_y = _mm256_mul_ps(dy, inv_dist);
__m256 normal_z = _mm256_mul_ps(dz, inv_dist);
__m256 normal_corr_x = _mm256_mul_ps(normal_x, penetration);
__m256 normal_corr_y = _mm256_mul_ps(normal_y, penetration);
__m256 normal_corr_z = _mm256_mul_ps(normal_z, penetration);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 vel_dot_normal = _mm256_mul_ps(vel_x, normal_x);
vel_dot_normal = _mm256_fmadd_ps(vel_y, normal_y, vel_dot_normal);
vel_dot_normal = _mm256_fmadd_ps(vel_z, normal_z, vel_dot_normal);
__m256 tangent_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_normal, normal_x));
__m256 tangent_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_normal, normal_y));
__m256 tangent_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_normal, normal_z));
__m256 tangent_mag_sq = _mm256_mul_ps(tangent_x, tangent_x);
tangent_mag_sq = _mm256_fmadd_ps(tangent_y, tangent_y, tangent_mag_sq);
tangent_mag_sq = _mm256_fmadd_ps(tangent_z, tangent_z, tangent_mag_sq);
__m256 inv_tangent_mag = _mm256_rsqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec));
__m256 friction_mag = _mm256_mul_ps(penetration, friction_vec);
__m256 friction_x = _mm256_mul_ps(_mm256_mul_ps(tangent_x, inv_tangent_mag), friction_mag);
__m256 friction_y = _mm256_mul_ps(_mm256_mul_ps(tangent_y, inv_tangent_mag), friction_mag);
__m256 friction_z = _mm256_mul_ps(_mm256_mul_ps(tangent_z, inv_tangent_mag), friction_mag);
__m256 total_corr_x = _mm256_sub_ps(normal_corr_x, friction_x);
__m256 total_corr_y = _mm256_sub_ps(normal_corr_y, friction_y);
__m256 total_corr_z = _mm256_sub_ps(normal_corr_z, friction_z);
total_corr_x = _mm256_and_ps(total_corr_x, collision_mask);
total_corr_y = _mm256_and_ps(total_corr_y, collision_mask);
total_corr_z = _mm256_and_ps(total_corr_z, collision_mask);
total_corr_x = _mm256_mul_ps(total_corr_x, p_inv_mass);
total_corr_y = _mm256_mul_ps(total_corr_y, p_inv_mass);
total_corr_z = _mm256_mul_ps(total_corr_z, p_inv_mass);
__m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x);
__m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y);
__m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z);
_mm256_store_ps(&cs->p_pos_x[p_base + i], new_p_x);
_mm256_store_ps(&cs->p_pos_y[p_base + i], new_p_y);
_mm256_store_ps(&cs->p_pos_z[p_base + i], new_p_z);
}
}
}
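// The friction above is positional: the tangential part of this step's motion is
// opposed by penetration * friction, so deeper contacts brake sliding harder.
// Both the normal and friction corrections are scaled by inv_mass, so pinned
// particles (inv_mass == 0) never move.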
// Step 5: Motion Constraints
for (int c = 0; c < MAX_CHAINS; c++) {
int p_base = c * PARTICLES_PER_CHAIN;
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
__m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
__m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
__m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]);
pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min);
__m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]);
__m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]);
__m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]);
__m256 radius_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]);
__m256 dx = _mm256_sub_ps(px, rx);
__m256 dy = _mm256_sub_ps(py, ry);
__m256 dz = _mm256_sub_ps(pz, rz);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 penetration = _mm256_sub_ps(dist, radius_vec);
__m256 mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);
__m256 corr_factor = _mm256_mul_ps(penetration, inv_dist);
corr_factor = _mm256_and_ps(corr_factor, mask);
__m256 delta_x = _mm256_mul_ps(dx, corr_factor);
__m256 delta_y = _mm256_mul_ps(dy, corr_factor);
__m256 delta_z = _mm256_mul_ps(dz, corr_factor);
delta_x = _mm256_mul_ps(delta_x, pm);
delta_y = _mm256_mul_ps(delta_y, pm);
delta_z = _mm256_mul_ps(delta_z, pm);
_mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_sub_ps(px, delta_x));
_mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_sub_ps(py, delta_y));
_mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z));
}
}
}
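// A minimal usage sketch (assumed driver, not part of the original gist). Setting
// up rest lengths, unit quaternions, inverse masses, spheres, etc. is left to
// code not shown here; dt must be > 0 since the solver divides by it.
static struct chain_sim g_sim __attribute__((aligned(32)));
void example_tick(void) {
const float fixed_dt = 1.0f / 60.0f; // fixed timestep keeps the dt^2 terms stable
simulate_chains(&g_sim, fixed_dt);
}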
// ----------------------------------------------------------------------------
// Variant 2: the same chain simulation with AVBD-style (Augmented Vertex Block
// Descent) constraint handling: per-constraint lambda and penalty stiffness,
// warm-started each frame. This is a separate translation unit from the code above.
// ----------------------------------------------------------------------------
#include <immintrin.h>
#include <math.h>
enum {
MAX_CHAINS = 64,
PARTICLES_PER_CHAIN = 16,
SPHERES_PER_CHAIN = 8,
MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN),
MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN),
CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN,
MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN),
MAX_ITERATIONS = 4,
};
struct chain_cfg {
float damping;
float friction;
float collision_radius;
float linear_inertia; // [0, 1], 1.0f = physically correct linear motion
float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation
float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force
float avbd_alpha; // Regularization parameter (e.g., 0.95f)
float avbd_beta; // Stiffness ramping control (e.g., 10.0f)
float avbd_gamma; // Warm starting scaling (e.g., 0.99f)
float avbd_k_start; // Initial stiffness for AVBD (e.g., 100.0f)
float avbd_k_max; // Maximum stiffness for AVBD (e.g., 1e6f)
} __attribute__((aligned(32)));
struct chain_sim {
unsigned short free_idx_cnt;
unsigned short free_idx[MAX_CHAINS];
struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32)));
// Chain global forces (world space gravity and wind)
float gravity_x[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_y[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_z[MAX_CHAINS] __attribute__((aligned(32)));
float wind_x[MAX_CHAINS] __attribute__((aligned(32)));
float wind_y[MAX_CHAINS] __attribute__((aligned(32)));
float wind_z[MAX_CHAINS] __attribute__((aligned(32)));
// Chain transformations (world space)
float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32)));
float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32)));
// Particle positions (model space)
float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
float inv_mass[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32)));
float dist_lambda[MAX_REST_LENGTHS] __attribute__((aligned(32)));
float dist_k_current[MAX_REST_LENGTHS] __attribute__((aligned(32)));
float dist_prev_frame_error[MAX_REST_LENGTHS] __attribute__((aligned(32)));
// Sphere positions (model space)
float sphere_x[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_y[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_z[MAX_SPHERES] __attribute__((aligned(32)));
float sphere_radius[MAX_SPHERES] __attribute__((aligned(32)));
float coll_lambda[MAX_PARTICLES + 1] __attribute__((aligned(32)));
float coll_k_current[MAX_PARTICLES + 1] __attribute__((aligned(32)));
float coll_prev_frame_error[MAX_PARTICLES + 1] __attribute__((aligned(32)));
// Rest positions (model space)
float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32)));
float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32)));
float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32)));
float motion_radius[MAX_PARTICLES] __attribute__((aligned(32)));
};
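// AVBD state per constraint: dist_lambda / coll_lambda accumulate the Lagrange
// multipliers, *_k_current hold the ramped penalty stiffness, and
// *_prev_frame_error presumably track the last frame's constraint error (not
// used in the portion shown). The coll_* arrays carry one padding element,
// mirroring the +1 dummy lane of the local particle buffers.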
void simulate_chains(struct chain_sim *cs, float dt) {
// Common SIMD constants
const __m256 dt_vec = _mm256_set1_ps(dt);
const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
const __m256 one_vec = _mm256_set1_ps(1.0f);
const __m256 neg_one_vec = _mm256_set1_ps(-1.0f);
const __m256 eps_vec = _mm256_set1_ps(1e-6f);
const __m256 inv_mass_clamp_min = _mm256_set1_ps(0.0f);
const __m256 inv_mass_clamp_max = _mm256_set1_ps(1e6f);
const __m256 zero_vec = _mm256_setzero_ps();
const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt);
const __m256 two_vec = _mm256_set1_ps(2.0f);
const __m256 half_vec = _mm256_set1_ps(0.5f);
const __m256 pi_over_2_vec = _mm256_set1_ps(1.5707963267948966f);
const __m256 acos_c1_vec = _mm256_set1_ps(-0.166666666666f);
const __m256 acos_c2_vec = _mm256_set1_ps(0.075000000000f);
const __m256 acos_c3_vec = _mm256_set1_ps(-0.044642857143f);
const __m256 sin_c1_vec = _mm256_set1_ps(-0.166666666666f);
const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f);
const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f);
const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f);
const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
// Local arrays for the AVBD solver
float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32)));
// Initialize dummy elements
px_local[PARTICLES_PER_CHAIN] = 0.0f;
py_local[PARTICLES_PER_CHAIN] = 0.0f;
pz_local[PARTICLES_PER_CHAIN] = 0.0f;
pm_local[PARTICLES_PER_CHAIN] = 0.0f;
// Stack arrays for precomputed data
float wind_local_x[MAX_CHAINS] __attribute__((aligned(32)));
float wind_local_y[MAX_CHAINS] __attribute__((aligned(32)));
float wind_local_z[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32)));
float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32)));
float vel_x[MAX_CHAINS] __attribute__((aligned(32)));
float vel_y[MAX_CHAINS] __attribute__((aligned(32)));
float vel_z[MAX_CHAINS] __attribute__((aligned(32)));
float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32)));
float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32)));
float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32)));
// Step 0: Precomputation (SIMD, 8 chains at once)
for (int c = 0; c < MAX_CHAINS; c += 8) {
// Load chain quaternions
__m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]);
__m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]);
__m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]);
__m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]);
// Compute local-space wind
{
__m256 vx = _mm256_load_ps(&cs->wind_x[c]);
__m256 vy = _mm256_load_ps(&cs->wind_y[c]);
__m256 vz = _mm256_load_ps(&cs->wind_z[c]);
// Create q_conjugate components (qw remains, qx, qy, qz are negated)
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
// Step 1: t = q_conj * v_world_as_quat
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// Step 2: result_vec = (t * q)_vec
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
// result_x = tw*qx + tx*qw + ty*qz - tz*qy
// result_y = tw*qy + ty*qw + tz*qx - tx*qz
// result_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));
_mm256_store_ps(&wind_local_x[c], result_x);
_mm256_store_ps(&wind_local_y[c], result_y);
_mm256_store_ps(&wind_local_z[c], result_z);
}
// Compute local-space gravity
{
__m256 vx = _mm256_load_ps(&cs->gravity_x[c]);
__m256 vy = _mm256_load_ps(&cs->gravity_y[c]);
__m256 vz = _mm256_load_ps(&cs->gravity_z[c]);
// Create q_conjugate components (qw remains, qx, qy, qz are negated)
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz
// Step 1: t = q_conj * v_world_as_quat
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz)
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz
// Step 2: result_vec = (t * q)_vec
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz)
// result_x = tw*qx + tx*qw + ty*qz - tz*qy
// result_y = tw*qy + ty*qw + tz*qx - tx*qz
// result_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx));
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x));
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy));
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y));
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz));
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z));
_mm256_store_ps(&gravity_local_x[c], result_x);
_mm256_store_ps(&gravity_local_y[c], result_y);
_mm256_store_ps(&gravity_local_z[c], result_z);
}
// Compute linear velocity
{
__m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]);
__m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]);
__m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]);
__m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]);
__m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]);
__m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]);
__m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec);
__m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec);
__m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec);
_mm256_store_ps(&vel_x[c], vel_x_vec);
_mm256_store_ps(&vel_y[c], vel_y_vec);
_mm256_store_ps(&vel_z[c], vel_z_vec);
}
// Compute angular velocity (SIMD with polynomial approximations)
{
__m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]);
__m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]);
__m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]);
__m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]);
// Quaternion inverse
__m256 norm_sq = _mm256_fmadd_ps(prev_qx, prev_qx, _mm256_mul_ps(prev_qy, prev_qy));
norm_sq = _mm256_fmadd_ps(prev_qz, prev_qz, norm_sq);
norm_sq = _mm256_fmadd_ps(prev_qw, prev_qw, norm_sq);
__m256 inv_norm_sq = _mm256_rcp_ps(_mm256_max_ps(norm_sq, eps_vec));
__m256 inv_qx = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qx), inv_norm_sq);
__m256 inv_qy = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qy), inv_norm_sq);
__m256 inv_qz = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qz), inv_norm_sq);
__m256 inv_qw = _mm256_mul_ps(prev_qw, inv_norm_sq);
// Delta quaternion: q_current * q_prev^-1
__m256 delta_qx = _mm256_fmadd_ps(qy, inv_qz, _mm256_mul_ps(qw, inv_qx));
delta_qx = _mm256_fnmadd_ps(qz, inv_qy, _mm256_fmadd_ps(qx, inv_qw, delta_qx));
__m256 delta_qy = _mm256_fmadd_ps(qy, inv_qw, _mm256_mul_ps(qw, inv_qy));
delta_qy = _mm256_fmadd_ps(qz, inv_qx, _mm256_fnmadd_ps(qx, inv_qz, delta_qy));
__m256 delta_qz = _mm256_fmadd_ps(qx, inv_qy, _mm256_mul_ps(qw, inv_qz));
delta_qz = _mm256_fnmadd_ps(qy, inv_qx, _mm256_fmadd_ps(qz, inv_qw, delta_qz));
__m256 delta_qw = _mm256_fnmadd_ps(qx, inv_qx, _mm256_mul_ps(qw, inv_qw));
delta_qw = _mm256_fnmadd_ps(qy, inv_qy, delta_qw);
delta_qw = _mm256_fnmadd_ps(qz, inv_qz, delta_qw);
// Angular velocity with polynomial approximations
__m256 qw_clamped = _mm256_max_ps(_mm256_min_ps(delta_qw, one_vec), neg_one_vec);
// Acos approximation: acos(x) ≈ π/2 - x * (1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3)))
__m256 x = qw_clamped;
__m256 x2 = _mm256_mul_ps(x, x);
__m256 x4 = _mm256_mul_ps(x2, x2);
__m256 poly = _mm256_fmadd_ps(x4, acos_c3_vec, acos_c2_vec);
poly = _mm256_fmadd_ps(x2, poly, acos_c1_vec);
poly = _mm256_fmadd_ps(x2, poly, one_vec);
poly = _mm256_mul_ps(x, poly);
__m256 angle = _mm256_mul_ps(two_vec, _mm256_sub_ps(pi_over_2_vec, poly));
// Sin approximation for sin(angle/2): sin(x) ≈ x * (1 + x^2 * (c1 + x^2 * c2))
__m256 half_angle = _mm256_mul_ps(angle, half_vec);
__m256 ha2 = _mm256_mul_ps(half_angle, half_angle);
__m256 sin_poly = _mm256_fmadd_ps(ha2, sin_c2_vec, sin_c1_vec);
sin_poly = _mm256_fmadd_ps(ha2, sin_poly, one_vec);
__m256 sin_half_angle = _mm256_mul_ps(half_angle, sin_poly);
__m256 inv_sin_half_angle = _mm256_rcp_ps(_mm256_max_ps(sin_half_angle, eps_vec));
// Sign: Use qw > 0 instead of angle < π
__m256 sign = _mm256_cmp_ps(qw_clamped, zero_vec, _CMP_GT_OQ);
sign = _mm256_or_ps(_mm256_and_ps(sign, one_vec), _mm256_andnot_ps(sign, neg_one_vec));
// Angular velocity: omega = axis * angle / dt, with axis = sign * delta_q_vec / sin(angle/2).
// The original omitted the angle factor, which would yield a unit-magnitude axis/dt.
__m256 scale = _mm256_mul_ps(sign, _mm256_mul_ps(angle, _mm256_mul_ps(inv_sin_half_angle, inv_dt_vec)));
__m256 ang_vel_x_vec = _mm256_mul_ps(delta_qx, scale);
__m256 ang_vel_y_vec = _mm256_mul_ps(delta_qy, scale);
__m256 ang_vel_z_vec = _mm256_mul_ps(delta_qz, scale);
_mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec);
_mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec);
_mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec);
}
}
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal)
for (int c = 0; c < MAX_CHAINS; c++) {
int base_idx = c * PARTICLES_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
// Load precomputed velocities and inertia parameters
__m256 vel_x_vec = _mm256_set1_ps(vel_x[c]);
__m256 vel_y_vec = _mm256_set1_ps(vel_y[c]);
__m256 vel_z_vec = _mm256_set1_ps(vel_z[c]);
__m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]);
__m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]);
__m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]);
__m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia);
__m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia);
__m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia);
// Clamp inertia parameters to [0, 1]
linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min);
angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min);
centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min);
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
__m256 py = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
__m256 pz = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
// Linear inertia: v * dt * linear_inertia
__m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec);
__m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec);
__m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec);
// Angular inertia: (ω × r) * dt * angular_inertia
__m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec);
ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec);
ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec);
// Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py));
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz));
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px));
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y));
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z));
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x));
// Calculate displacement: (ω × (ω × r)) * dt^2
__m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec);
__m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec);
__m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec);
// Apply the centrifugal_inertia factor
__m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec);
__m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec);
__m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec);
// Total delta
__m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x);
__m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y);
__m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z);
// Update positions
_mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x));
_mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y));
_mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z));
}
// Update previous transformation
cs->prev_chain_pos_x[c] = cs->chain_pos_x[c];
cs->prev_chain_pos_y[c] = cs->chain_pos_y[c];
cs->prev_chain_pos_z[c] = cs->chain_pos_z[c];
cs->prev_chain_quat_x[c] = cs->chain_quat_x[c];
cs->prev_chain_quat_y[c] = cs->chain_quat_y[c];
cs->prev_chain_quat_z[c] = cs->chain_quat_z[c];
cs->prev_chain_quat_w[c] = cs->chain_quat_w[c];
}
// Step 2: Verlet Integration
for (int c = 0; c < MAX_CHAINS; c++) {
int base_idx = c * PARTICLES_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
__m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]);
__m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]);
__m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]);
__m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]);
__m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]);
__m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]);
__m256 damping_vec = _mm256_set1_ps(cfg->damping);
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[base_idx + i]);
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[base_idx + i]);
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[base_idx + i]);
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]);
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[base_idx + i]);
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[base_idx + i]);
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[base_idx + i]);
__m256 force_x = _mm256_add_ps(gravity_x_vec, wind_x_vec);
__m256 force_y = _mm256_add_ps(gravity_y_vec, wind_y_vec);
__m256 force_z = _mm256_add_ps(gravity_z_vec, wind_z_vec);
__m256 acc_x = _mm256_mul_ps(force_x, p_inv_mass);
__m256 acc_y = _mm256_mul_ps(force_y, p_inv_mass);
__m256 acc_z = _mm256_mul_ps(force_z, p_inv_mass);
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec);
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec);
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec);
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec);
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec);
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec);
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x));
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y));
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z));
_mm256_store_ps(&cs->p_pos_x[base_idx + i], new_p_x);
_mm256_store_ps(&cs->p_pos_y[base_idx + i], new_p_y);
_mm256_store_ps(&cs->p_pos_z[base_idx + i], new_p_z);
_mm256_store_ps(&cs->prev_p_pos_x[base_idx + i], p_curr_x);
_mm256_store_ps(&cs->prev_p_pos_y[base_idx + i], p_curr_y);
_mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z);
}
}
// --- AVBD warm-starting of lambda and k_current for all distance constraints ---
for (int c = 0; c < MAX_CHAINS; ++c) {
struct chain_cfg* cfg = &cs->chain_configs[c];
__m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha);
__m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma);
__m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start);
// Loop through each block of 8 constraints for warm-starting
int r_base = c * CONSTRAINTS_PER_CHAIN; // Base index for this chain's constraints
for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += 8) {
// Load current lambda, k_current for 8 constraints simultaneously
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + i]);
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + i]);
// Apply warm-starting (Eq. 19 from paper)
// lambda_new = lambda_prev_frame * alpha * gamma
current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec));
// k_current_new = max(K_START, k_current_prev_frame * gamma)
current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec));
// Store updated warm-started values back
_mm256_store_ps(&cs->dist_lambda[r_base + i], current_lambda_vec);
_mm256_store_ps(&cs->dist_k_current[r_base + i], current_k_vec);
}
}
// Warm-starting for Collision Constraints ---
for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) {
struct chain_cfg* cfg = &cs->chain_configs[c_idx];
__m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha);
__m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma);
__m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start);
int p_base = c_idx * PARTICLES_PER_CHAIN; // Base index for this chain's particles
for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) {
__m256 current_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]);
__m256 current_k_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]);
current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec));
current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec));
_mm256_store_ps(&cs->coll_lambda[p_base + i], current_lambda_vec);
_mm256_store_ps(&cs->coll_k_current[p_base + i], current_k_vec);
}
}
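// Step 3: Distance Constraints (AVBD primal + dual updates over MAX_ITERATIONS)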
for (int c = 0; c < MAX_CHAINS; c++) {
struct chain_cfg *cfg = &cs->chain_configs[c];
__m256 alpha_vec = _mm256_set1_ps(cfg->avbd_alpha);
__m256 beta_vec = _mm256_set1_ps(cfg->avbd_beta);
__m256 k_max_vec = _mm256_set1_ps(cfg->avbd_k_max);
int p_base = c * PARTICLES_PER_CHAIN;
int r_base = c * CONSTRAINTS_PER_CHAIN;
// Load all current particle positions into local buffer (once per chain, before the iteration loop)
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
_mm256_store_ps(&px_local[i], _mm256_load_ps(&cs->p_pos_x[p_base + i]));
_mm256_store_ps(&py_local[i], _mm256_load_ps(&cs->p_pos_y[p_base + i]));
_mm256_store_ps(&pz_local[i], _mm256_load_ps(&cs->p_pos_z[p_base + i]));
__m256 pm_clamped = _mm256_load_ps(&cs->inv_mass[p_base + i]);
pm_clamped = _mm256_max_ps(_mm256_min_ps(pm_clamped, inv_mass_clamp_max), inv_mass_clamp_min);
_mm256_store_ps(&pm_local[i], pm_clamped);
}
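// The MAX_ITERATIONS relaxation passes below run purely on these local
// buffers; the results are written back to cs->p_pos_* once, after the loop.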
for (int iter = 0; iter < MAX_ITERATIONS; iter++) {
// First block (i=0 to 7 of particles, processing constraints 0 to 7)
{
// p0 is aligned from px_local[0]
__m256 p0x = _mm256_load_ps(&px_local[0]);
__m256 p0y = _mm256_load_ps(&py_local[0]);
__m256 p0z = _mm256_load_ps(&pz_local[0]);
__m256 p0m = _mm256_load_ps(&pm_local[0]);
// p1 is unaligned from px_local[1]
__m256 p1x = _mm256_loadu_ps(&px_local[1]);
__m256 p1y = _mm256_loadu_ps(&py_local[1]);
__m256 p1z = _mm256_loadu_ps(&pz_local[1]);
__m256 p1m = _mm256_loadu_ps(&pm_local[1]);
// This corresponds to rest_lengths[r_base + 0] through rest_lengths[r_base + 7]
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base]);
// Calculate inverse sum of masses (rcp_w_sum)
__m256 w_sum = _mm256_add_ps(p0m, p1m);
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec);
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped);
// Calculate current vector and distance between particles
__m256 dx_vec = _mm256_sub_ps(p1x, p0x);
__m256 dy_vec = _mm256_sub_ps(p1y, p0y);
__m256 dz_vec = _mm256_sub_ps(p1z, p0z);
__m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec);
dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq);
dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); // 1/dist
__m256 dist = _mm256_rcp_ps(inv_dist); // Current distance
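// Note: rcp(rsqrt(x)) approximates sqrt(x) to roughly 12 bits of precision.
// If full precision matters, _mm256_sqrt_ps / _mm256_div_ps are the exact
// (slower) alternatives. The same trick is reused throughout this file.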
// Primal Update for Distance Constraints (Block 1) ---
// Load AVBD state for this block of 8 constraints (indices r_base + 0 to r_base + 7)
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 0]);
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 0]);
__m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 0]);
// Original constraint error C_j*(x) = current_dist - target_length
__m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec);
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18)
__m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec));
// Total force term from the Augmented Lagrangian: k * C_reg + lambda.
__m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec);
// Scalar correction: dividing by k turns the force back into a displacement,
// and rcp_w_sum normalizes by the summed inverse masses so the per-particle
// inv-mass scaling below distributes the correction in the usual PBD way.
__m256 effective_corr_scalar_part = _mm256_mul_ps(_mm256_div_ps(force_term, current_k_vec), rcp_w_sum);
// Normalize difference vector
__m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist);
__m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist);
__m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist);
// Displacement vectors along the constraint direction. The sign of
// 'effective_corr_scalar_part' follows C(x): positive when stretched,
// negative when compressed, so p0 must move along +unit and p1 along -unit.
__m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part);
__m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part);
__m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part);
// Apply correction mask (avoid division-by-zero artifacts where dist is ~0)
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
delta_z = _mm256_and_ps(delta_z, apply_corr_mask);
// Update positions in local buffer: when stretched (C > 0), p0 moves toward
// p1 (add) and p1 moves toward p0 (subtract), each scaled by its inverse mass.
_mm256_store_ps(&px_local[0], _mm256_add_ps(p0x, _mm256_mul_ps(delta_x, p0m)));
_mm256_store_ps(&py_local[0], _mm256_add_ps(p0y, _mm256_mul_ps(delta_y, p0m)));
_mm256_store_ps(&pz_local[0], _mm256_add_ps(p0z, _mm256_mul_ps(delta_z, p0m)));
_mm256_storeu_ps(&px_local[1], _mm256_sub_ps(p1x, _mm256_mul_ps(delta_x, p1m)));
_mm256_storeu_ps(&py_local[1], _mm256_sub_ps(p1y, _mm256_mul_ps(delta_y, p1m)));
_mm256_storeu_ps(&pz_local[1], _mm256_sub_ps(p1z, _mm256_mul_ps(delta_z, p1m)));
// Dual Update for Distance Constraints (Block 1) ---
// Recalculate constraint error *after* position updates for accurate lambda update
// Reload updated positions from local buffer to get the 'post-primal' state
p0x = _mm256_load_ps(&px_local[0]);
p0y = _mm256_load_ps(&py_local[0]);
p0z = _mm256_load_ps(&pz_local[0]);
p1x = _mm256_loadu_ps(&px_local[1]);
p1y = _mm256_loadu_ps(&py_local[1]);
p1z = _mm256_loadu_ps(&pz_local[1]);
__m256 updated_dx = _mm256_sub_ps(p1x, p0x);
__m256 updated_dy = _mm256_sub_ps(p1y, p0y);
__m256 updated_dz = _mm256_sub_ps(p1z, p0z);
__m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx);
updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq);
updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq);
__m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec));
__m256 updated_dist = _mm256_rcp_ps(updated_inv_dist);
__m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
__m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec);
_mm256_store_ps(&cs->dist_lambda[r_base + 0], new_lambda_vec); // Store back
// Update k_current (Stiffness ramping, Eq. 12)
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
__m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask
__m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec);
__m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec);
_mm256_store_ps(&cs->dist_k_current[r_base + 0], new_k_vec); // Store back
}
// Second block (particles 8..15, real constraints 8..14; lane 7 would be
// the dummy constraint 15 and is masked out below)
{
// p0 is aligned from px_local[8]
__m256 p0x = _mm256_load_ps(&px_local[8]);
__m256 p0y = _mm256_load_ps(&py_local[8]);
__m256 p0z = _mm256_load_ps(&pz_local[8]);
__m256 p0m = _mm256_load_ps(&pm_local[8]);
// p1 is unaligned from px_local[9]; the load spans px_local[9..16], so it
// includes the dummy padding element at px_local[16] for the last lane.
__m256 p1x = _mm256_loadu_ps(&px_local[9]);
__m256 p1y = _mm256_loadu_ps(&py_local[9]);
__m256 p1z = _mm256_loadu_ps(&pz_local[9]);
__m256 p1m = _mm256_loadu_ps(&pm_local[9]);
// This corresponds to rest_lengths[r_base + 8] through rest_lengths[r_base + 15]
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + 8]);
__m256 dx_vec = _mm256_sub_ps(p1x, p0x);
__m256 dy_vec = _mm256_sub_ps(p1y, p0y);
__m256 dz_vec = _mm256_sub_ps(p1z, p0z);
__m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec);
dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq);
dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 w_sum = _mm256_add_ps(p0m, p1m);
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec);
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped);
__m256 dist = _mm256_rcp_ps(inv_dist);
// Primal Update for Distance Constraints (Block 2) ---
// Load AVBD state for this block of 8 constraints (indices r_base + 8 to r_base + 15)
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 8]);
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 8]);
__m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 8]);
// Original constraint error C_j*(x) = current_dist - target_length
__m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec);
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18)
__m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec));
// Total force term from the Augmented Lagrangian: k * C_reg + lambda.
__m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec);
// Scalar correction, mass-normalized as in the first block
__m256 effective_corr_scalar_part = _mm256_mul_ps(_mm256_div_ps(force_term, current_k_vec), rcp_w_sum);
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ);
// This 'valid_mask' disables lane 7 of the block: for PARTICLES_PER_CHAIN = 16
// this block covers constraints 8..15, and constraint 15 would pair particle 15
// with the dummy particle 16, so its correction must be zeroed out.
// The mask must be an all-ones / all-zeros *bit pattern*; a float -1.0f would
// not work here, since its bit pattern is not all ones.
__m256 valid_mask = _mm256_castsi256_ps(_mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1));
apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask);
__m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist);
__m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist);
__m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist);
__m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part);
__m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part);
__m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part);
delta_x = _mm256_and_ps(delta_x, apply_corr_mask);
delta_y = _mm256_and_ps(delta_y, apply_corr_mask);
delta_z = _mm256_and_ps(delta_z, apply_corr_mask);
// Update positions in local buffer (same sign convention as the first block)
_mm256_store_ps(&px_local[8], _mm256_add_ps(p0x, _mm256_mul_ps(delta_x, p0m)));
_mm256_store_ps(&py_local[8], _mm256_add_ps(p0y, _mm256_mul_ps(delta_y, p0m)));
_mm256_store_ps(&pz_local[8], _mm256_add_ps(p0z, _mm256_mul_ps(delta_z, p0m)));
_mm256_storeu_ps(&px_local[9], _mm256_sub_ps(p1x, _mm256_mul_ps(delta_x, p1m)));
_mm256_storeu_ps(&py_local[9], _mm256_sub_ps(p1y, _mm256_mul_ps(delta_y, p1m)));
_mm256_storeu_ps(&pz_local[9], _mm256_sub_ps(p1z, _mm256_mul_ps(delta_z, p1m)));
// Recalculate constraint error *after* position updates for accurate lambda update
// Reload updated positions from local buffer
p0x = _mm256_load_ps(&px_local[8]);
p0y = _mm256_load_ps(&py_local[8]);
p0z = _mm256_load_ps(&pz_local[8]);
p1x = _mm256_loadu_ps(&px_local[9]);
p1y = _mm256_loadu_ps(&py_local[9]);
p1z = _mm256_loadu_ps(&pz_local[9]);
__m256 updated_dx = _mm256_sub_ps(p1x, p0x);
__m256 updated_dy = _mm256_sub_ps(p1y, p0y);
__m256 updated_dz = _mm256_sub_ps(p1z, p0z);
__m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx);
updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq);
updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq);
__m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec));
__m256 updated_dist = _mm256_rcp_ps(updated_inv_dist);
__m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
__m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec);
_mm256_store_ps(&cs->dist_lambda[r_base + 8], new_lambda_vec); // Store back
// Update k_current (Stiffness ramping, Eq. 12)
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
__m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask
__m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec);
__m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec);
_mm256_store_ps(&cs->dist_k_current[r_base + 8], new_k_vec); // Store back
} // End of second block
}
// After all iterations, save the final positions from local buffer to global cs->p_pos_x/y/z
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
_mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_load_ps(&px_local[i]));
_mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_load_ps(&py_local[i]));
_mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_load_ps(&pz_local[i]));
}
}
// Step 4: Sphere Collisions with Friction
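// Every sphere of a chain is tested against all of the chain's particles in
// 8-wide blocks: resolve penetration positionally, damp tangential motion
// via friction, then run the AVBD dual update for the contact state.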
for (int c = 0; c < MAX_CHAINS; c++) {
int p_base = c * PARTICLES_PER_CHAIN;
int s_base = c * SPHERES_PER_CHAIN;
struct chain_cfg *cfg = &cs->chain_configs[c];
__m256 friction_vec = _mm256_set1_ps(cfg->friction);
for (int s = 0; s < SPHERES_PER_CHAIN; s++) {
__m256 sphere_x_vec = _mm256_set1_ps(cs->sphere_x[s_base + s]);
__m256 sphere_y_vec = _mm256_set1_ps(cs->sphere_y[s_base + s]);
__m256 sphere_z_vec = _mm256_set1_ps(cs->sphere_z[s_base + s]);
__m256 sphere_r_vec = _mm256_set1_ps(cs->sphere_radius[s_base + s]);
__m256 alpha_coll_vec = _mm256_set1_ps(cfg->avbd_alpha);
__m256 beta_coll_vec = _mm256_set1_ps(cfg->avbd_beta);
__m256 k_max_coll_vec = _mm256_set1_ps(cfg->avbd_k_max);
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]);
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]);
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]);
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[p_base + i]);
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min);
__m256 dx = _mm256_sub_ps(p_curr_x, sphere_x_vec);
__m256 dy = _mm256_sub_ps(p_curr_y, sphere_y_vec);
__m256 dz = _mm256_sub_ps(p_curr_z, sphere_z_vec);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 penetration = _mm256_sub_ps(sphere_r_vec, dist);
__m256 collision_mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);
__m256 normal_x = _mm256_mul_ps(dx, inv_dist);
__m256 normal_y = _mm256_mul_ps(dy, inv_dist);
__m256 normal_z = _mm256_mul_ps(dz, inv_dist);
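// Contact constraint C(x) = radius - |p - center|: positive means the
// particle is inside the sphere. 'normal' is the outward unit vector from
// the sphere center to the particle.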
// Load AVBD state for this particle block
__m256 coll_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]);
__m256 coll_k_current_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]);
__m256 coll_prev_frame_error_vec = _mm256_load_ps(&cs->coll_prev_frame_error[p_base + i]);
// AVBD Primal Update:
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame)
__m256 C_x_regularized = _mm256_sub_ps(penetration, _mm256_mul_ps(alpha_coll_vec, coll_prev_frame_error_vec));
// Apply collision_mask to the regularized error so only active constraints contribute.
C_x_regularized = _mm256_and_ps(C_x_regularized, collision_mask);
// Calculate force term: (k * C_reg + lambda)
__m256 force_term = _mm256_fmadd_ps(coll_k_current_vec, C_x_regularized, coll_lambda_vec);
// Also mask the force itself: without this, the warm-started lambda alone
// would keep pushing particles that are no longer in contact.
force_term = _mm256_and_ps(force_term, collision_mask);
// Correction magnitude: force_term * inv_mass / k_current. For a single
// particle against a static sphere, H_j_inv in the paper reduces to the
// particle's inverse mass, so the displacement acts along the contact normal.
__m256 correction_magnitude_scalar = _mm256_div_ps(_mm256_mul_ps(force_term, p_inv_mass), coll_k_current_vec);
// 'normal' points from the sphere center toward the particle, and a positive
// correction (penetration) must push the particle OUT of the sphere, so the
// correction is added to the position.
__m256 delta_pos_x = _mm256_mul_ps(normal_x, correction_magnitude_scalar);
__m256 delta_pos_y = _mm256_mul_ps(normal_y, correction_magnitude_scalar);
__m256 delta_pos_z = _mm256_mul_ps(normal_z, correction_magnitude_scalar);
// Update positions: p_curr = p_curr + delta_pos (particle moves out of penetration)
p_curr_x = _mm256_add_ps(p_curr_x, delta_pos_x);
p_curr_y = _mm256_add_ps(p_curr_y, delta_pos_y);
p_curr_z = _mm256_add_ps(p_curr_z, delta_pos_z);
_mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x);
_mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y);
_mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z);
// Apply friction (post-stabilization step)
// Recalculate velocity based on updated positions
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x);
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y);
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z);
// Project velocity onto normal to get normal component
__m256 vel_dot_normal = _mm256_mul_ps(vel_x, normal_x);
vel_dot_normal = _mm256_fmadd_ps(vel_y, normal_y, vel_dot_normal);
vel_dot_normal = _mm256_fmadd_ps(vel_z, normal_z, vel_dot_normal);
// Get tangential velocity component
__m256 tangent_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_normal, normal_x));
__m256 tangent_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_normal, normal_y));
__m256 tangent_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_normal, normal_z));
__m256 tangent_mag_sq = _mm256_mul_ps(tangent_x, tangent_x);
tangent_mag_sq = _mm256_fmadd_ps(tangent_y, tangent_y, tangent_mag_sq);
tangent_mag_sq = _mm256_fmadd_ps(tangent_z, tangent_z, tangent_mag_sq);
// Friction displacement limit (Coulomb-style; penetration depth acts as a
// proxy for the normal force)
__m256 friction_mag_limit = _mm256_mul_ps(penetration, friction_vec);
// Actual tangential displacement length
__m256 actual_tangent_mag = _mm256_sqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec));
// Scaling factor: fraction of the tangential motion to remove this step,
// clamped to 1.0 so friction can stop sliding but never reverse it.
__m256 friction_scale = _mm256_div_ps(friction_mag_limit, _mm256_max_ps(actual_tangent_mag, eps_vec));
friction_scale = _mm256_min_ps(friction_scale, one_vec);
// Apply friction as a positional reduction of the tangential displacement.
// `vel_x/y/z` are already the displacements from `p_prev` to `p_curr`, so
// `tangent_x/y/z` is the sliding motion of this step; the fraction
// `friction_scale` of it is removed below.
__m256 friction_x_corr = _mm256_mul_ps(tangent_x, friction_scale);
__m256 friction_y_corr = _mm256_mul_ps(tangent_y, friction_scale);
__m256 friction_z_corr = _mm256_mul_ps(tangent_z, friction_scale);
// Only apply if actively colliding
friction_x_corr = _mm256_and_ps(friction_x_corr, collision_mask);
friction_y_corr = _mm256_and_ps(friction_y_corr, collision_mask);
friction_z_corr = _mm256_and_ps(friction_z_corr, collision_mask);
// Apply friction by moving the particle against the sliding direction.
// friction_scale = 0 leaves it untouched; friction_scale = 1 removes the
// entire tangential displacement of this step.
p_curr_x = _mm256_sub_ps(p_curr_x, friction_x_corr);
p_curr_y = _mm256_sub_ps(p_curr_y, friction_y_corr);
p_curr_z = _mm256_sub_ps(p_curr_z, friction_z_corr);
_mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x);
_mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y);
_mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z);
// Dual Update for Collision Constraints ---
// Recalculate collision error *after* position updates (normal & friction)
// Reload updated positions
p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
dx = _mm256_sub_ps(p_curr_x, sphere_x_vec);
dy = _mm256_sub_ps(p_curr_y, sphere_y_vec);
dz = _mm256_sub_ps(p_curr_z, sphere_z_vec);
dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
dist = _mm256_rcp_ps(inv_dist);
__m256 C_x_updated_collision = _mm256_sub_ps(sphere_r_vec, dist); // New error after primal update
// Apply the collision mask again for updates, so only active constraints update their AVBD state
C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, collision_mask);
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated))
__m256 new_coll_lambda_vec = _mm256_fmadd_ps(coll_k_current_vec, C_x_updated_collision, coll_lambda_vec);
_mm256_store_ps(&cs->coll_lambda[p_base + i], new_coll_lambda_vec);
// Update k_current (Stiffness ramping, Eq. 12)
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|)
__m256 abs_C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, abs_mask_ps); // Absolute value
__m256 k_updated_coll_vec = _mm256_fmadd_ps(beta_coll_vec, abs_C_x_updated_collision, coll_k_current_vec);
__m256 new_coll_k_vec = _mm256_min_ps(k_max_coll_vec, k_updated_coll_vec);
_mm256_store_ps(&cs->coll_k_current[p_base + i], new_coll_k_vec);
// --- END AVBD ADDITION ---
}
}
}
// Step 5: Motion Constraints
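// Each particle is confined to a sphere of motion_radius around its rest
// position: when |p - rest| exceeds the radius, the particle is pulled back
// along the offset, scaled by its clamped inverse mass.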
for (int c = 0; c < MAX_CHAINS; c++) {
int p_base = c * PARTICLES_PER_CHAIN;
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) {
__m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
__m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
__m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
__m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]);
pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min);
__m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]);
__m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]);
__m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]);
__m256 radius_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]);
__m256 dx = _mm256_sub_ps(px, rx);
__m256 dy = _mm256_sub_ps(py, ry);
__m256 dz = _mm256_sub_ps(pz, rz);
__m256 dist_sq = _mm256_mul_ps(dx, dx);
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq);
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 penetration = _mm256_sub_ps(dist, radius_vec);
__m256 mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ);
__m256 corr_factor = _mm256_mul_ps(penetration, inv_dist);
corr_factor = _mm256_and_ps(corr_factor, mask);
__m256 delta_x = _mm256_mul_ps(dx, corr_factor);
__m256 delta_y = _mm256_mul_ps(dy, corr_factor);
__m256 delta_z = _mm256_mul_ps(dz, corr_factor);
delta_x = _mm256_mul_ps(delta_x, pm);
delta_y = _mm256_mul_ps(delta_y, pm);
delta_z = _mm256_mul_ps(delta_z, pm);
_mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_sub_ps(px, delta_x));
_mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_sub_ps(py, delta_y));
_mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z));
}
}
// Store Final Errors for Next Frame's Regularization ---
// This happens once at the very end of simulate_chains, after all iterations are complete for the frame.
for (int c = 0; c < MAX_CHAINS; ++c) {
int p_base = c * PARTICLES_PER_CHAIN;
int r_base = c * CONSTRAINTS_PER_CHAIN;
// Loop through each block of 8 constraints (distance constraints are N-1 for
// N particles; lane 7 of the second block is the dummy constraint 15, and for
// the last chain its p1 load touches one float past the particle arrays, so
// they need a float of tail padding just like px_local's dummy element).
for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += VEC_FLOAT_COUNT) {
// Calculate the current error for this distance constraint block
// (Similar to C_x_original calculation from Step 3, but using final positions)
// Load final positions for p0 and p1 (from global arrays now, as local buffer is stale)
__m256 px0 = _mm256_load_ps(&cs->p_pos_x[p_base + i + 0]);
__m256 py0 = _mm256_load_ps(&cs->p_pos_y[p_base + i + 0]);
__m256 pz0 = _mm256_load_ps(&cs->p_pos_z[p_base + i + 0]);
__m256 px1 = _mm256_loadu_ps(&cs->p_pos_x[p_base + i + 1]); // +1 offset is not 32-byte aligned, so loadu is required
__m256 py1 = _mm256_loadu_ps(&cs->p_pos_y[p_base + i + 1]);
__m256 pz1 = _mm256_loadu_ps(&cs->p_pos_z[p_base + i + 1]);
__m256 diff_x = _mm256_sub_ps(px1, px0);
__m256 diff_y = _mm256_sub_ps(py1, py0);
__m256 diff_z = _mm256_sub_ps(pz1, pz0);
__m256 dist_sq = _mm256_mul_ps(diff_x, diff_x);
dist_sq = _mm256_fmadd_ps(diff_y, diff_y, dist_sq);
dist_sq = _mm256_fmadd_ps(diff_z, diff_z, dist_sq);
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec));
__m256 dist = _mm256_rcp_ps(inv_dist);
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + i]);
__m256 final_error_vec = _mm256_sub_ps(dist, rest_len_vec);
_mm256_store_ps(&cs->dist_prev_frame_error[r_base + i], final_error_vec); // Store for next frame's regularization
}
}
for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) {
int p_base = c_idx * PARTICLES_PER_CHAIN;
int s_base = c_idx * SPHERES_PER_CHAIN; // For accessing sphere data for this chain
for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) {
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]);
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]);
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]);
__m256 deepest_penetration_vec = neg_flt_max_vec; // Start from -FLT_MAX; clamped to >= 0 by the max() below
// Loop through all spheres for this chain to find the max penetration for each particle in the block
for (int s_idx = 0; s_idx < SPHERES_PER_CHAIN; ++s_idx) {
__m256 s_x = _mm256_set1_ps(cs->sphere_x[s_base + s_idx]);
__m256 s_y = _mm256_set1_ps(cs->sphere_y[s_base + s_idx]);
__m256 s_z = _mm256_set1_ps(cs->sphere_z[s_base + s_idx]);
__m256 s_r = _mm256_set1_ps(cs->sphere_radius[s_base + s_idx]);
__m256 dx_s = _mm256_sub_ps(p_curr_x, s_x);
__m256 dy_s = _mm256_sub_ps(p_curr_y, s_y);
__m256 dz_s = _mm256_sub_ps(p_curr_z, s_z);
__m256 dist_sq_s = _mm256_mul_ps(dx_s, dx_s);
dist_sq_s = _mm256_fmadd_ps(dy_s, dy_s, dist_sq_s);
dist_sq_s = _mm256_fmadd_ps(dz_s, dz_s, dist_sq_s);
__m256 inv_dist_s = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq_s, eps_vec));
__m256 dist_s = _mm256_rcp_ps(inv_dist_s);
__m256 current_penetration_s = _mm256_sub_ps(s_r, dist_s); // C(x) = radius - distance
// We only care about positive penetration (i.e., actually colliding)
current_penetration_s = _mm256_max_ps(zero_vec, current_penetration_s);
deepest_penetration_vec = _mm256_max_ps(deepest_penetration_vec, current_penetration_s);
}
_mm256_store_ps(&cs->coll_prev_frame_error[p_base + i], deepest_penetration_vec); // Store for next frame's regularization
}
}
}
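/* --- Illustrative usage sketch (an assumption, not part of the gist) ---
 * `simulate_chains` is the entry point named in the comments above; its
 * exact signature and the fixed 60 Hz timestep are assumed here. */
#ifdef CHAIN_SIM_EXAMPLE_MAIN
#include <string.h>
int main(void) {
  static struct chain_sim cs; /* static: the struct is too large for the stack */
  memset(&cs, 0, sizeof(cs));
  /* one possible pool setup: mark every chain slot as free */
  cs.free_idx_cnt = MAX_CHAINS;
  for (int i = 0; i < MAX_CHAINS; ++i) {
    cs.free_idx[i] = (unsigned short)i;
  }
  /* ... allocate chains, set configs, rest lengths, masses, spheres ... */
  for (int frame = 0; frame < 600; ++frame) {
    simulate_chains(&cs, 1.0f / 60.0f); /* assumed signature: (struct chain_sim *, float dt) */
  }
  return 0;
}
#endif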