chain simulation
#include <immintrin.h> | |
#include <math.h> | |
enum { | |
MAX_CHAINS = 64, | |
PARTICLES_PER_CHAIN = 16, | |
SPHERES_PER_CHAIN = 8, | |
MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN), | |
MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN), | |
CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN, | |
MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN), | |
MAX_ITERATIONS = 4, | |
}; | |
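// Index layout: particle i of chain c lives at (c * PARTICLES_PER_CHAIN + i), constraint j at
// (c * CONSTRAINTS_PER_CHAIN + j), and sphere s at (c * SPHERES_PER_CHAIN + s); all arrays below
// are structure-of-arrays and 32-byte aligned for the AVX loads used in simulate_chains.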
struct chain_cfg { | |
float damping; | |
float stiffness; | |
float stretch_factor; | |
float max_strain; | |
float friction; | |
float collision_radius; | |
float linear_inertia; // [0, 1], 1.0f = physically correct linear motion | |
float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation | |
float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force | |
} __attribute__((aligned(32))); | |
struct chain_sim { | |
unsigned short free_idx_cnt; | |
unsigned short free_idx[MAX_CHAINS]; | |
struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32))); | |
// Chain global forces (world space gravity and wind) | |
float gravity_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_z[MAX_CHAINS] __attribute__((aligned(32))); | |
// Chain transformations (world space) | |
float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); | |
// Particle positions (model space) | |
float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); | |
float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); | |
float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); | |
float inv_mass[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); | |
float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32))); | |
// Sphere positions (model space) | |
float sphere_x[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_y[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_z[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_radius[MAX_SPHERES] __attribute__((aligned(32))); | |
// Rest positions (model space) | |
float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); | |
float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); | |
float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); | |
float motion_radius[MAX_PARTICLES] __attribute__((aligned(32))); | |
}; | |
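// Usage sketch (illustrative assumption, not an API defined by this gist): one plausible way to
// fill a single chain in a zero-initialized chain_sim and step it once per frame. The helper
// name and every constant below are invented for illustration.
static void example_setup_chain(struct chain_sim *cs, int c) {
    int base = c * PARTICLES_PER_CHAIN;
    cs->chain_configs[c] = (struct chain_cfg){
        .damping = 0.98f, .stiffness = 1.0f, .stretch_factor = 1.0f, .max_strain = 0.1f,
        .friction = 0.3f, .collision_radius = 0.05f,
        .linear_inertia = 1.0f, .angular_inertia = 1.0f, .centrifugal_inertia = 1.0f,
    };
    cs->gravity_y[c] = -9.81f;
    cs->chain_quat_w[c] = 1.0f;        // identity orientation
    cs->prev_chain_quat_w[c] = 1.0f;
    for (int i = 0; i < PARTICLES_PER_CHAIN; i++) {
        float y = -0.1f * (float)i;    // hang the chain straight down along -Y
        cs->p_pos_y[base + i] = y;
        cs->prev_p_pos_y[base + i] = y;
        cs->rest_pos_y[base + i] = y;
        cs->inv_mass[base + i] = (i == 0) ? 0.0f : 1.0f; // pin the root particle
        cs->motion_radius[base + i] = 1.0f;
        cs->rest_lengths[c * CONSTRAINTS_PER_CHAIN + i] = 0.1f;
    }
}
// Per frame: simulate_chains(cs, 1.0f / 60.0f);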
void simulate_chains(struct chain_sim *cs, float dt) { | |
// Common SIMD constants | |
const __m256 dt_vec = _mm256_set1_ps(dt); | |
const __m256 dt2_vec = _mm256_set1_ps(dt * dt); | |
const __m256 one_vec = _mm256_set1_ps(1.0f); | |
const __m256 neg_one_vec = _mm256_set1_ps(-1.0f); | |
const __m256 eps_vec = _mm256_set1_ps(1e-6f); | |
const __m256 inv_mass_clamp_min = _mm256_set1_ps(0.0f); | |
const __m256 inv_mass_clamp_max = _mm256_set1_ps(1e6f); | |
const __m256 zero_vec = _mm256_setzero_ps(); | |
const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt); | |
const __m256 two_vec = _mm256_set1_ps(2.0f); | |
const __m256 half_vec = _mm256_set1_ps(0.5f); | |
const __m256 pi_over_2_vec = _mm256_set1_ps(1.5707963267948966f); | |
const __m256 acos_c1_vec = _mm256_set1_ps(-0.166666666666f); | |
const __m256 acos_c2_vec = _mm256_set1_ps(0.075000000000f); | |
const __m256 acos_c3_vec = _mm256_set1_ps(-0.044642857143f); | |
const __m256 sin_c1_vec = _mm256_set1_ps(-0.166666666666f); | |
const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f); | |
const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); | |
const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); | |
const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); | |
// Local arrays for PBD solver | |
float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
// Initialize dummy elements | |
px_local[PARTICLES_PER_CHAIN] = 0.0f; | |
py_local[PARTICLES_PER_CHAIN] = 0.0f; | |
pz_local[PARTICLES_PER_CHAIN] = 0.0f; | |
pm_local[PARTICLES_PER_CHAIN] = 0.0f; | |
// Stack arrays for precomputed data | |
float wind_local_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_local_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_local_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float vel_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float vel_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float vel_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32))); | |
// Step 0: Precomputation (SIMD, 8 chains at once) | |
for (int c = 0; c < MAX_CHAINS; c += 8) { | |
// Load chain quaternions | |
__m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]); | |
__m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]); | |
__m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]); | |
__m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]); | |
// Compute local-space wind | |
{ | |
__m256 vx = _mm256_load_ps(&cs->wind_x[c]); | |
__m256 vy = _mm256_load_ps(&cs->wind_y[c]); | |
__m256 vz = _mm256_load_ps(&cs->wind_z[c]); | |
// Create q_conjugate components (qw remains, qx, qy, qz are negated) | |
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx | |
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy | |
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz | |
// Step 1: t = q_conj * v_world_as_quat | |
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) | |
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz | |
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx | |
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy | |
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx | |
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy | |
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// Step 2: result_vec = (t * q)_vec | |
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz) | |
// result_x = tw*qx + tx*qw + ty*qz - tz*qy | |
// result_y = tw*qy + ty*qw + tz*qx - tx*qz | |
// result_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); | |
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x)); | |
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); | |
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y)); | |
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); | |
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z)); | |
_mm256_store_ps(&wind_local_x[c], result_x); | |
_mm256_store_ps(&wind_local_y[c], result_y); | |
_mm256_store_ps(&wind_local_z[c], result_z); | |
} | |
// Compute local-space gravity | |
{ | |
__m256 vx = _mm256_load_ps(&cs->gravity_x[c]); | |
__m256 vy = _mm256_load_ps(&cs->gravity_y[c]); | |
__m256 vz = _mm256_load_ps(&cs->gravity_z[c]); | |
// Create q_conjugate components (qw remains, qx, qy, qz are negated) | |
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx | |
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy | |
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz | |
// Step 1: t = q_conj * v_world_as_quat | |
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) | |
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz | |
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx | |
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy | |
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx | |
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy | |
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// Step 2: result_vec = (t * q)_vec | |
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz) | |
// result_x = tw*qx + tx*qw + ty*qz - tz*qy | |
// result_y = tw*qy + ty*qw + tz*qx - tx*qz | |
// result_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); | |
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x)); | |
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); | |
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y)); | |
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); | |
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z)); | |
_mm256_store_ps(&gravity_local_x[c], result_x); | |
_mm256_store_ps(&gravity_local_y[c], result_y); | |
_mm256_store_ps(&gravity_local_z[c], result_z); | |
} | |
// Compute linear velocity | |
{ | |
__m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]); | |
__m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]); | |
__m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]); | |
__m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]); | |
__m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]); | |
__m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]); | |
__m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec); | |
__m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec); | |
__m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec); | |
_mm256_store_ps(&vel_x[c], vel_x_vec); | |
_mm256_store_ps(&vel_y[c], vel_y_vec); | |
_mm256_store_ps(&vel_z[c], vel_z_vec); | |
} | |
// Compute angular velocity (SIMD with polynomial approximations) | |
{ | |
__m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]); | |
__m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]); | |
__m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]); | |
__m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]); | |
// Quaternion inverse | |
__m256 norm_sq = _mm256_fmadd_ps(prev_qx, prev_qx, _mm256_mul_ps(prev_qy, prev_qy)); | |
norm_sq = _mm256_fmadd_ps(prev_qz, prev_qz, norm_sq); | |
norm_sq = _mm256_fmadd_ps(prev_qw, prev_qw, norm_sq); | |
__m256 inv_norm_sq = _mm256_rcp_ps(_mm256_max_ps(norm_sq, eps_vec)); | |
__m256 inv_qx = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qx), inv_norm_sq); | |
__m256 inv_qy = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qy), inv_norm_sq); | |
__m256 inv_qz = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qz), inv_norm_sq); | |
__m256 inv_qw = _mm256_mul_ps(prev_qw, inv_norm_sq); | |
// Delta quaternion: q_current * q_prev^-1 | |
__m256 delta_qx = _mm256_fmadd_ps(qy, inv_qz, _mm256_mul_ps(qw, inv_qx)); | |
delta_qx = _mm256_fnmadd_ps(qz, inv_qy, _mm256_fmadd_ps(qx, inv_qw, delta_qx)); | |
__m256 delta_qy = _mm256_fmadd_ps(qy, inv_qw, _mm256_mul_ps(qw, inv_qy)); | |
delta_qy = _mm256_fmadd_ps(qz, inv_qx, _mm256_fnmadd_ps(qx, inv_qz, delta_qy)); | |
__m256 delta_qz = _mm256_fmadd_ps(qx, inv_qy, _mm256_mul_ps(qw, inv_qz)); | |
delta_qz = _mm256_fnmadd_ps(qy, inv_qx, _mm256_fmadd_ps(qz, inv_qw, delta_qz)); | |
__m256 delta_qw = _mm256_fnmadd_ps(qx, inv_qx, _mm256_mul_ps(qw, inv_qw)); | |
delta_qw = _mm256_fnmadd_ps(qy, inv_qy, delta_qw); | |
delta_qw = _mm256_fnmadd_ps(qz, inv_qz, delta_qw); | |
// Angular velocity with polynomial approximations | |
__m256 qw_clamped = _mm256_max_ps(_mm256_min_ps(delta_qw, one_vec), neg_one_vec); | |
// Acos approximation: acos(x) ≈ π/2 - x * (1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))) | |
__m256 x = qw_clamped; | |
__m256 x2 = _mm256_mul_ps(x, x); | |
__m256 x4 = _mm256_mul_ps(x2, x2); | |
__m256 poly = _mm256_fmadd_ps(x4, acos_c3_vec, acos_c2_vec); | |
poly = _mm256_fmadd_ps(x2, poly, acos_c1_vec); | |
poly = _mm256_fmadd_ps(x2, poly, one_vec); | |
poly = _mm256_mul_ps(x, poly); | |
__m256 angle = _mm256_mul_ps(two_vec, _mm256_sub_ps(pi_over_2_vec, poly)); | |
// Sin approximation for sin(angle/2): sin(x) ≈ x * (1 + x^2 * (c1 + x^2 * c2)) | |
__m256 half_angle = _mm256_mul_ps(angle, half_vec); | |
__m256 ha2 = _mm256_mul_ps(half_angle, half_angle); | |
__m256 sin_poly = _mm256_fmadd_ps(ha2, sin_c2_vec, sin_c1_vec); | |
sin_poly = _mm256_fmadd_ps(ha2, sin_poly, one_vec); | |
__m256 sin_half_angle = _mm256_mul_ps(half_angle, sin_poly); | |
__m256 inv_sin_half_angle = _mm256_rcp_ps(_mm256_max_ps(sin_half_angle, eps_vec)); | |
// Sign: Use qw > 0 instead of angle < π | |
__m256 sign = _mm256_cmp_ps(qw_clamped, zero_vec, _CMP_GT_OQ); | |
sign = _mm256_or_ps(_mm256_and_ps(sign, one_vec), _mm256_andnot_ps(sign, neg_one_vec)); | |
// Angular velocity: sign * delta_q_xyz * inv_sin_half_angle / dt
__m256 scale = _mm256_mul_ps(sign, _mm256_mul_ps(inv_sin_half_angle, inv_dt_vec)); | |
__m256 ang_vel_x_vec = _mm256_mul_ps(delta_qx, scale); | |
__m256 ang_vel_y_vec = _mm256_mul_ps(delta_qy, scale); | |
__m256 ang_vel_z_vec = _mm256_mul_ps(delta_qz, scale); | |
_mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec); | |
_mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec); | |
_mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec); | |
} | |
} | |
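// Note on Step 0: wind and gravity are rotated from world space into chain-local space via
// q_conj * v * q, linear velocity comes from finite-differencing the chain position, and angular
// velocity is extracted from the delta quaternion with short acos/sin polynomials plus
// _mm256_rcp_ps / _mm256_rsqrt_ps, so these are fast approximations rather than exact values
// (the rcp/rsqrt instructions alone are only good to about 12 bits).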
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int base_idx = c * PARTICLES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
// Load precomputed velocities and inertia parameters | |
__m256 vel_x_vec = _mm256_set1_ps(vel_x[c]); | |
__m256 vel_y_vec = _mm256_set1_ps(vel_y[c]); | |
__m256 vel_z_vec = _mm256_set1_ps(vel_z[c]); | |
__m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]); | |
__m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]); | |
__m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]); | |
__m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia); | |
__m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia); | |
__m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia); | |
// Clamp inertia parameters to [0, 1] | |
linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]); | |
__m256 py = _mm256_load_ps(&cs->p_pos_y[base_idx + i]); | |
__m256 pz = _mm256_load_ps(&cs->p_pos_z[base_idx + i]); | |
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]); | |
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); | |
// Linear inertia: v * dt * linear_inertia | |
__m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec); | |
__m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec); | |
__m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec); | |
// Angular inertia: (ω × r) * dt * angular_inertia | |
__m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); | |
__m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); | |
__m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); | |
ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec); | |
ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec); | |
ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec); | |
// Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia | |
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); | |
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); | |
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); | |
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y)); | |
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z)); | |
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x)); | |
// Calculate displacement: (ω × (ω × r)) * dt^2 | |
__m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec); | |
__m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec); | |
__m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec); | |
// Apply the centrifugal_inertia factor | |
__m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec); | |
__m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec); | |
__m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec); | |
// Total delta | |
__m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x); | |
__m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y); | |
__m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z); | |
// Update positions | |
_mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x)); | |
_mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y)); | |
_mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z)); | |
} | |
// Update previous transformation | |
cs->prev_chain_pos_x[c] = cs->chain_pos_x[c]; | |
cs->prev_chain_pos_y[c] = cs->chain_pos_y[c]; | |
cs->prev_chain_pos_z[c] = cs->chain_pos_z[c]; | |
cs->prev_chain_quat_x[c] = cs->chain_quat_x[c]; | |
cs->prev_chain_quat_y[c] = cs->chain_quat_y[c]; | |
cs->prev_chain_quat_z[c] = cs->chain_quat_z[c]; | |
cs->prev_chain_quat_w[c] = cs->chain_quat_w[c]; | |
} | |
// Step 2: Verlet Integration | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int base_idx = c * PARTICLES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]); | |
__m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]); | |
__m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]); | |
__m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]); | |
__m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]); | |
__m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]); | |
__m256 damping_vec = _mm256_set1_ps(cfg->damping); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[base_idx + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[base_idx + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[base_idx + i]); | |
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]); | |
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[base_idx + i]); | |
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[base_idx + i]); | |
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[base_idx + i]); | |
__m256 force_x = _mm256_add_ps(gravity_x_vec, wind_x_vec); | |
__m256 force_y = _mm256_add_ps(gravity_y_vec, wind_y_vec); | |
__m256 force_z = _mm256_add_ps(gravity_z_vec, wind_z_vec); | |
__m256 acc_x = _mm256_mul_ps(force_x, p_inv_mass); | |
__m256 acc_y = _mm256_mul_ps(force_y, p_inv_mass); | |
__m256 acc_z = _mm256_mul_ps(force_z, p_inv_mass); | |
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec); | |
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec); | |
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec); | |
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec); | |
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec); | |
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec); | |
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x)); | |
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y)); | |
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z)); | |
_mm256_store_ps(&cs->p_pos_x[base_idx + i], new_p_x); | |
_mm256_store_ps(&cs->p_pos_y[base_idx + i], new_p_y); | |
_mm256_store_ps(&cs->p_pos_z[base_idx + i], new_p_z); | |
_mm256_store_ps(&cs->prev_p_pos_x[base_idx + i], p_curr_x); | |
_mm256_store_ps(&cs->prev_p_pos_y[base_idx + i], p_curr_y); | |
_mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z); | |
} | |
} | |
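// Note on Step 2: this is damped position Verlet, x_new = x + damping * (x - x_prev) + (F * inv_mass) * dt^2;
// velocity is implicit in the difference between the current and previous positions, and the previous
// positions are refreshed here before the constraint passes below.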
// Step 3: Distance Constraints | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int r_base = c * CONSTRAINTS_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 stiffness_vec = _mm256_set1_ps(cfg->stiffness); | |
__m256 stretch_factor_vec = _mm256_set1_ps(cfg->stretch_factor); | |
__m256 max_strain_vec_abs = _mm256_set1_ps(cfg->max_strain); | |
__m256 neg_max_strain_vec_abs = _mm256_mul_ps(max_strain_vec_abs, neg_one_vec); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]); | |
pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min); | |
_mm256_store_ps(&px_local[i], px); | |
_mm256_store_ps(&py_local[i], py); | |
_mm256_store_ps(&pz_local[i], pz); | |
_mm256_store_ps(&pm_local[i], pm); | |
} | |
for (int iter = 0; iter < MAX_ITERATIONS; iter++) { | |
// First block (i=0 to 7) | |
{ | |
__m256 p0x = _mm256_load_ps(&px_local[0]); | |
__m256 p0y = _mm256_load_ps(&py_local[0]); | |
__m256 p0z = _mm256_load_ps(&pz_local[0]); | |
__m256 p0m = _mm256_load_ps(&pm_local[0]); | |
__m256 p1x = _mm256_loadu_ps(&px_local[1]); | |
__m256 p1y = _mm256_loadu_ps(&py_local[1]); | |
__m256 p1z = _mm256_loadu_ps(&pz_local[1]); | |
__m256 p1m = _mm256_loadu_ps(&pm_local[1]); | |
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base]); | |
__m256 dx = _mm256_sub_ps(p1x, p0x); | |
__m256 dy = _mm256_sub_ps(p1y, p0y); | |
__m256 dz = _mm256_sub_ps(p1z, p0z); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 w_sum = _mm256_add_ps(p0m, p1m); | |
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); | |
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 diff = _mm256_sub_ps(dist, rest_len_vec); | |
__m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec); | |
__m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps)); | |
__m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs)); | |
__m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps)); | |
__m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom)); | |
__m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec); | |
corr_scalar_part = _mm256_mul_ps(corr_scalar_part, rcp_w_sum); | |
__m256 corr_magnitude = _mm256_mul_ps(diff, corr_scalar_part); | |
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
__m256 delta_unit_x = _mm256_mul_ps(dx, inv_dist); | |
__m256 delta_unit_y = _mm256_mul_ps(dy, inv_dist); | |
__m256 delta_unit_z = _mm256_mul_ps(dz, inv_dist); | |
__m256 delta_x = _mm256_mul_ps(delta_unit_x, corr_magnitude); | |
__m256 delta_y = _mm256_mul_ps(delta_unit_y, corr_magnitude); | |
__m256 delta_z = _mm256_mul_ps(delta_unit_z, corr_magnitude); | |
delta_x = _mm256_and_ps(delta_x, apply_corr_mask); | |
delta_y = _mm256_and_ps(delta_y, apply_corr_mask); | |
delta_z = _mm256_and_ps(delta_z, apply_corr_mask); | |
_mm256_store_ps(&px_local[0], _mm256_fmadd_ps(delta_x, p0m, p0x)); | |
_mm256_store_ps(&py_local[0], _mm256_fmadd_ps(delta_y, p0m, p0y)); | |
_mm256_store_ps(&pz_local[0], _mm256_fmadd_ps(delta_z, p0m, p0z)); | |
_mm256_storeu_ps(&px_local[1], _mm256_fnmadd_ps(delta_x, p1m, p1x));
_mm256_storeu_ps(&py_local[1], _mm256_fnmadd_ps(delta_y, p1m, p1y));
_mm256_storeu_ps(&pz_local[1], _mm256_fnmadd_ps(delta_z, p1m, p1z));
} | |
// Second block (i=8 to 14) | |
{ | |
__m256 p0x = _mm256_load_ps(&px_local[8]); | |
__m256 p0y = _mm256_load_ps(&py_local[8]); | |
__m256 p0z = _mm256_load_ps(&pz_local[8]); | |
__m256 p0m = _mm256_load_ps(&pm_local[8]); | |
__m256 p1x = _mm256_loadu_ps(&px_local[9]); | |
__m256 p1y = _mm256_loadu_ps(&py_local[9]); | |
__m256 p1z = _mm256_loadu_ps(&pz_local[9]); | |
__m256 p1m = _mm256_loadu_ps(&pm_local[9]); | |
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + 8]); | |
__m256 dx = _mm256_sub_ps(p1x, p0x); | |
__m256 dy = _mm256_sub_ps(p1y, p0y); | |
__m256 dz = _mm256_sub_ps(p1z, p0z); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 w_sum = _mm256_add_ps(p0m, p1m); | |
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); | |
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 diff = _mm256_sub_ps(dist, rest_len_vec); | |
__m256 max_rest_len_eps = _mm256_max_ps(rest_len_vec, eps_vec); | |
__m256 strain = _mm256_mul_ps(diff, _mm256_rcp_ps(max_rest_len_eps)); | |
__m256 strain_clamped = _mm256_max_ps(neg_max_strain_vec_abs, _mm256_min_ps(strain, max_strain_vec_abs)); | |
__m256 dynamic_stiffness_denom = _mm256_add_ps(one_vec, _mm256_and_ps(strain_clamped, abs_mask_ps)); | |
__m256 dynamic_stiffness = _mm256_mul_ps(stiffness_vec, _mm256_rcp_ps(dynamic_stiffness_denom)); | |
__m256 corr_scalar_part = _mm256_mul_ps(dynamic_stiffness, stretch_factor_vec); | |
corr_scalar_part = _mm256_mul_ps(corr_scalar_part, rcp_w_sum); | |
__m256 corr_magnitude = _mm256_mul_ps(diff, corr_scalar_part); | |
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
// Per-lane validity mask: the last lane pairs particle 15 with the dummy element, so it must not
// apply a correction. Use an all-ones bit pattern (not -1.0f) so the bitwise AND below preserves
// the delta in valid lanes.
__m256 valid_mask = _mm256_castsi256_ps(_mm256_set_epi32(0, -1, -1, -1, -1, -1, -1, -1));
apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask); | |
__m256 delta_unit_x = _mm256_mul_ps(dx, inv_dist); | |
__m256 delta_unit_y = _mm256_mul_ps(dy, inv_dist); | |
__m256 delta_unit_z = _mm256_mul_ps(dz, inv_dist); | |
__m256 delta_x = _mm256_mul_ps(delta_unit_x, corr_magnitude); | |
__m256 delta_y = _mm256_mul_ps(delta_unit_y, corr_magnitude); | |
__m256 delta_z = _mm256_mul_ps(delta_unit_z, corr_magnitude); | |
delta_x = _mm256_and_ps(delta_x, apply_corr_mask); | |
delta_y = _mm256_and_ps(delta_y, apply_corr_mask); | |
delta_z = _mm256_and_ps(delta_z, apply_corr_mask); | |
_mm256_store_ps(&px_local[8], _mm256_fmadd_ps(delta_x, p0m, p0x)); | |
_mm256_store_ps(&py_local[8], _mm256_fmadd_ps(delta_y, p0m, p0y)); | |
_mm256_store_ps(&pz_local[8], _mm256_fmadd_ps(delta_z, p0m, p0z)); | |
_mm256_storeu_ps(&px_local[9], _mm256_fnmadd_ps(delta_x, p1m, p1x)); | |
_mm256_storeu_ps(&py_local[9], _mm256_fnmadd_ps(delta_y, p1m, p1y)); | |
_mm256_storeu_ps(&pz_local[9], _mm256_fnmadd_ps(delta_z, p1m, p1z)); | |
} | |
} | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&px_local[i]); | |
__m256 py = _mm256_load_ps(&py_local[i]); | |
__m256 pz = _mm256_load_ps(&pz_local[i]); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], px); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], py); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], pz); | |
} | |
} | |
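// Note on Step 3: each chain's 15 distance constraints are solved eight at a time against a staggered
// copy of the same particles (p0 = particles i..i+7, p1 = particles i+1..i+8). The dummy element at
// index PARTICLES_PER_CHAIN plus the per-lane valid mask in the second block keep the lane that would
// pair particle 15 with the dummy from applying any correction, and stiffness is attenuated by
// 1 / (1 + |strain|) to soften heavily stretched links.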
// Step 4: Sphere Collisions with Friction | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int s_base = c * SPHERES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 friction_vec = _mm256_set1_ps(cfg->friction); | |
for (int s = 0; s < SPHERES_PER_CHAIN; s++) { | |
__m256 sphere_x_vec = _mm256_set1_ps(cs->sphere_x[s_base + s]); | |
__m256 sphere_y_vec = _mm256_set1_ps(cs->sphere_y[s_base + s]); | |
__m256 sphere_z_vec = _mm256_set1_ps(cs->sphere_z[s_base + s]); | |
__m256 sphere_r_vec = _mm256_set1_ps(cs->sphere_radius[s_base + s]); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]); | |
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]); | |
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]); | |
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[p_base + i]); | |
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 dx = _mm256_sub_ps(p_curr_x, sphere_x_vec); | |
__m256 dy = _mm256_sub_ps(p_curr_y, sphere_y_vec); | |
__m256 dz = _mm256_sub_ps(p_curr_z, sphere_z_vec); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 penetration = _mm256_sub_ps(sphere_r_vec, dist); | |
__m256 collision_mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ); | |
__m256 normal_x = _mm256_mul_ps(dx, inv_dist); | |
__m256 normal_y = _mm256_mul_ps(dy, inv_dist); | |
__m256 normal_z = _mm256_mul_ps(dz, inv_dist); | |
__m256 normal_corr_x = _mm256_mul_ps(normal_x, penetration); | |
__m256 normal_corr_y = _mm256_mul_ps(normal_y, penetration); | |
__m256 normal_corr_z = _mm256_mul_ps(normal_z, penetration); | |
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
__m256 vel_dot_normal = _mm256_mul_ps(vel_x, normal_x); | |
vel_dot_normal = _mm256_fmadd_ps(vel_y, normal_y, vel_dot_normal); | |
vel_dot_normal = _mm256_fmadd_ps(vel_z, normal_z, vel_dot_normal); | |
__m256 tangent_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_normal, normal_x)); | |
__m256 tangent_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_normal, normal_y)); | |
__m256 tangent_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_normal, normal_z)); | |
__m256 tangent_mag_sq = _mm256_mul_ps(tangent_x, tangent_x); | |
tangent_mag_sq = _mm256_fmadd_ps(tangent_y, tangent_y, tangent_mag_sq); | |
tangent_mag_sq = _mm256_fmadd_ps(tangent_z, tangent_z, tangent_mag_sq); | |
__m256 inv_tangent_mag = _mm256_rsqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec)); | |
__m256 friction_mag = _mm256_mul_ps(penetration, friction_vec); | |
__m256 friction_x = _mm256_mul_ps(_mm256_mul_ps(tangent_x, inv_tangent_mag), friction_mag); | |
__m256 friction_y = _mm256_mul_ps(_mm256_mul_ps(tangent_y, inv_tangent_mag), friction_mag); | |
__m256 friction_z = _mm256_mul_ps(_mm256_mul_ps(tangent_z, inv_tangent_mag), friction_mag); | |
__m256 total_corr_x = _mm256_sub_ps(normal_corr_x, friction_x); | |
__m256 total_corr_y = _mm256_sub_ps(normal_corr_y, friction_y); | |
__m256 total_corr_z = _mm256_sub_ps(normal_corr_z, friction_z); | |
total_corr_x = _mm256_and_ps(total_corr_x, collision_mask); | |
total_corr_y = _mm256_and_ps(total_corr_y, collision_mask); | |
total_corr_z = _mm256_and_ps(total_corr_z, collision_mask); | |
total_corr_x = _mm256_mul_ps(total_corr_x, p_inv_mass); | |
total_corr_y = _mm256_mul_ps(total_corr_y, p_inv_mass); | |
total_corr_z = _mm256_mul_ps(total_corr_z, p_inv_mass); | |
__m256 new_p_x = _mm256_add_ps(p_curr_x, total_corr_x); | |
__m256 new_p_y = _mm256_add_ps(p_curr_y, total_corr_y); | |
__m256 new_p_z = _mm256_add_ps(p_curr_z, total_corr_z); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], new_p_x); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], new_p_y); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], new_p_z); | |
} | |
} | |
} | |
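// Note on Step 4: penetrating particles are pushed out of each sphere along the contact normal by the
// penetration depth, and a positional friction term of magnitude penetration * cfg->friction is
// subtracted along the tangential part of the frame-to-frame velocity; both terms are masked per lane
// and scaled by the clamped inverse mass.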
// Step 5: Motion Constraints | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]); | |
pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]); | |
__m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]); | |
__m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]); | |
__m256 radius_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]); | |
__m256 dx = _mm256_sub_ps(px, rx); | |
__m256 dy = _mm256_sub_ps(py, ry); | |
__m256 dz = _mm256_sub_ps(pz, rz); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 penetration = _mm256_sub_ps(dist, radius_vec); | |
__m256 mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ); | |
__m256 corr_factor = _mm256_mul_ps(penetration, inv_dist); | |
corr_factor = _mm256_and_ps(corr_factor, mask); | |
__m256 delta_x = _mm256_mul_ps(dx, corr_factor); | |
__m256 delta_y = _mm256_mul_ps(dy, corr_factor); | |
__m256 delta_z = _mm256_mul_ps(dz, corr_factor); | |
delta_x = _mm256_mul_ps(delta_x, pm); | |
delta_y = _mm256_mul_ps(delta_y, pm); | |
delta_z = _mm256_mul_ps(delta_z, pm); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_sub_ps(px, delta_x)); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_sub_ps(py, delta_y)); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z)); | |
} | |
} | |
} |
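// --- Second revision of the same file: chain_cfg below swaps the PBD stiffness/stretch_factor/max_strain
// knobs for AVBD-style parameters (avbd_alpha, avbd_beta, avbd_gamma, avbd_k_start, avbd_k_max), and
// chain_sim gains per-constraint lambda / current-stiffness / previous-error arrays for warm starting. ---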
#include <immintrin.h> | |
#include <math.h> | |
enum { | |
MAX_CHAINS = 64, | |
PARTICLES_PER_CHAIN = 16, | |
SPHERES_PER_CHAIN = 8, | |
MAX_PARTICLES = (MAX_CHAINS * PARTICLES_PER_CHAIN), | |
MAX_SPHERES = (MAX_CHAINS * SPHERES_PER_CHAIN), | |
CONSTRAINTS_PER_CHAIN = PARTICLES_PER_CHAIN, | |
MAX_REST_LENGTHS = (MAX_CHAINS * CONSTRAINTS_PER_CHAIN), | |
MAX_ITERATIONS = 4, | |
}; | |
struct chain_cfg { | |
float damping; | |
float friction; | |
float collision_radius; | |
float linear_inertia; // [0, 1], 1.0f = physically correct linear motion | |
float angular_inertia; // [0, 1], 1.0f = physically correct angular rotation | |
float centrifugal_inertia; // [0, 1], 1.0f = physically correct centrifugal force | |
float avbd_alpha; // Regularization parameter (e.g., 0.95f) | |
float avbd_beta; // Stiffness ramping control (e.g., 10.0f) | |
float avbd_gamma; // Warm starting scaling (e.g., 0.99f) | |
float avbd_k_start; // Initial stiffness for AVBD (e.g., 100.0f) | |
float avbd_k_max; // Maximum stiffness for AVBD (e.g., 1e6f) | |
} __attribute__((aligned(32))); | |
struct chain_sim { | |
unsigned short free_idx_cnt; | |
unsigned short free_idx[MAX_CHAINS]; | |
struct chain_cfg chain_configs[MAX_CHAINS] __attribute__((aligned(32))); | |
// Chain global forces (world space gravity and wind) | |
float gravity_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_z[MAX_CHAINS] __attribute__((aligned(32))); | |
// Chain transformations (world space) | |
float chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_pos_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_pos_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_pos_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float prev_chain_quat_w[MAX_CHAINS] __attribute__((aligned(32))); | |
// Particle positions (model space) | |
float p_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); | |
float p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); | |
float p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); | |
float inv_mass[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_p_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_p_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); | |
float prev_p_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); | |
float rest_lengths[MAX_REST_LENGTHS] __attribute__((aligned(32))); | |
float dist_lambda[MAX_REST_LENGTHS] __attribute__((aligned(32))); | |
float dist_k_current[MAX_REST_LENGTHS] __attribute__((aligned(32))); | |
float dist_prev_frame_error[MAX_REST_LENGTHS] __attribute__((aligned(32))); | |
// Sphere positions (model space) | |
float sphere_x[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_y[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_z[MAX_SPHERES] __attribute__((aligned(32))); | |
float sphere_radius[MAX_SPHERES] __attribute__((aligned(32))); | |
float coll_lambda[MAX_PARTICLES + 1] __attribute__((aligned(32))); | |
float coll_k_current[MAX_PARTICLES + 1] __attribute__((aligned(32))); | |
float coll_prev_frame_error[MAX_PARTICLES + 1] __attribute__((aligned(32))); | |
// Rest positions (model space) | |
float rest_pos_x[MAX_PARTICLES] __attribute__((aligned(32))); | |
float rest_pos_y[MAX_PARTICLES] __attribute__((aligned(32))); | |
float rest_pos_z[MAX_PARTICLES] __attribute__((aligned(32))); | |
float motion_radius[MAX_PARTICLES] __attribute__((aligned(32))); | |
}; | |
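// Illustrative scalar sketch (an assumption, not the solver in this revision): one standard
// augmented-Lagrangian way the AVBD fields above are often used for a distance constraint
// C = dist - rest_len. dist_lambda[] would hold the warm-started multiplier and dist_k_current[]
// the ramped stiffness; how avbd_alpha and dist_prev_frame_error[] enter the update is not shown
// in this sketch.
static inline float avbd_distance_force(float C, float *lambda, float *k,
                                        const struct chain_cfg *cfg) {
    float f = (*k) * C + (*lambda);                      // penalty force plus accumulated multiplier
    *lambda += (*k) * C;                                 // dual ascent on the multiplier
    *k = fminf((*k) * cfg->avbd_beta, cfg->avbd_k_max);  // geometric stiffness ramp, capped at k_max
    return f;
}
// Per frame one would typically reset *k to cfg->avbd_k_start and decay the stored multipliers,
// e.g. cs->dist_lambda[j] *= cfg->avbd_gamma, before running the constraint iterations.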
void simulate_chains(struct chain_sim *cs, float dt) { | |
// Common SIMD constants | |
const __m256 dt_vec = _mm256_set1_ps(dt); | |
const __m256 dt2_vec = _mm256_set1_ps(dt * dt); | |
const __m256 one_vec = _mm256_set1_ps(1.0f); | |
const __m256 neg_one_vec = _mm256_set1_ps(-1.0f); | |
const __m256 eps_vec = _mm256_set1_ps(1e-6f); | |
const __m256 inv_mass_clamp_min = _mm256_set1_ps(0.0f); | |
const __m256 inv_mass_clamp_max = _mm256_set1_ps(1e6f); | |
const __m256 zero_vec = _mm256_setzero_ps(); | |
const __m256 inv_dt_vec = _mm256_set1_ps(1.0f / dt); | |
const __m256 two_vec = _mm256_set1_ps(2.0f); | |
const __m256 half_vec = _mm256_set1_ps(0.5f); | |
const __m256 pi_over_2_vec = _mm256_set1_ps(1.5707963267948966f); | |
const __m256 acos_c1_vec = _mm256_set1_ps(-0.166666666666f); | |
const __m256 acos_c2_vec = _mm256_set1_ps(0.075000000000f); | |
const __m256 acos_c3_vec = _mm256_set1_ps(-0.044642857143f); | |
const __m256 sin_c1_vec = _mm256_set1_ps(-0.166666666666f); | |
const __m256 sin_c2_vec = _mm256_set1_ps(0.008333333333f); | |
const __m256 inertia_clamp_min = _mm256_set1_ps(0.0f); | |
const __m256 inertia_clamp_max = _mm256_set1_ps(1.0f); | |
const __m256 abs_mask_ps = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF)); | |
// Local arrays for PBD solver | |
float px_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float py_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float pz_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
float pm_local[PARTICLES_PER_CHAIN + 1] __attribute__((aligned(32))); | |
// Initialize dummy elements | |
px_local[PARTICLES_PER_CHAIN] = 0.0f; | |
py_local[PARTICLES_PER_CHAIN] = 0.0f; | |
pz_local[PARTICLES_PER_CHAIN] = 0.0f; | |
pm_local[PARTICLES_PER_CHAIN] = 0.0f; | |
// Stack arrays for precomputed data | |
float wind_local_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_local_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float wind_local_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_local_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_local_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float gravity_local_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float vel_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float vel_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float vel_z[MAX_CHAINS] __attribute__((aligned(32))); | |
float ang_vel_x[MAX_CHAINS] __attribute__((aligned(32))); | |
float ang_vel_y[MAX_CHAINS] __attribute__((aligned(32))); | |
float ang_vel_z[MAX_CHAINS] __attribute__((aligned(32))); | |
// Step 0: Precomputation (SIMD, 8 chains at once) | |
for (int c = 0; c < MAX_CHAINS; c += 8) { | |
// Load chain quaternions | |
__m256 qx = _mm256_load_ps(&cs->chain_quat_x[c]); | |
__m256 qy = _mm256_load_ps(&cs->chain_quat_y[c]); | |
__m256 qz = _mm256_load_ps(&cs->chain_quat_z[c]); | |
__m256 qw = _mm256_load_ps(&cs->chain_quat_w[c]); | |
// Compute local-space wind | |
{ | |
__m256 vx = _mm256_load_ps(&cs->wind_x[c]); | |
__m256 vy = _mm256_load_ps(&cs->wind_y[c]); | |
__m256 vz = _mm256_load_ps(&cs->wind_z[c]); | |
// Create q_conjugate components (qw remains, qx, qy, qz are negated) | |
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx | |
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy | |
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz | |
// Step 1: t = q_conj * v_world_as_quat | |
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) | |
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz | |
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx | |
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy | |
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx | |
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy | |
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// Step 2: result_vec = (t * q)_vec | |
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz) | |
// result_x = tw*qx + tx*qw + ty*qz - tz*qy | |
// result_y = tw*qy + ty*qw + tz*qx - tx*qz | |
// result_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); | |
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x)); | |
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); | |
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y)); | |
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); | |
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z)); | |
_mm256_store_ps(&wind_local_x[c], result_x); | |
_mm256_store_ps(&wind_local_y[c], result_y); | |
_mm256_store_ps(&wind_local_z[c], result_z); | |
} | |
// Compute local-space gravity | |
{ | |
__m256 vx = _mm256_load_ps(&cs->gravity_x[c]); | |
__m256 vy = _mm256_load_ps(&cs->gravity_y[c]); | |
__m256 vz = _mm256_load_ps(&cs->gravity_z[c]); | |
// Create q_conjugate components (qw remains, qx, qy, qz are negated) | |
__m256 cqx = _mm256_sub_ps(zero_vec, qx); // -qx | |
__m256 cqy = _mm256_sub_ps(zero_vec, qy); // -qy | |
__m256 cqz = _mm256_sub_ps(zero_vec, qz); // -qz | |
// Step 1: t = q_conj * v_world_as_quat | |
// t = (qw, -qx, -qy, -qz) * (0, vx, vy, vz) | |
// tw = -(-qx)*vx - (-qy)*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// tx = qw*vx + (-qy)*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
// ty = qw*vy + (-qz)*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
// tz = qw*vz + (-qx)*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tx = _mm256_fmadd_ps(cqy, vz, _mm256_mul_ps(qw, vx)); // qw*vx + (-qy)*vz | |
tx = _mm256_fnmadd_ps(cqz, vy, tx);// qw*vx - qy*vz - (-qz)*vy = qw*vx - qy*vz + qz*vy | |
__m256 ty = _mm256_fmadd_ps(cqz, vx, _mm256_mul_ps(qw, vy)); // qw*vy + (-qz)*vx | |
ty = _mm256_fnmadd_ps(cqx, vz, ty); // qw*vy - qz*vx - (-qx)*vz = qw*vy - qz*vx + qx*vz | |
__m256 tz = _mm256_fmadd_ps(cqx, vy, _mm256_mul_ps(qw, vz)); // qw*vz + (-qx)*vy | |
tz = _mm256_fnmadd_ps(cqy, vx, tz); // qw*vz - qx*vy - (-qy)*vx = qw*vz - qx*vy + qy*vx | |
__m256 tw = _mm256_fnmadd_ps(cqx, vx, zero_vec); // 0 - (-qx)*vx = qx*vx | |
tw = _mm256_fnmadd_ps(cqy, vy, tw); // qx*vx - (-qy)*vy = qx*vx + qy*vy | |
tw = _mm256_fnmadd_ps(cqz, vz, tw); // qx*vx + qy*vy - (-qz)*vz = qx*vx + qy*vy + qz*vz | |
// Step 2: result_vec = (t * q)_vec | |
// result = (tw, tx, ty, tz) * (qw, qx, qy, qz) | |
// result_x = tw*qx + tx*qw + ty*qz - tz*qy | |
// result_y = tw*qy + ty*qw + tz*qx - tx*qz | |
// result_z = tw*qz + tz*qw + tx*qy - ty*qx
__m256 result_x = _mm256_fmadd_ps(ty, qz, _mm256_mul_ps(tw, qx)); | |
result_x = _mm256_fnmadd_ps(tz, qy, _mm256_fmadd_ps(tx, qw, result_x)); | |
__m256 result_y = _mm256_fmadd_ps(tz, qx, _mm256_mul_ps(tw, qy)); | |
result_y = _mm256_fnmadd_ps(tx, qz, _mm256_fmadd_ps(ty, qw, result_y)); | |
__m256 result_z = _mm256_fmadd_ps(tx, qy, _mm256_mul_ps(tw, qz)); | |
result_z = _mm256_fnmadd_ps(ty, qx, _mm256_fmadd_ps(tz, qw, result_z)); | |
_mm256_store_ps(&gravity_local_x[c], result_x); | |
_mm256_store_ps(&gravity_local_y[c], result_y); | |
_mm256_store_ps(&gravity_local_z[c], result_z); | |
} | |
// Compute linear velocity | |
{ | |
__m256 curr_x = _mm256_load_ps(&cs->chain_pos_x[c]); | |
__m256 curr_y = _mm256_load_ps(&cs->chain_pos_y[c]); | |
__m256 curr_z = _mm256_load_ps(&cs->chain_pos_z[c]); | |
__m256 prev_x = _mm256_load_ps(&cs->prev_chain_pos_x[c]); | |
__m256 prev_y = _mm256_load_ps(&cs->prev_chain_pos_y[c]); | |
__m256 prev_z = _mm256_load_ps(&cs->prev_chain_pos_z[c]); | |
__m256 vel_x_vec = _mm256_mul_ps(_mm256_sub_ps(curr_x, prev_x), inv_dt_vec); | |
__m256 vel_y_vec = _mm256_mul_ps(_mm256_sub_ps(curr_y, prev_y), inv_dt_vec); | |
__m256 vel_z_vec = _mm256_mul_ps(_mm256_sub_ps(curr_z, prev_z), inv_dt_vec); | |
_mm256_store_ps(&vel_x[c], vel_x_vec); | |
_mm256_store_ps(&vel_y[c], vel_y_vec); | |
_mm256_store_ps(&vel_z[c], vel_z_vec); | |
} | |
// Compute angular velocity (SIMD with polynomial approximations) | |
{ | |
__m256 prev_qx = _mm256_load_ps(&cs->prev_chain_quat_x[c]); | |
__m256 prev_qy = _mm256_load_ps(&cs->prev_chain_quat_y[c]); | |
__m256 prev_qz = _mm256_load_ps(&cs->prev_chain_quat_z[c]); | |
__m256 prev_qw = _mm256_load_ps(&cs->prev_chain_quat_w[c]); | |
// Quaternion inverse | |
__m256 norm_sq = _mm256_fmadd_ps(prev_qx, prev_qx, _mm256_mul_ps(prev_qy, prev_qy)); | |
norm_sq = _mm256_fmadd_ps(prev_qz, prev_qz, norm_sq); | |
norm_sq = _mm256_fmadd_ps(prev_qw, prev_qw, norm_sq); | |
__m256 inv_norm_sq = _mm256_rcp_ps(_mm256_max_ps(norm_sq, eps_vec)); | |
__m256 inv_qx = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qx), inv_norm_sq); | |
__m256 inv_qy = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qy), inv_norm_sq); | |
__m256 inv_qz = _mm256_mul_ps(_mm256_sub_ps(zero_vec, prev_qz), inv_norm_sq); | |
__m256 inv_qw = _mm256_mul_ps(prev_qw, inv_norm_sq); | |
// Delta quaternion: q_current * q_prev^-1 | |
__m256 delta_qx = _mm256_fmadd_ps(qy, inv_qz, _mm256_mul_ps(qw, inv_qx)); | |
delta_qx = _mm256_fnmadd_ps(qz, inv_qy, _mm256_fmadd_ps(qx, inv_qw, delta_qx)); | |
__m256 delta_qy = _mm256_fmadd_ps(qy, inv_qw, _mm256_mul_ps(qw, inv_qy)); | |
delta_qy = _mm256_fmadd_ps(qz, inv_qx, _mm256_fnmadd_ps(qx, inv_qz, delta_qy)); | |
__m256 delta_qz = _mm256_fmadd_ps(qx, inv_qy, _mm256_mul_ps(qw, inv_qz)); | |
delta_qz = _mm256_fnmadd_ps(qy, inv_qx, _mm256_fmadd_ps(qz, inv_qw, delta_qz)); | |
__m256 delta_qw = _mm256_fnmadd_ps(qx, inv_qx, _mm256_mul_ps(qw, inv_qw)); | |
delta_qw = _mm256_fnmadd_ps(qy, inv_qy, delta_qw); | |
delta_qw = _mm256_fnmadd_ps(qz, inv_qz, delta_qw); | |
// Angular velocity with polynomial approximations | |
__m256 qw_clamped = _mm256_max_ps(_mm256_min_ps(delta_qw, one_vec), neg_one_vec); | |
// Acos approximation: acos(x) ≈ π/2 - x * (1 + x^2 * (c1 + x^2 * (c2 + x^2 * c3))) | |
__m256 x = qw_clamped; | |
__m256 x2 = _mm256_mul_ps(x, x); | |
__m256 x4 = _mm256_mul_ps(x2, x2); | |
__m256 poly = _mm256_fmadd_ps(x4, acos_c3_vec, acos_c2_vec); | |
poly = _mm256_fmadd_ps(x2, poly, acos_c1_vec); | |
poly = _mm256_fmadd_ps(x2, poly, one_vec); | |
poly = _mm256_mul_ps(x, poly); | |
__m256 angle = _mm256_mul_ps(two_vec, _mm256_sub_ps(pi_over_2_vec, poly)); | |
// Sin approximation for sin(angle/2): sin(x) ≈ x * (1 + x^2 * (c1 + x^2 * c2)) | |
__m256 half_angle = _mm256_mul_ps(angle, half_vec); | |
__m256 ha2 = _mm256_mul_ps(half_angle, half_angle); | |
__m256 sin_poly = _mm256_fmadd_ps(ha2, sin_c2_vec, sin_c1_vec); | |
sin_poly = _mm256_fmadd_ps(ha2, sin_poly, one_vec); | |
__m256 sin_half_angle = _mm256_mul_ps(half_angle, sin_poly); | |
__m256 inv_sin_half_angle = _mm256_rcp_ps(_mm256_max_ps(sin_half_angle, eps_vec)); | |
// Sign: Use qw > 0 instead of angle < π | |
__m256 sign = _mm256_cmp_ps(qw_clamped, zero_vec, _CMP_GT_OQ); | |
sign = _mm256_or_ps(_mm256_and_ps(sign, one_vec), _mm256_andnot_ps(sign, neg_one_vec)); | |
// Angular velocity: sign * delta_q_xyz * inv_sin_half_angle / dt
__m256 scale = _mm256_mul_ps(sign, _mm256_mul_ps(inv_sin_half_angle, inv_dt_vec)); | |
__m256 ang_vel_x_vec = _mm256_mul_ps(delta_qx, scale); | |
__m256 ang_vel_y_vec = _mm256_mul_ps(delta_qy, scale); | |
__m256 ang_vel_z_vec = _mm256_mul_ps(delta_qz, scale); | |
_mm256_store_ps(&ang_vel_x[c], ang_vel_x_vec); | |
_mm256_store_ps(&ang_vel_y[c], ang_vel_y_vec); | |
_mm256_store_ps(&ang_vel_z[c], ang_vel_z_vec); | |
} | |
} | |
// Step 1: Apply Chain Inertia (Linear, Angular, and Centrifugal) | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int base_idx = c * PARTICLES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
// Load precomputed velocities and inertia parameters | |
__m256 vel_x_vec = _mm256_set1_ps(vel_x[c]); | |
__m256 vel_y_vec = _mm256_set1_ps(vel_y[c]); | |
__m256 vel_z_vec = _mm256_set1_ps(vel_z[c]); | |
__m256 ang_vel_x_vec = _mm256_set1_ps(ang_vel_x[c]); | |
__m256 ang_vel_y_vec = _mm256_set1_ps(ang_vel_y[c]); | |
__m256 ang_vel_z_vec = _mm256_set1_ps(ang_vel_z[c]); | |
__m256 linear_inertia_vec = _mm256_set1_ps(cfg->linear_inertia); | |
__m256 angular_inertia_vec = _mm256_set1_ps(cfg->angular_inertia); | |
__m256 centrifugal_inertia_vec = _mm256_set1_ps(cfg->centrifugal_inertia); | |
// Clamp inertia parameters to [0, 1] | |
linear_inertia_vec = _mm256_max_ps(_mm256_min_ps(linear_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
angular_inertia_vec = _mm256_max_ps(_mm256_min_ps(angular_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
centrifugal_inertia_vec = _mm256_max_ps(_mm256_min_ps(centrifugal_inertia_vec, inertia_clamp_max), inertia_clamp_min); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&cs->p_pos_x[base_idx + i]); | |
__m256 py = _mm256_load_ps(&cs->p_pos_y[base_idx + i]); | |
__m256 pz = _mm256_load_ps(&cs->p_pos_z[base_idx + i]); | |
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]); | |
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); | |
// Linear inertia: v * dt * linear_inertia | |
__m256 lin_delta_x = _mm256_mul_ps(_mm256_mul_ps(vel_x_vec, dt_vec), linear_inertia_vec); | |
__m256 lin_delta_y = _mm256_mul_ps(_mm256_mul_ps(vel_y_vec, dt_vec), linear_inertia_vec); | |
__m256 lin_delta_z = _mm256_mul_ps(_mm256_mul_ps(vel_z_vec, dt_vec), linear_inertia_vec); | |
// Angular inertia: (ω × r) * dt * angular_inertia | |
__m256 ang_delta_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); | |
__m256 ang_delta_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); | |
__m256 ang_delta_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); | |
ang_delta_x = _mm256_mul_ps(_mm256_mul_ps(ang_delta_x, dt_vec), angular_inertia_vec); | |
ang_delta_y = _mm256_mul_ps(_mm256_mul_ps(ang_delta_y, dt_vec), angular_inertia_vec); | |
ang_delta_z = _mm256_mul_ps(_mm256_mul_ps(ang_delta_z, dt_vec), angular_inertia_vec); | |
// Centrifugal inertia: (ω × (ω × r)) * dt^2 * centrifugal_inertia | |
__m256 cross1_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, pz), _mm256_mul_ps(ang_vel_z_vec, py)); | |
__m256 cross1_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, px), _mm256_mul_ps(ang_vel_x_vec, pz)); | |
__m256 cross1_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, py), _mm256_mul_ps(ang_vel_y_vec, px)); | |
__m256 cross2_x = _mm256_sub_ps(_mm256_mul_ps(ang_vel_y_vec, cross1_z), _mm256_mul_ps(ang_vel_z_vec, cross1_y)); | |
__m256 cross2_y = _mm256_sub_ps(_mm256_mul_ps(ang_vel_z_vec, cross1_x), _mm256_mul_ps(ang_vel_x_vec, cross1_z)); | |
__m256 cross2_z = _mm256_sub_ps(_mm256_mul_ps(ang_vel_x_vec, cross1_y), _mm256_mul_ps(ang_vel_y_vec, cross1_x)); | |
// Calculate displacement: (ω × (ω × r)) * dt^2 | |
__m256 base_cent_delta_x = _mm256_mul_ps(cross2_x, dt2_vec); | |
__m256 base_cent_delta_y = _mm256_mul_ps(cross2_y, dt2_vec); | |
__m256 base_cent_delta_z = _mm256_mul_ps(cross2_z, dt2_vec); | |
// Apply the centrifugal_inertia factor | |
__m256 cent_delta_x = _mm256_mul_ps(base_cent_delta_x, centrifugal_inertia_vec); | |
__m256 cent_delta_y = _mm256_mul_ps(base_cent_delta_y, centrifugal_inertia_vec); | |
__m256 cent_delta_z = _mm256_mul_ps(base_cent_delta_z, centrifugal_inertia_vec); | |
// Total delta | |
__m256 delta_x = _mm256_add_ps(_mm256_add_ps(lin_delta_x, ang_delta_x), cent_delta_x); | |
__m256 delta_y = _mm256_add_ps(_mm256_add_ps(lin_delta_y, ang_delta_y), cent_delta_y); | |
__m256 delta_z = _mm256_add_ps(_mm256_add_ps(lin_delta_z, ang_delta_z), cent_delta_z); | |
// Update positions | |
_mm256_store_ps(&cs->p_pos_x[base_idx + i], _mm256_add_ps(px, delta_x)); | |
_mm256_store_ps(&cs->p_pos_y[base_idx + i], _mm256_add_ps(py, delta_y)); | |
_mm256_store_ps(&cs->p_pos_z[base_idx + i], _mm256_add_ps(pz, delta_z)); | |
} | |
// Update previous transformation | |
cs->prev_chain_pos_x[c] = cs->chain_pos_x[c]; | |
cs->prev_chain_pos_y[c] = cs->chain_pos_y[c]; | |
cs->prev_chain_pos_z[c] = cs->chain_pos_z[c]; | |
cs->prev_chain_quat_x[c] = cs->chain_quat_x[c]; | |
cs->prev_chain_quat_y[c] = cs->chain_quat_y[c]; | |
cs->prev_chain_quat_z[c] = cs->chain_quat_z[c]; | |
cs->prev_chain_quat_w[c] = cs->chain_quat_w[c]; | |
} | |
// Step 2: Verlet Integration | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int base_idx = c * PARTICLES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 gravity_x_vec = _mm256_set1_ps(gravity_local_x[c]); | |
__m256 gravity_y_vec = _mm256_set1_ps(gravity_local_y[c]); | |
__m256 gravity_z_vec = _mm256_set1_ps(gravity_local_z[c]); | |
__m256 wind_x_vec = _mm256_set1_ps(wind_local_x[c]); | |
__m256 wind_y_vec = _mm256_set1_ps(wind_local_y[c]); | |
__m256 wind_z_vec = _mm256_set1_ps(wind_local_z[c]); | |
__m256 damping_vec = _mm256_set1_ps(cfg->damping); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[base_idx + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[base_idx + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[base_idx + i]); | |
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[base_idx + i]); | |
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[base_idx + i]); | |
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[base_idx + i]); | |
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[base_idx + i]); | |
__m256 force_x = _mm256_add_ps(gravity_x_vec, wind_x_vec); | |
__m256 force_y = _mm256_add_ps(gravity_y_vec, wind_y_vec); | |
__m256 force_z = _mm256_add_ps(gravity_z_vec, wind_z_vec); | |
__m256 acc_x = _mm256_mul_ps(force_x, p_inv_mass); | |
__m256 acc_y = _mm256_mul_ps(force_y, p_inv_mass); | |
__m256 acc_z = _mm256_mul_ps(force_z, p_inv_mass); | |
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
__m256 damped_vel_x = _mm256_mul_ps(vel_x, damping_vec); | |
__m256 damped_vel_y = _mm256_mul_ps(vel_y, damping_vec); | |
__m256 damped_vel_z = _mm256_mul_ps(vel_z, damping_vec); | |
__m256 term_accel_x = _mm256_mul_ps(acc_x, dt2_vec); | |
__m256 term_accel_y = _mm256_mul_ps(acc_y, dt2_vec); | |
__m256 term_accel_z = _mm256_mul_ps(acc_z, dt2_vec); | |
__m256 new_p_x = _mm256_add_ps(p_curr_x, _mm256_add_ps(damped_vel_x, term_accel_x)); | |
__m256 new_p_y = _mm256_add_ps(p_curr_y, _mm256_add_ps(damped_vel_y, term_accel_y)); | |
__m256 new_p_z = _mm256_add_ps(p_curr_z, _mm256_add_ps(damped_vel_z, term_accel_z)); | |
_mm256_store_ps(&cs->p_pos_x[base_idx + i], new_p_x); | |
_mm256_store_ps(&cs->p_pos_y[base_idx + i], new_p_y); | |
_mm256_store_ps(&cs->p_pos_z[base_idx + i], new_p_z); | |
_mm256_store_ps(&cs->prev_p_pos_x[base_idx + i], p_curr_x); | |
_mm256_store_ps(&cs->prev_p_pos_y[base_idx + i], p_curr_y); | |
_mm256_store_ps(&cs->prev_p_pos_z[base_idx + i], p_curr_z); | |
} | |
} | |
// --- AVBD Warm-starting of lambda and k_current for all distance constraints. | |
for (int c = 0; c < MAX_CHAINS; ++c) { | |
struct chain_cfg* cfg = &cs->chain_configs[c]; | |
__m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha); | |
__m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma); | |
__m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start); | |
// Loop through each block of 8 constraints for warm-starting | |
int r_base = c * CONSTRAINTS_PER_CHAIN; // Base index for this chain's constraints | |
for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += 8) { | |
// Load current lambda, k_current for 8 constraints simultaneously | |
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + i]); | |
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + i]); | |
// Apply warm-starting (Eq. 19 from paper) | |
// lambda_new = lambda_prev_frame * alpha * gamma | |
current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec)); | |
// k_current_new = max(K_START, k_current_prev_frame * gamma) | |
current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec)); | |
// Store updated warm-started values back | |
_mm256_store_ps(&cs->dist_lambda[r_base + i], current_lambda_vec); | |
_mm256_store_ps(&cs->dist_k_current[r_base + i], current_k_vec); | |
} | |
} | |
// Warm-starting for Collision Constraints --- | |
for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) { | |
struct chain_cfg* cfg = &cs->chain_configs[c_idx]; | |
__m256 alpha_scalar_vec = _mm256_set1_ps(cfg->avbd_alpha); | |
__m256 gamma_scalar_vec = _mm256_set1_ps(cfg->avbd_gamma); | |
__m256 k_start_scalar_vec = _mm256_set1_ps(cfg->avbd_k_start); | |
int p_base = c_idx * PARTICLES_PER_CHAIN; // Base index for this chain's particles | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) { | |
__m256 current_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]); | |
__m256 current_k_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]); | |
current_lambda_vec = _mm256_mul_ps(current_lambda_vec, _mm256_mul_ps(alpha_scalar_vec, gamma_scalar_vec)); | |
current_k_vec = _mm256_max_ps(k_start_scalar_vec, _mm256_mul_ps(current_k_vec, gamma_scalar_vec)); | |
_mm256_store_ps(&cs->coll_lambda[p_base + i], current_lambda_vec); | |
_mm256_store_ps(&cs->coll_k_current[p_base + i], current_k_vec); | |
} | |
} | |
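// Step 3: AVBD Distance Constraints (primal position corrections plus dual lambda/stiffness updates per iteration) | |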
for (int c = 0; c < MAX_CHAINS; c++) { | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 alpha_vec = _mm256_set1_ps(cfg->avbd_alpha); | |
__m256 beta_vec = _mm256_set1_ps(cfg->avbd_beta); | |
__m256 k_max_vec = _mm256_set1_ps(cfg->avbd_k_max); | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int r_base = c * CONSTRAINTS_PER_CHAIN; | |
// Load all current particle positions into local buffer (once per chain per iteration) | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
_mm256_store_ps(&px_local[i], _mm256_load_ps(&cs->p_pos_x[p_base + i])); | |
_mm256_store_ps(&py_local[i], _mm256_load_ps(&cs->p_pos_y[p_base + i])); | |
_mm256_store_ps(&pz_local[i], _mm256_load_ps(&cs->p_pos_z[p_base + i])); | |
_mm256_store_ps(&pm_local[i], _mm256_load_ps(&cs->inv_mass[p_base + i])); | |
_mm256_store_ps(&pm_local[i], _mm256_max_ps(_mm256_min_ps(_mm256_load_ps(&pm_local[i]), inv_mass_clamp_max), inv_mass_clamp_min)); | |
} | |
for (int iter = 0; iter < MAX_ITERATIONS; iter++) { | |
// First block (i=0 to 7 of particles, processing constraints 0 to 7) | |
{ | |
// p0 is aligned from px_local[0] | |
__m256 p0x = _mm256_load_ps(&px_local[0]); | |
__m256 p0y = _mm256_load_ps(&py_local[0]); | |
__m256 p0z = _mm256_load_ps(&pz_local[0]); | |
__m256 p0m = _mm256_load_ps(&pm_local[0]); | |
// p1 is unaligned from px_local[1] | |
__m256 p1x = _mm256_loadu_ps(&px_local[1]); | |
__m256 p1y = _mm256_loadu_ps(&py_local[1]); | |
__m256 p1z = _mm256_loadu_ps(&pz_local[1]); | |
__m256 p1m = _mm256_loadu_ps(&pm_local[1]); | |
// This corresponds to rest_lengths[r_base + 0] through rest_lengths[r_base + 7] | |
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base]); | |
// Calculate inverse sum of masses (rcp_w_sum) | |
__m256 w_sum = _mm256_add_ps(p0m, p1m); | |
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); | |
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped); | |
// Calculate current vector and distance between particles | |
__m256 dx_vec = _mm256_sub_ps(p1x, p0x); | |
__m256 dy_vec = _mm256_sub_ps(p1y, p0y); | |
__m256 dz_vec = _mm256_sub_ps(p1z, p0z); | |
__m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec); | |
dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); // 1/dist | |
__m256 dist = _mm256_rcp_ps(inv_dist); // Current distance | |
// Primal Update for Distance Constraints (Block 1) --- | |
// Load AVBD state for this block of 8 constraints (indices r_base + 0 to r_base + 7) | |
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 0]); | |
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 0]); | |
__m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 0]); | |
// Original constraint error C_j*(x) = current_dist - target_length | |
__m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec); | |
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18) | |
__m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec)); | |
// Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda) | |
// The effective displacement is proportional to this force divided by effective stiffness (k) and mass. | |
__m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec); | |
// Calculate the scalar part of the correction, similar to original 'corr_scalar_part' | |
// This is simplified, representing the desired displacement due to the augmented force | |
__m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor | |
// Normalize difference vector | |
__m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist); | |
__m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist); | |
__m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist); | |
// Calculate the actual displacement vectors | |
// Note the sign inversion from original: -(delta_x * pm0) for p0 and +(delta_x * pm1) for p1 | |
// The 'effective_corr_scalar_part' already implicitly carries the sign of the desired correction. | |
// So, we just multiply by the unit direction and inverse mass. | |
__m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part); | |
__m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part); | |
__m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part); | |
// Apply correction mask (from original code, avoid division by zero artifacts where dist is ~0) | |
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
delta_x = _mm256_and_ps(delta_x, apply_corr_mask); | |
delta_y = _mm256_and_ps(delta_y, apply_corr_mask); | |
delta_z = _mm256_and_ps(delta_z, apply_corr_mask); | |
// Update positions in local buffer | |
// p0 moves along -delta (subtract), p1 moves along +delta (add) | |
_mm256_store_ps(&px_local[0], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m))); | |
_mm256_store_ps(&py_local[0], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m))); | |
_mm256_store_ps(&pz_local[0], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m))); | |
_mm256_storeu_ps(&px_local[1], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m))); | |
_mm256_storeu_ps(&py_local[1], _mm256_add_ps(p1y, _mm256_mul_ps(delta_y, p1m))); | |
_mm256_storeu_ps(&pz_local[1], _mm256_add_ps(p1z, _mm256_mul_ps(delta_z, p1m))); | |
// Dual Update for Distance Constraints (Block 1) --- | |
// Recalculate constraint error *after* position updates for accurate lambda update | |
// Reload updated positions from local buffer to get the 'post-primal' state | |
p0x = _mm256_load_ps(&px_local[0]); | |
p0y = _mm256_load_ps(&py_local[0]); | |
p0z = _mm256_load_ps(&pz_local[0]); | |
p1x = _mm256_loadu_ps(&px_local[1]); | |
p1y = _mm256_loadu_ps(&py_local[1]); | |
p1z = _mm256_loadu_ps(&pz_local[1]); | |
__m256 updated_dx = _mm256_sub_ps(p1x, p0x); | |
__m256 updated_dy = _mm256_sub_ps(p1y, p0y); | |
__m256 updated_dz = _mm256_sub_ps(p1z, p0z); | |
__m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx); | |
updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq); | |
updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq); | |
__m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec)); | |
__m256 updated_dist = _mm256_rcp_ps(updated_inv_dist); | |
__m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update | |
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated)) | |
__m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec); | |
_mm256_store_ps(&cs->dist_lambda[r_base + 0], new_lambda_vec); // Store back | |
// Update k_current (Stiffness ramping, Eq. 12) | |
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|) | |
__m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask | |
__m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec); | |
__m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec); | |
_mm256_store_ps(&cs->dist_k_current[r_base + 0], new_k_vec); // Store back | |
} | |
// Second block (particles 8..15; constraints 8..14 are real, lane 15 pairs with the dummy particle and is masked out below) | |
{ | |
// p0 is aligned from px_local[8] | |
__m256 p0x = _mm256_load_ps(&px_local[8]); | |
__m256 p0y = _mm256_load_ps(&py_local[8]); | |
__m256 p0z = _mm256_load_ps(&pz_local[8]); | |
__m256 p0m = _mm256_load_ps(&pm_local[8]); | |
// p1 is loaded unaligned from px_local[9]; the load spans local indices 9..16, | |
// where index 16 is the dummy slot one past the last real particle of the chain. | |
__m256 p1x = _mm256_loadu_ps(&px_local[9]); | |
__m256 p1y = _mm256_loadu_ps(&py_local[9]); | |
__m256 p1z = _mm256_loadu_ps(&pz_local[9]); | |
__m256 p1m = _mm256_loadu_ps(&pm_local[9]); | |
// This corresponds to rest_lengths[r_base + 8] through rest_lengths[r_base + 15] | |
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + 8]); | |
__m256 dx_vec = _mm256_sub_ps(p1x, p0x); | |
__m256 dy_vec = _mm256_sub_ps(p1y, p0y); | |
__m256 dz_vec = _mm256_sub_ps(p1z, p0z); | |
__m256 dist_sq = _mm256_mul_ps(dx_vec, dx_vec); | |
dist_sq = _mm256_fmadd_ps(dy_vec, dy_vec, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz_vec, dz_vec, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 w_sum = _mm256_add_ps(p0m, p1m); | |
__m256 w_sum_clamped = _mm256_max_ps(w_sum, eps_vec); | |
__m256 rcp_w_sum = _mm256_rcp_ps(w_sum_clamped); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
// Primal Update for Distance Constraints (Block 2) --- | |
// Load AVBD state for this block of 8 constraints (indices r_base + 8 to r_base + 15) | |
__m256 current_lambda_vec = _mm256_load_ps(&cs->dist_lambda[r_base + 8]); | |
__m256 current_k_vec = _mm256_load_ps(&cs->dist_k_current[r_base + 8]); | |
__m256 prev_frame_error_vec = _mm256_load_ps(&cs->dist_prev_frame_error[r_base + 8]); | |
// Original constraint error C_j*(x) = current_dist - target_length | |
__m256 C_x_original = _mm256_sub_ps(dist, rest_len_vec); | |
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) (Eq. 18) | |
__m256 C_x_regularized = _mm256_sub_ps(C_x_original, _mm256_mul_ps(alpha_vec, prev_frame_error_vec)); | |
// Calculate the total force term based on Augmented Lagrangian: -(k * C_reg + lambda) | |
__m256 force_term = _mm256_fmadd_ps(current_k_vec, C_x_regularized, current_lambda_vec); | |
// Calculate the scalar part of the correction, similar to original 'corr_scalar_part' | |
__m256 effective_corr_scalar_part = _mm256_div_ps(force_term, current_k_vec); // Simplified displacement factor | |
__m256 apply_corr_mask = _mm256_cmp_ps(dist_sq, eps_vec, _CMP_GT_OQ); | |
// This 'valid_mask' handles the fact that the last particle doesn't have a *next* particle | |
// for a distance constraint. It effectively sets the correction for the last element (index 7 of the block) to zero. | |
// For PARTICLES_PER_CHAIN = 16, this second block covers constraints for particles 8 to 15. | |
// The constraint for particle 15 (index 7 in this block) is between particle 15 and 16 (the dummy). | |
// The mask effectively disables the constraint correction for the dummy particle. | |
// Lane mask: all bits set for the 7 real constraints, zero bits for the dummy lane. | |
// (A float -1.0f is not an all-ones bit pattern, so build the mask from integer -1 and reinterpret.) | |
__m256 valid_mask = _mm256_castsi256_ps(_mm256_setr_epi32(-1, -1, -1, -1, -1, -1, -1, 0)); | |
apply_corr_mask = _mm256_and_ps(apply_corr_mask, valid_mask); | |
__m256 delta_unit_x = _mm256_mul_ps(dx_vec, inv_dist); | |
__m256 delta_unit_y = _mm256_mul_ps(dy_vec, inv_dist); | |
__m256 delta_unit_z = _mm256_mul_ps(dz_vec, inv_dist); | |
__m256 delta_x = _mm256_mul_ps(delta_unit_x, effective_corr_scalar_part); | |
__m256 delta_y = _mm256_mul_ps(delta_unit_y, effective_corr_scalar_part); | |
__m256 delta_z = _mm256_mul_ps(delta_unit_z, effective_corr_scalar_part); | |
delta_x = _mm256_and_ps(delta_x, apply_corr_mask); | |
delta_y = _mm256_and_ps(delta_y, apply_corr_mask); | |
delta_z = _mm256_and_ps(delta_z, apply_corr_mask); | |
// Update positions in local buffer | |
_mm256_store_ps(&px_local[8], _mm256_sub_ps(p0x, _mm256_mul_ps(delta_x, p0m))); | |
_mm256_store_ps(&py_local[8], _mm256_sub_ps(p0y, _mm256_mul_ps(delta_y, p0m))); | |
_mm256_store_ps(&pz_local[8], _mm256_sub_ps(p0z, _mm256_mul_ps(delta_z, p0m))); | |
_mm256_storeu_ps(&px_local[9], _mm256_add_ps(p1x, _mm256_mul_ps(delta_x, p1m))); | |
_mm256_storeu_ps(&py_local[9], _mm256_add_ps(p1y, _mm256_mul_ps(delta_y, p1m))); | |
_mm256_storeu_ps(&pz_local[9], _mm256_add_ps(p1z, _mm256_mul_ps(delta_z, p1m))); | |
// Recalculate constraint error *after* position updates for accurate lambda update | |
// Reload updated positions from local buffer | |
p0x = _mm256_load_ps(&px_local[8]); | |
p0y = _mm256_load_ps(&py_local[8]); | |
p0z = _mm256_load_ps(&pz_local[8]); | |
p1x = _mm256_loadu_ps(&px_local[9]); | |
p1y = _mm256_loadu_ps(&py_local[9]); | |
p1z = _mm256_loadu_ps(&pz_local[9]); | |
__m256 updated_dx = _mm256_sub_ps(p1x, p0x); | |
__m256 updated_dy = _mm256_sub_ps(p1y, p0y); | |
__m256 updated_dz = _mm256_sub_ps(p1z, p0z); | |
__m256 updated_dist_sq = _mm256_mul_ps(updated_dx, updated_dx); | |
updated_dist_sq = _mm256_fmadd_ps(updated_dy, updated_dy, updated_dist_sq); | |
updated_dist_sq = _mm256_fmadd_ps(updated_dz, updated_dz, updated_dist_sq); | |
__m256 updated_inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(updated_dist_sq, eps_vec)); | |
__m256 updated_dist = _mm256_rcp_ps(updated_inv_dist); | |
__m256 C_x_updated = _mm256_sub_ps(updated_dist, rest_len_vec); // New error after primal update | |
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated)) | |
__m256 new_lambda_vec = _mm256_fmadd_ps(current_k_vec, C_x_updated, current_lambda_vec); | |
_mm256_store_ps(&cs->dist_lambda[r_base + 8], new_lambda_vec); // Store back | |
// Update k_current (Stiffness ramping, Eq. 12) | |
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|) | |
__m256 abs_C_x_updated = _mm256_and_ps(C_x_updated, abs_mask_ps); // Absolute value using bitmask | |
__m256 k_updated_vec = _mm256_fmadd_ps(beta_vec, abs_C_x_updated, current_k_vec); | |
__m256 new_k_vec = _mm256_min_ps(k_max_vec, k_updated_vec); | |
_mm256_store_ps(&cs->dist_k_current[r_base + 8], new_k_vec); // Store back | |
} // End of second block | |
} | |
// After all iterations, save the final positions from local buffer to global cs->p_pos_x/y/z | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_load_ps(&px_local[i])); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_load_ps(&py_local[i])); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_load_ps(&pz_local[i])); | |
} | |
} | |
// Step 4: Sphere Collisions with Friction | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int s_base = c * SPHERES_PER_CHAIN; | |
struct chain_cfg *cfg = &cs->chain_configs[c]; | |
__m256 friction_vec = _mm256_set1_ps(cfg->friction); | |
for (int s = 0; s < SPHERES_PER_CHAIN; s++) { | |
__m256 sphere_x_vec = _mm256_set1_ps(cs->sphere_x[s_base + s]); | |
__m256 sphere_y_vec = _mm256_set1_ps(cs->sphere_y[s_base + s]); | |
__m256 sphere_z_vec = _mm256_set1_ps(cs->sphere_z[s_base + s]); | |
__m256 sphere_r_vec = _mm256_set1_ps(cs->sphere_radius[s_base + s]); | |
__m256 alpha_coll_vec = _mm256_set1_ps(cfg->avbd_alpha); | |
__m256 beta_coll_vec = _mm256_set1_ps(cfg->avbd_beta); | |
__m256 k_max_coll_vec = _mm256_set1_ps(cfg->avbd_k_max); | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 p_prev_x = _mm256_load_ps(&cs->prev_p_pos_x[p_base + i]); | |
__m256 p_prev_y = _mm256_load_ps(&cs->prev_p_pos_y[p_base + i]); | |
__m256 p_prev_z = _mm256_load_ps(&cs->prev_p_pos_z[p_base + i]); | |
__m256 p_inv_mass = _mm256_load_ps(&cs->inv_mass[p_base + i]); | |
p_inv_mass = _mm256_max_ps(_mm256_min_ps(p_inv_mass, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 dx = _mm256_sub_ps(p_curr_x, sphere_x_vec); | |
__m256 dy = _mm256_sub_ps(p_curr_y, sphere_y_vec); | |
__m256 dz = _mm256_sub_ps(p_curr_z, sphere_z_vec); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 penetration = _mm256_sub_ps(sphere_r_vec, dist); | |
__m256 collision_mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ); | |
__m256 normal_x = _mm256_mul_ps(dx, inv_dist); | |
__m256 normal_y = _mm256_mul_ps(dy, inv_dist); | |
__m256 normal_z = _mm256_mul_ps(dz, inv_dist); | |
// Load AVBD state for this particle block | |
__m256 coll_lambda_vec = _mm256_load_ps(&cs->coll_lambda[p_base + i]); | |
__m256 coll_k_current_vec = _mm256_load_ps(&cs->coll_k_current[p_base + i]); | |
__m256 coll_prev_frame_error_vec = _mm256_load_ps(&cs->coll_prev_frame_error[p_base + i]); | |
// AVBD Primal Update: | |
// Regularized error C_j(x) = C_j*(x) - alpha * C_j*(x_prev_frame) | |
__m256 C_x_regularized = _mm256_sub_ps(penetration, _mm256_mul_ps(alpha_coll_vec, coll_prev_frame_error_vec)); | |
// Apply collision_mask to regularized error, so only active constraints contribute. | |
// This ensures C_x_regularized is 0 for non-penetrating particles, making force_term 0. | |
C_x_regularized = _mm256_and_ps(C_x_regularized, collision_mask); | |
// Calculate force term: (k * C_reg + lambda) | |
__m256 force_term = _mm256_fmadd_ps(coll_k_current_vec, C_x_regularized, coll_lambda_vec); | |
// Calculate correction magnitude (delta_p in paper): force_term * inv_mass / k_current | |
// For collision, delta_x = -nablaC_j(x) * (H_j_inv * (lambda_j + K_j C_j(x))). | |
// H_j_inv for a single particle collision is its inverse mass (p_inv_mass). | |
// So, the correction should be along the normal: -normal * (force_term * p_inv_mass) / k_current | |
// The force_term already has the appropriate sign for correction direction when C_x_regularized is positive (penetration). | |
__m256 correction_magnitude_scalar = _mm256_div_ps(_mm256_mul_ps(force_term, p_inv_mass), coll_k_current_vec); | |
// Apply the correction along the normal direction. | |
// The sign of 'penetration' (and thus 'C_x_regularized' and 'force_term') | |
// means positive 'correction_magnitude_scalar' should push OUT of collision. | |
// So we subtract from particle position. | |
__m256 delta_pos_x = _mm256_mul_ps(normal_x, correction_magnitude_scalar); | |
__m256 delta_pos_y = _mm256_mul_ps(normal_y, correction_magnitude_scalar); | |
__m256 delta_pos_z = _mm256_mul_ps(normal_z, correction_magnitude_scalar); | |
// Update positions: p_curr = p_curr - delta_pos (particle moves out of penetration) | |
p_curr_x = _mm256_sub_ps(p_curr_x, delta_pos_x); | |
p_curr_y = _mm256_sub_ps(p_curr_y, delta_pos_y); | |
p_curr_z = _mm256_sub_ps(p_curr_z, delta_pos_z); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z); | |
// Apply friction (post-stabilization step) | |
// Recalculate velocity based on updated positions | |
__m256 vel_x = _mm256_sub_ps(p_curr_x, p_prev_x); | |
__m256 vel_y = _mm256_sub_ps(p_curr_y, p_prev_y); | |
__m256 vel_z = _mm256_sub_ps(p_curr_z, p_prev_z); | |
// Project velocity onto normal to get normal component | |
__m256 vel_dot_normal = _mm256_mul_ps(vel_x, normal_x); | |
vel_dot_normal = _mm256_fmadd_ps(vel_y, normal_y, vel_dot_normal); | |
vel_dot_normal = _mm256_fmadd_ps(vel_z, normal_z, vel_dot_normal); | |
// Get tangential velocity component | |
__m256 tangent_x = _mm256_sub_ps(vel_x, _mm256_mul_ps(vel_dot_normal, normal_x)); | |
__m256 tangent_y = _mm256_sub_ps(vel_y, _mm256_mul_ps(vel_dot_normal, normal_y)); | |
__m256 tangent_z = _mm256_sub_ps(vel_z, _mm256_mul_ps(vel_dot_normal, normal_z)); | |
__m256 tangent_mag_sq = _mm256_mul_ps(tangent_x, tangent_x); | |
tangent_mag_sq = _mm256_fmadd_ps(tangent_y, tangent_y, tangent_mag_sq); | |
tangent_mag_sq = _mm256_fmadd_ps(tangent_z, tangent_z, tangent_mag_sq); | |
__m256 inv_tangent_mag = _mm256_rsqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec)); | |
// Calculate friction force magnitude limit (based on normal force proxy) | |
__m256 friction_mag_limit = _mm256_mul_ps(penetration, friction_vec); | |
// Actual tangential velocity length | |
__m256 actual_tangent_mag = _mm256_sqrt_ps(_mm256_max_ps(tangent_mag_sq, eps_vec)); // tangential speed of this step's displacement | |
// Calculate the scaling factor for friction. If actual_tangent_mag is greater than limit, scale down. | |
__m256 friction_scale = _mm256_div_ps(friction_mag_limit, _mm256_max_ps(actual_tangent_mag, eps_vec)); | |
friction_scale = _mm256_min_ps(friction_scale, one_vec); // Clamp to 1.0 (no acceleration from friction) | |
// Apply friction as a reduction of tangential velocity, converting to position change | |
// The idea is to reduce the tangential displacement that happened *this* iteration. | |
// The original `vel_x/y/z` are already displacements from `p_prev` to `p_curr`. | |
// So, the tangential displacement is `tangent_x/y/z`. | |
// We remove a `friction_scale` fraction of it, so `(1 - friction_scale)` of the tangential slide remains. | |
__m256 friction_x_corr = _mm256_mul_ps(tangent_x, friction_scale); | |
__m256 friction_y_corr = _mm256_mul_ps(tangent_y, friction_scale); | |
__m256 friction_z_corr = _mm256_mul_ps(tangent_z, friction_scale); | |
// Only apply if actively colliding | |
friction_x_corr = _mm256_and_ps(friction_x_corr, collision_mask); | |
friction_y_corr = _mm256_and_ps(friction_y_corr, collision_mask); | |
friction_z_corr = _mm256_and_ps(friction_z_corr, collision_mask); | |
// Apply friction by moving the particle along the tangential direction. | |
// If friction_scale is 0 (no friction), no change. If 1 (full friction), moves back to p_prev. | |
p_curr_x = _mm256_sub_ps(p_curr_x, friction_x_corr); | |
p_curr_y = _mm256_sub_ps(p_curr_y, friction_y_corr); | |
p_curr_z = _mm256_sub_ps(p_curr_z, friction_z_corr); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], p_curr_x); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], p_curr_y); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], p_curr_z); | |
// Dual Update for Collision Constraints --- | |
// Recalculate collision error *after* position updates (normal & friction) | |
// Reload updated positions | |
p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
dx = _mm256_sub_ps(p_curr_x, sphere_x_vec); | |
dy = _mm256_sub_ps(p_curr_y, sphere_y_vec); | |
dz = _mm256_sub_ps(p_curr_z, sphere_z_vec); | |
dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
dist = _mm256_rcp_ps(inv_dist); | |
__m256 C_x_updated_collision = _mm256_sub_ps(sphere_r_vec, dist); // New error after primal update | |
// Apply the collision mask again for updates, so only active constraints update their AVBD state | |
C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, collision_mask); | |
// Update Lambda (Eq. 11: lambda_new = lambda_old + k_current * C_j(x_updated)) | |
__m256 new_coll_lambda_vec = _mm256_fmadd_ps(coll_k_current_vec, C_x_updated_collision, coll_lambda_vec); | |
_mm256_store_ps(&cs->coll_lambda[p_base + i], new_coll_lambda_vec); | |
// Update k_current (Stiffness ramping, Eq. 12) | |
// k_new = min(K_MAX, k_current + BETA * |C_j(x_updated)|) | |
__m256 abs_C_x_updated_collision = _mm256_and_ps(C_x_updated_collision, abs_mask_ps); // Absolute value | |
__m256 k_updated_coll_vec = _mm256_fmadd_ps(beta_coll_vec, abs_C_x_updated_collision, coll_k_current_vec); | |
__m256 new_coll_k_vec = _mm256_min_ps(k_max_coll_vec, k_updated_coll_vec); | |
_mm256_store_ps(&cs->coll_k_current[p_base + i], new_coll_k_vec); | |
// --- END AVBD ADDITION --- | |
} | |
} | |
} | |
// Step 5: Motion Constraints | |
for (int c = 0; c < MAX_CHAINS; c++) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += 8) { | |
__m256 px = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 py = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 pz = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 pm = _mm256_load_ps(&cs->inv_mass[p_base + i]); | |
pm = _mm256_max_ps(_mm256_min_ps(pm, inv_mass_clamp_max), inv_mass_clamp_min); | |
__m256 rx = _mm256_load_ps(&cs->rest_pos_x[p_base + i]); | |
__m256 ry = _mm256_load_ps(&cs->rest_pos_y[p_base + i]); | |
__m256 rz = _mm256_load_ps(&cs->rest_pos_z[p_base + i]); | |
__m256 radius_vec = _mm256_load_ps(&cs->motion_radius[p_base + i]); | |
__m256 dx = _mm256_sub_ps(px, rx); | |
__m256 dy = _mm256_sub_ps(py, ry); | |
__m256 dz = _mm256_sub_ps(pz, rz); | |
__m256 dist_sq = _mm256_mul_ps(dx, dx); | |
dist_sq = _mm256_fmadd_ps(dy, dy, dist_sq); | |
dist_sq = _mm256_fmadd_ps(dz, dz, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 penetration = _mm256_sub_ps(dist, radius_vec); | |
__m256 mask = _mm256_cmp_ps(penetration, zero_vec, _CMP_GT_OQ); | |
__m256 corr_factor = _mm256_mul_ps(penetration, inv_dist); | |
corr_factor = _mm256_and_ps(corr_factor, mask); | |
__m256 delta_x = _mm256_mul_ps(dx, corr_factor); | |
__m256 delta_y = _mm256_mul_ps(dy, corr_factor); | |
__m256 delta_z = _mm256_mul_ps(dz, corr_factor); | |
delta_x = _mm256_mul_ps(delta_x, pm); | |
delta_y = _mm256_mul_ps(delta_y, pm); | |
delta_z = _mm256_mul_ps(delta_z, pm); | |
_mm256_store_ps(&cs->p_pos_x[p_base + i], _mm256_sub_ps(px, delta_x)); | |
_mm256_store_ps(&cs->p_pos_y[p_base + i], _mm256_sub_ps(py, delta_y)); | |
_mm256_store_ps(&cs->p_pos_z[p_base + i], _mm256_sub_ps(pz, delta_z)); | |
} | |
} | |
// Store Final Errors for Next Frame's Regularization --- | |
// This happens once at the very end of simulate_chains, after all iterations are complete for the frame. | |
for (int c = 0; c < MAX_CHAINS; ++c) { | |
int p_base = c * PARTICLES_PER_CHAIN; | |
int r_base = c * CONSTRAINTS_PER_CHAIN; | |
// Loop through each block of 8 constraints (distance constraints are N-1 for N particles) | |
for (int i = 0; i < CONSTRAINTS_PER_CHAIN - 1; i += VEC_FLOAT_COUNT) { | |
// Calculate the current error for this distance constraint block | |
// (Similar to C_x_original calculation from Step 3, but using final positions) | |
// Load final positions for p0 and p1 (from global arrays now, as local buffer is stale) | |
__m256 px0 = _mm256_load_ps(&cs->p_pos_x[p_base + i + 0]); | |
__m256 py0 = _mm256_load_ps(&cs->p_pos_y[p_base + i + 0]); | |
__m256 pz0 = _mm256_load_ps(&cs->p_pos_z[p_base + i + 0]); | |
__m256 px1 = _mm256_loadu_ps(&cs->p_pos_x[p_base + i + 1]); // +1 offset is not 32-byte aligned, so use unaligned loads | |
__m256 py1 = _mm256_loadu_ps(&cs->p_pos_y[p_base + i + 1]); | |
__m256 pz1 = _mm256_loadu_ps(&cs->p_pos_z[p_base + i + 1]); | |
__m256 diff_x = _mm256_sub_ps(px1, px0); | |
__m256 diff_y = _mm256_sub_ps(py1, py0); | |
__m256 diff_z = _mm256_sub_ps(pz1, pz0); | |
__m256 dist_sq = _mm256_mul_ps(diff_x, diff_x); | |
dist_sq = _mm256_fmadd_ps(diff_y, diff_y, dist_sq); | |
dist_sq = _mm256_fmadd_ps(diff_z, diff_z, dist_sq); | |
__m256 inv_dist = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq, eps_vec)); | |
__m256 dist = _mm256_rcp_ps(inv_dist); | |
__m256 rest_len_vec = _mm256_load_ps(&cs->rest_lengths[r_base + i]); | |
__m256 final_error_vec = _mm256_sub_ps(dist, rest_len_vec); | |
_mm256_store_ps(&cs->dist_prev_frame_error[r_base + i], final_error_vec); // Store for next frame's regularization | |
} | |
} | |
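// Store final collision errors (deepest sphere penetration per particle) for next frame's AVBD regularization. | |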
for (int c_idx = 0; c_idx < MAX_CHAINS; ++c_idx) { | |
int p_base = c_idx * PARTICLES_PER_CHAIN; | |
int s_base = c_idx * SPHERES_PER_CHAIN; // For accessing sphere data for this chain | |
for (int i = 0; i < PARTICLES_PER_CHAIN; i += VEC_FLOAT_COUNT) { | |
__m256 p_curr_x = _mm256_load_ps(&cs->p_pos_x[p_base + i]); | |
__m256 p_curr_y = _mm256_load_ps(&cs->p_pos_y[p_base + i]); | |
__m256 p_curr_z = _mm256_load_ps(&cs->p_pos_z[p_base + i]); | |
__m256 deepest_penetration_vec = neg_flt_max_vec; // Start from -FLT_MAX; the max() below yields the deepest (clamped >= 0) penetration | |
// Loop through all spheres for this chain to find the max penetration for each particle in the block | |
for (int s_idx = 0; s_idx < SPHERES_PER_CHAIN; ++s_idx) { | |
__m256 s_x = _mm256_set1_ps(cs->sphere_x[s_base + s_idx]); | |
__m256 s_y = _mm256_set1_ps(cs->sphere_y[s_base + s_idx]); | |
__m256 s_z = _mm256_set1_ps(cs->sphere_z[s_base + s_idx]); | |
__m256 s_r = _mm256_set1_ps(cs->sphere_radius[s_base + s_idx]); | |
__m256 dx_s = _mm256_sub_ps(p_curr_x, s_x); | |
__m256 dy_s = _mm256_sub_ps(p_curr_y, s_y); | |
__m256 dz_s = _mm256_sub_ps(p_curr_z, s_z); | |
__m256 dist_sq_s = _mm256_mul_ps(dx_s, dx_s); | |
dist_sq_s = _mm256_fmadd_ps(dy_s, dy_s, dist_sq_s); | |
dist_sq_s = _mm256_fmadd_ps(dz_s, dz_s, dist_sq_s); | |
__m256 inv_dist_s = _mm256_rsqrt_ps(_mm256_max_ps(dist_sq_s, eps_vec)); | |
__m256 dist_s = _mm256_rcp_ps(inv_dist_s); | |
__m256 current_penetration_s = _mm256_sub_ps(s_r, dist_s); // C(x) = radius - distance | |
// We only care about positive penetration (i.e., actually colliding) | |
current_penetration_s = _mm256_max_ps(zero_vec, current_penetration_s); | |
deepest_penetration_vec = _mm256_max_ps(deepest_penetration_vec, current_penetration_s); | |
} | |
_mm256_store_ps(&cs->coll_prev_frame_error[p_base + i], deepest_penetration_vec); // Store for next frame's regularization | |
} | |
} | |
} |
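/*
 * Reference sketch (not used by the SIMD path above): a minimal scalar version of the
 * per-constraint AVBD distance update performed in Step 3, kept for documentation and
 * debugging. The function name and signature are illustrative assumptions only; the
 * vectorized loops above remain the actual implementation. The per-frame warm-start
 * (Eq. 19: lambda *= alpha * gamma, k = max(K_START, k * gamma)) happens separately,
 * as in the warm-starting loops before Step 3.
 */
static inline void avbd_distance_constraint_scalar(
    float pa[3], float pb[3],          /* particle positions (in/out) */
    float wa, float wb,                /* inverse masses */
    float rest_len, float prev_err,    /* rest length and last frame's error C*(x_prev) */
    float *lambda, float *k,           /* AVBD dual variable and ramped stiffness (in/out) */
    float alpha, float beta, float k_max) {
  float dx = pb[0] - pa[0], dy = pb[1] - pa[1], dz = pb[2] - pa[2];
  float dist = sqrtf(dx * dx + dy * dy + dz * dz);
  if (dist <= 1e-6f || *k <= 0.0f) return;     /* mirrors the eps/validity masks above */
  float C = dist - rest_len;                   /* original constraint error C*(x) */
  float C_reg = C - alpha * prev_err;          /* regularized error (Eq. 18) */
  float corr = (*k * C_reg + *lambda) / *k;    /* simplified displacement factor, as in Step 3 */
  float nx = dx / dist, ny = dy / dist, nz = dz / dist;
  pa[0] -= nx * corr * wa; pa[1] -= ny * corr * wa; pa[2] -= nz * corr * wa;
  pb[0] += nx * corr * wb; pb[1] += ny * corr * wb; pb[2] += nz * corr * wb;
  /* dual update on the post-correction error */
  dx = pb[0] - pa[0]; dy = pb[1] - pa[1]; dz = pb[2] - pa[2];
  float C_new = sqrtf(dx * dx + dy * dy + dz * dz) - rest_len;
  *lambda += *k * C_new;                       /* Eq. 11: lambda_new = lambda + k * C(x_updated) */
  *k = fminf(k_max, *k + beta * fabsf(C_new)); /* Eq. 12: stiffness ramp, clamped to K_MAX */
}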