|
#include <immintrin.h> |
|
#include <math.h> |
|
#include <assert.h> |
|
#include <string.h> |
|
|
|
/* --- Compiler Feature Macros --- */ |
|
#if defined(_MSC_VER) |
|
#define ALWAYS_INLINE __forceinline |
|
#define RESTRICT __restrict |
|
#define ALIGN_VAR(x) __declspec(align(x)) |
|
#define UNROLL_HINT __pragma(loop(unroll)) |
|
#elif defined(__GNUC__) || defined(__clang__) |
|
#define ALWAYS_INLINE __attribute__((always_inline)) inline |
|
#define RESTRICT __restrict__ |
|
#define ALIGN_VAR(x) __attribute__((aligned(x))) |
|
#define UNROLL_HINT _Pragma("GCC unroll 2") |
|
#else |
|
#define ALWAYS_INLINE inline |
|
#define RESTRICT |
|
#define ALIGN_VAR(x) |
|
#define UNROLL_HINT |
|
#endif |
|
|
|
/* --- Constants --- */ |
|
#define MAX_MOTION_RADIUS 10000.0f |
|
enum { |
|
MAX_CHNS = 1024, |
|
PTC_PER_CHN = 16, |
|
SPH_PER_CHN = 8, |
|
MAX_PTC = (MAX_CHNS * PTC_PER_CHN), |
|
MAX_SPH = (MAX_CHNS * SPH_PER_CHN), |
|
CON_PER_CHN = PTC_PER_CHN, |
|
MAX_REST_LEN = (MAX_CHNS * CON_PER_CHN), |
|
MAX_ITR = 16, |
|
}; |
|
struct chn_cfg { |
|
float damping; |
|
float stiffness; |
|
float friction; |
|
float linear_inertia; |
|
float angular_inertia; |
|
float centrifugal_inertia; |
|
float bend_stiffness; |
|
float rest_curvature; |
|
float _pad[8]; |
|
} ALIGN_VAR(32); |
|
|
|
struct chn_storage_slot { |
|
uint16_t config_id; // 0:2 - Physics Preset ID |
|
uint16_t local_extent; // 2:4 - Max radius in Millimeters (Fixed Point) |
|
uint32_t ptc_packed[15]; // 4:64 - 15 particles, 10-10-10 format |
|
}; // Exactly 64 Bytes |
|
|
|
struct chn_sim { |
|
unsigned char free_idx_cnt; |
|
unsigned char free_idx[MAX_CHNS]; |
|
|
|
// World Transforms (Anchor Point) |
|
float chn_pos_x[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_pos_y[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_pos_z[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_quat_x[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_quat_y[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_quat_z[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_quat_w[MAX_CHNS] ALIGN_VAR(32); |
|
|
|
float chn_prev_pos_x[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_prev_pos_y[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_prev_pos_z[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_prev_quat_x[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_prev_quat_y[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_prev_quat_z[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_prev_quat_w[MAX_CHNS] ALIGN_VAR(32); |
|
|
|
float chn_grav_x[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_grav_y[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_grav_z[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_wind_x[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_wind_y[MAX_CHNS] ALIGN_VAR(32); |
|
float chn_wind_z[MAX_CHNS] ALIGN_VAR(32); |
|
struct chn_cfg chn_cfg[MAX_CHNS] ALIGN_VAR(32); |
|
|
|
// Local Particles |
|
float ptc_pos_x[MAX_PTC] ALIGN_VAR(32); |
|
float ptc_pos_y[MAX_PTC] ALIGN_VAR(32); |
|
float ptc_pos_z[MAX_PTC] ALIGN_VAR(32); |
|
float ptc_prev_pos_x[MAX_PTC] ALIGN_VAR(32); |
|
float ptc_prev_pos_y[MAX_PTC] ALIGN_VAR(32); |
|
float ptc_prev_pos_z[MAX_PTC] ALIGN_VAR(32); |
|
float ptc_inv_mass[MAX_PTC] ALIGN_VAR(32); |
|
|
|
// Motion Radius (Hard Constraints) |
|
float rest_pos_x[MAX_PTC] ALIGN_VAR(32); |
|
float rest_pos_y[MAX_PTC] ALIGN_VAR(32); |
|
float rest_pos_z[MAX_PTC] ALIGN_VAR(32); |
|
float motion_radius[MAX_PTC] ALIGN_VAR(32); |
|
|
|
// Constraints |
|
float ptc_rest_len[MAX_REST_LEN] ALIGN_VAR(32); |
|
float ptc_lambda[MAX_REST_LEN] ALIGN_VAR(32); |
|
float ptc_bend_lambda[MAX_REST_LEN] ALIGN_VAR(32); |
|
|
|
// Colliders |
|
float sph_x[MAX_SPH] ALIGN_VAR(32); |
|
float sph_y[MAX_SPH] ALIGN_VAR(32); |
|
float sph_z[MAX_SPH] ALIGN_VAR(32); |
|
float sph_r[MAX_SPH] ALIGN_VAR(32); |
|
}; |
|
static ALWAYS_INLINE unsigned |
|
chn_sys__enq_val(float val, unsigned ext) { |
|
// Convert extent from mm to meters |
|
float scale_meters = (float)ext / 1000.0f; |
|
|
|
// Normalized coordinate in range [-1, 1] (roughly) |
|
// Protect against div-by-zero if extent is 0 |
|
float inv_scale = (ext > 0) ? (1.0f / scale_meters) : 0.0f; |
|
float normalized = val * inv_scale; |
|
|
|
// Map to 10-bit signed integer range [-512, 511] |
|
int q = (int)roundf(normalized * 511.0f); |
|
|
|
// Hard clamp to prevent bit-bleeding into other channels |
|
if (q > 511) q = 511; |
|
if (q < -512) q = -512; |
|
|
|
// Mask to 10 bits (bit-level representation of the signed int) |
|
return (unsigned)q & 0x3FFu; |
|
} |
|
|
|
static ALWAYS_INLINE unsigned |
|
chn_sys__enq_ptc(const float *RESTRICT pos3, unsigned ext) { |
|
unsigned x = chn_sys__enq_val(pos3[0], ext); |
|
unsigned y = chn_sys__enq_val(pos3[1], ext); |
|
unsigned z = chn_sys__enq_val(pos3[2], ext); |
|
|
|
// Pack into 10-10-10-2 |
|
return x | (y << 10u) | (z << 20u); |
|
} |
|
|
|
static ALWAYS_INLINE float |
|
chn_sys__deq_val(unsigned val_paq, unsigned ext) { |
|
// 1. Sign-extend the 10-bit value to a 32-bit signed integer |
|
// Shift left 22 bits to put the 10th bit at the sign position, |
|
// then arithmetic shift right 22 bits to fill the top with the sign bit. |
|
int s_val = (int)(val_paq << 22u) >> 22; |
|
|
|
// 2. Map back to float |
|
float normalized = (float)s_val / 511.0f; |
|
float scale_meters = (float)ext / 1000.0f; |
|
return normalized * scale_meters; |
|
} |
|
|
|
static ALWAYS_INLINE void |
|
chn_sys__deq_ptc(float *RESTRICT pos3, unsigned ptc, unsigned ext) { |
|
// Extract and dequantize each component with the correct sign extension |
|
pos3[0] = chn_sys__deq_val(ptc & 0x3FFu, ext); |
|
pos3[1] = chn_sys__deq_val((ptc >> 10u) & 0x3FFu, ext); |
|
pos3[2] = chn_sys__deq_val((ptc >> 20u) & 0x3FFu, ext); |
|
} |
|
|
|
extern void |
|
chn_sim_init(struct chn_sim *cns) { |
|
assert(cns); |
|
memset(cns, 0, sizeof(*cns)); |
|
cns->free_idx_cnt = MAX_CHNS; |
|
for (int i = 0; i < MAX_CHNS; ++i) { |
|
cns->free_idx[i] = MAX_CHNS - i - 1; |
|
cns->chn_quat_w[i] = 1.0f; |
|
cns->chn_prev_quat_w[i] = 1.0f; |
|
} |
|
// 2. Initialize Quaternions using AVX2 |
|
__m256 one_v = _mm256_set1_ps(1.0f); |
|
for (int i = 0; i < MAX_CHNS; i += 8) { |
|
_mm256_store_ps(&cns->chn_quat_w[i], one_v); |
|
_mm256_store_ps(&cns->chn_prev_quat_w[i], one_v); |
|
} |
|
|
|
// 3. Initialize Motion Radius using AVX2 |
|
__m256 rad_v = _mm256_set1_ps(MAX_MOTION_RADIUS); |
|
for (int i = 0; i < MAX_PTC; i += 8) { |
|
_mm256_store_ps(&cns->motion_radius[i], rad_v); |
|
} |
|
for (int i = 0; i < MAX_PTC; ++i) { |
|
cns->motion_radius[i] = MAX_MOTION_RADIUS; |
|
} |
|
} |
|
extern int |
|
chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *RESTRICT rest_pos, int cnt) { |
|
assert(cns); |
|
assert(cfg); |
|
assert(rest_pos); |
|
assert(cns->free_idx_cnt > 0); |
|
|
|
int chn = cns->free_idx[--cns->free_idx_cnt]; |
|
cns->chn_cfg[chn] = *cfg; |
|
|
|
int p_base = (chn * PTC_PER_CHN); |
|
int r_base = (chn * CON_PER_CHN); |
|
for (int i = 0; i < PTC_PER_CHN; ++i) { |
|
int logical_i = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1); |
|
int physical_i = p_base + i; |
|
if (logical_i < cnt) { |
|
cns->ptc_pos_x[physical_i] = rest_pos[logical_i*4+0]; |
|
cns->ptc_pos_y[physical_i] = rest_pos[logical_i*4+1]; |
|
cns->ptc_pos_z[physical_i] = rest_pos[logical_i*4+2]; |
|
cns->ptc_inv_mass[physical_i] = rest_pos[logical_i*4+3]; |
|
cns->motion_radius[physical_i] = MAX_MOTION_RADIUS; |
|
} else { |
|
cns->ptc_pos_x[physical_i] = 0.0f; |
|
cns->ptc_pos_y[physical_i] = 0.0f; |
|
cns->ptc_pos_z[physical_i] = 0.0f; |
|
cns->ptc_inv_mass[physical_i] = 0.0f; |
|
cns->motion_radius[physical_i] = MAX_MOTION_RADIUS; |
|
} |
|
cns->rest_pos_x[physical_i] = cns->ptc_pos_x[physical_i]; |
|
cns->rest_pos_y[physical_i] = cns->ptc_pos_y[physical_i]; |
|
cns->rest_pos_z[physical_i] = cns->ptc_pos_z[physical_i]; |
|
|
|
cns->ptc_prev_pos_x[physical_i] = cns->ptc_pos_x[physical_i]; |
|
cns->ptc_prev_pos_y[physical_i] = cns->ptc_pos_y[physical_i]; |
|
cns->ptc_prev_pos_z[physical_i] = cns->ptc_pos_z[physical_i]; |
|
} |
|
for (int i = 0; i < PTC_PER_CHN; ++i) { |
|
int physical_r = r_base + i; |
|
cns->ptc_rest_len[physical_r] = 0.0f; |
|
cns->ptc_lambda[physical_r] = 0.0f; |
|
cns->ptc_bend_lambda[physical_r] = 0.0f; |
|
|
|
int logical_con_start = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1); |
|
if (logical_con_start + 1 < cnt) { |
|
float dx = rest_pos[(logical_con_start+1)*4+0] - rest_pos[logical_con_start*4+0]; |
|
float dy = rest_pos[(logical_con_start+1)*4+1] - rest_pos[logical_con_start*4+1]; |
|
float dz = rest_pos[(logical_con_start+1)*4+2] - rest_pos[logical_con_start*4+2]; |
|
float len = sqrtf(dx*dx + dy*dy + dz*dz); |
|
cns->ptc_rest_len[physical_r] = (len > 1e-6f) ? len : 1e-6f; |
|
} |
|
} |
|
return chn; |
|
} |
|
extern void |
|
chn_sim_del(struct chn_sim *cns, int chn) { |
|
assert(cns); |
|
assert(chn >= 0); |
|
assert(chn < MAX_CHNS); |
|
|
|
int p_base = (chn * PTC_PER_CHN); |
|
int r_base = (chn * CON_PER_CHN); |
|
int s_base = (chn * SPH_PER_CHN); |
|
|
|
cns->free_idx[cns->free_idx_cnt++] = chn; |
|
memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0])); |
|
|
|
cns->chn_quat_x[chn] = 0.0f; |
|
cns->chn_quat_y[chn] = 0.0f; |
|
cns->chn_quat_z[chn] = 0.0f; |
|
cns->chn_quat_w[chn] = 1.0f; |
|
|
|
cns->chn_prev_quat_x[chn] = 0.0f; |
|
cns->chn_prev_quat_y[chn] = 0.0f; |
|
cns->chn_prev_quat_z[chn] = 0.0f; |
|
cns->chn_prev_quat_w[chn] = 1.0f; |
|
|
|
memset(&cns->ptc_pos_x[p_base], 0, PTC_PER_CHN * sizeof(float)); |
|
memset(&cns->ptc_pos_y[p_base], 0, PTC_PER_CHN * sizeof(float)); |
|
memset(&cns->ptc_pos_z[p_base], 0, PTC_PER_CHN * sizeof(float)); |
|
memset(&cns->ptc_inv_mass[p_base], 0, PTC_PER_CHN * sizeof(float)); |
|
memset(&cns->ptc_rest_len[r_base], 0, CON_PER_CHN * sizeof(float)); |
|
for (int i = 0; i < PTC_PER_CHN; ++i) { |
|
cns->motion_radius[p_base + i] = MAX_MOTION_RADIUS; |
|
} |
|
memset(&cns->sph_r[s_base], 0, SPH_PER_CHN * sizeof(float)); |
|
} |
|
extern void |
|
chn_sim_mov(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *restrict rot4) { |
|
assert(cns); |
|
assert(pos3); |
|
assert(rot4); |
|
assert(chn >= 0); |
|
assert(chn < MAX_CHNS); |
|
|
|
cns->chn_pos_x[chn] = pos3[0]; |
|
cns->chn_pos_y[chn] = pos3[1]; |
|
cns->chn_pos_z[chn] = pos3[2]; |
|
|
|
cns->chn_quat_x[chn] = rot4[0]; |
|
cns->chn_quat_y[chn] = rot4[1]; |
|
cns->chn_quat_z[chn] = rot4[2]; |
|
cns->chn_quat_w[chn] = rot4[3]; |
|
} |
|
extern void |
|
chn_sim_tel(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *restrict rot4) { |
|
assert(cns); |
|
assert(pos3); |
|
assert(rot4); |
|
assert(chn >= 0); |
|
assert(chn < MAX_CHNS); |
|
|
|
chn_sim_mov(cns, chn, pos3, rot4); |
|
cns->chn_prev_pos_x[chn] = pos3[0]; |
|
cns->chn_prev_pos_y[chn] = pos3[1]; |
|
cns->chn_prev_pos_z[chn] = pos3[2]; |
|
|
|
cns->chn_prev_quat_x[chn] = rot4[0]; |
|
cns->chn_prev_quat_y[chn] = rot4[1]; |
|
cns->chn_prev_quat_z[chn] = rot4[2]; |
|
cns->chn_prev_quat_w[chn] = rot4[3]; |
|
} |
|
extern void |
|
chn_sim_grav(struct chn_sim *cns, int chn, const float *RESTRICT grav3) { |
|
assert(cns); |
|
assert(grav3); |
|
assert(chn >= 0); |
|
assert(chn < MAX_CHNS); |
|
|
|
cns->chn_grav_x[chn] = grav3[0]; |
|
cns->chn_grav_y[chn] = grav3[1]; |
|
cns->chn_grav_z[chn] = grav3[2]; |
|
} |
|
extern void |
|
chn_sim_wind(struct chn_sim *cns, int chn, const float *RESTRICT wind3) { |
|
assert(cns); |
|
assert(wind3); |
|
assert(chn >= 0); |
|
assert(chn < MAX_CHNS); |
|
|
|
cns->chn_wind_x[chn] = wind3[0]; |
|
cns->chn_wind_y[chn] = wind3[1]; |
|
cns->chn_wind_z[chn] = wind3[2]; |
|
} |
|
extern void |
|
chn_sim_set_sph(struct chn_sim *cns, int chn, const float *RESTRICT spheres, int cnt) { |
|
assert(cns); |
|
assert(spheres); |
|
assert(chn >= 0); |
|
assert(chn < MAX_CHNS); |
|
assert(spheres); |
|
assert(cnt >= 0); |
|
assert(cnt < MAX_SPH); |
|
|
|
int s_base = chn * SPH_PER_CHN; |
|
for (int i = 0; i < cnt && i < SPH_PER_CHN; ++i) { |
|
cns->sph_x[s_base + i] = spheres[i*4+0]; |
|
cns->sph_y[s_base + i] = spheres[i*4+1]; |
|
cns->sph_z[s_base + i] = spheres[i*4+2]; |
|
cns->sph_r[s_base + i] = spheres[i*4+3]; |
|
} |
|
for (int i = cnt; i < SPH_PER_CHN; ++i) { |
|
cns->sph_r[s_base + i] = 0.0f; |
|
} |
|
} |
|
extern void |
|
chn_sim_con_motion(struct chn_sim *cns, int chn, const float *RESTRICT radius, int cnt) { |
|
assert(cns); |
|
assert(radius); |
|
assert(chn >= 0); |
|
assert(chn < MAX_CHNS); |
|
assert(cnt >= 0); |
|
assert(cnt < PTC_PER_CHN); |
|
|
|
int p_base = chn * PTC_PER_CHN; |
|
for (int i = 0; i < cnt && i < PTC_PER_CHN; ++i) { |
|
int physical_i = (i < 8) ? (p_base + i) : (p_base + 8 + (i-8)); |
|
cns->motion_radius[physical_i] = radius[i]; |
|
} |
|
} |
|
static ALWAYS_INLINE void |
|
quat_mul_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, |
|
__m256 *RESTRICT rz, __m256 *RESTRICT rw, |
|
__m256 x1, __m256 y1, __m256 z1, __m256 w1, |
|
__m256 x2, __m256 y2, __m256 z2, __m256 w2) { |
|
|
|
// 1. Compute Cross Product terms (Q1.xyz x Q2.xyz) |
|
__m256 cross_x = _mm256_sub_ps(_mm256_mul_ps(y1, z2), _mm256_mul_ps(z1, y2)); |
|
__m256 cross_y = _mm256_sub_ps(_mm256_mul_ps(z1, x2), _mm256_mul_ps(x1, z2)); |
|
__m256 cross_z = _mm256_sub_ps(_mm256_mul_ps(x1, y2), _mm256_mul_ps(y1, x2)); |
|
|
|
// 2. Compute Dot Product term (Q1.xyz . Q2.xyz) |
|
__m256 dot_x = _mm256_mul_ps(x1, x2); |
|
__m256 dot_y = _mm256_mul_ps(y1, y2); |
|
__m256 dot_z = _mm256_mul_ps(z1, z2); |
|
__m256 dot_sum = _mm256_add_ps(dot_x, _mm256_add_ps(dot_y, dot_z)); |
|
|
|
// 3. Compute W scale terms |
|
__m256 w1x2 = _mm256_mul_ps(w1, x2); |
|
__m256 x1w2 = _mm256_mul_ps(x1, w2); |
|
|
|
__m256 w1y2 = _mm256_mul_ps(w1, y2); |
|
__m256 y1w2 = _mm256_mul_ps(y1, w2); |
|
|
|
__m256 w1z2 = _mm256_mul_ps(w1, z2); |
|
__m256 z1w2 = _mm256_mul_ps(z1, w2); |
|
|
|
// 4. Final Assembly |
|
// X = (w1*x2 + x1*w2) - cross_x |
|
*rx = _mm256_sub_ps(_mm256_add_ps(w1x2, x1w2), cross_x); |
|
// Y = (w1*y2 + y1*w2) - cross_y |
|
*ry = _mm256_sub_ps(_mm256_add_ps(w1y2, y1w2), cross_y); |
|
// Z = (w1*z2 + z1*w2) - cross_z |
|
*rz = _mm256_sub_ps(_mm256_add_ps(w1z2, z1w2), cross_z); |
|
// W = (w1*w2) - dot_sum |
|
*rw = _mm256_sub_ps(_mm256_mul_ps(w1, w2), dot_sum); |
|
} |
|
static ALWAYS_INLINE void |
|
rot_vec_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, __m256 *RESTRICT rz, |
|
__m256 qx, __m256 qy, __m256 qz, __m256 qw, |
|
__m256 vx, __m256 vy, __m256 vz) { |
|
|
|
__m256 two = _mm256_set1_ps(2.0f); |
|
|
|
// 1. Calculate Q.xyz x V |
|
__m256 q_cross_v_x = _mm256_sub_ps(_mm256_mul_ps(qy, vz), _mm256_mul_ps(qz, vy)); |
|
__m256 q_cross_v_y = _mm256_sub_ps(_mm256_mul_ps(qz, vx), _mm256_mul_ps(qx, vz)); |
|
__m256 q_cross_v_z = _mm256_sub_ps(_mm256_mul_ps(qx, vy), _mm256_mul_ps(qy, vx)); |
|
|
|
// 2. Calculate T = 2 * (Q.xyz x V) |
|
__m256 tx = _mm256_mul_ps(two, q_cross_v_x); |
|
__m256 ty = _mm256_mul_ps(two, q_cross_v_y); |
|
__m256 tz = _mm256_mul_ps(two, q_cross_v_z); |
|
|
|
// 3. Calculate Q.xyz x T |
|
__m256 q_cross_t_x = _mm256_sub_ps(_mm256_mul_ps(qy, tz), _mm256_mul_ps(qz, ty)); |
|
__m256 q_cross_t_y = _mm256_sub_ps(_mm256_mul_ps(qz, tx), _mm256_mul_ps(qx, tz)); |
|
__m256 q_cross_t_z = _mm256_sub_ps(_mm256_mul_ps(qx, ty), _mm256_mul_ps(qy, tx)); |
|
|
|
// 4. Calculate Q.w * T |
|
__m256 w_times_t_x = _mm256_mul_ps(qw, tx); |
|
__m256 w_times_t_y = _mm256_mul_ps(qw, ty); |
|
__m256 w_times_t_z = _mm256_mul_ps(qw, tz); |
|
|
|
// 5. Final Result: V + (Q.w * T) + (Q.xyz x T) |
|
*rx = _mm256_add_ps(vx, _mm256_add_ps(w_times_t_x, q_cross_t_x)); |
|
*ry = _mm256_add_ps(vy, _mm256_add_ps(w_times_t_y, q_cross_t_y)); |
|
*rz = _mm256_add_ps(vz, _mm256_add_ps(w_times_t_z, q_cross_t_z)); |
|
} |
|
static ALWAYS_INLINE void |
|
rot_vec_inv_batch8(__m256 *RESTRICT rx, __m256 *RESTRICT ry, __m256 *RESTRICT rz, |
|
__m256 qx, __m256 qy, __m256 qz, __m256 qw, |
|
__m256 vx, __m256 vy, __m256 vz) { |
|
|
|
// Conjugate the quaternion (-x, -y, -z, w) |
|
__m256 neg_qx = _mm256_sub_ps(_mm256_setzero_ps(), qx); |
|
__m256 neg_qy = _mm256_sub_ps(_mm256_setzero_ps(), qy); |
|
__m256 neg_qz = _mm256_sub_ps(_mm256_setzero_ps(), qz); |
|
|
|
// Perform standard rotation using the conjugate |
|
rot_vec_batch8(rx, ry, rz, neg_qx, neg_qy, neg_qz, qw, vx, vy, vz); |
|
} |
|
static ALWAYS_INLINE void |
|
solve_dist(__m256 *RESTRICT p1x, __m256 *RESTRICT p1y, __m256 *RESTRICT p1z, |
|
__m256 invm1, __m256 *RESTRICT p2x, |
|
__m256 *RESTRICT p2y, __m256 *RESTRICT p2z, |
|
__m256 invm2, __m256 rest_len, __m256 compliance, |
|
__m256 *RESTRICT lambda, __m256 eps) { |
|
|
|
__m256 dx = _mm256_sub_ps(*p1x, *p2x); |
|
__m256 dy = _mm256_sub_ps(*p1y, *p2y); |
|
__m256 dz = _mm256_sub_ps(*p1z, *p2z); |
|
__m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz))); |
|
|
|
__m256 inv_d = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps)); |
|
__m256 dL = _mm256_div_ps(_mm256_sub_ps(_mm256_setzero_ps(), |
|
_mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(d2, inv_d), rest_len), |
|
_mm256_mul_ps(compliance, *lambda))), |
|
_mm256_add_ps(_mm256_add_ps(invm1, invm2), compliance)); |
|
dL = _mm256_and_ps(dL, _mm256_cmp_ps(rest_len, _mm256_setzero_ps(), _CMP_GT_OQ)); |
|
*lambda = _mm256_add_ps(*lambda, dL); |
|
|
|
__m256 px = _mm256_mul_ps(_mm256_mul_ps(dx, inv_d), dL); |
|
__m256 py = _mm256_mul_ps(_mm256_mul_ps(dy, inv_d), dL); |
|
__m256 pz = _mm256_mul_ps(_mm256_mul_ps(dz, inv_d), dL); |
|
|
|
*p1x = _mm256_fmadd_ps(px, invm1, *p1x); |
|
*p1y = _mm256_fmadd_ps(py, invm1, *p1y); |
|
*p1z = _mm256_fmadd_ps(pz, invm1, *p1z); |
|
|
|
*p2x = _mm256_fnmadd_ps(px, invm2, *p2x); |
|
*p2y = _mm256_fnmadd_ps(py, invm2, *p2y); |
|
*p2z = _mm256_fnmadd_ps(pz, invm2, *p2z); |
|
} |
|
static ALWAYS_INLINE void |
|
solve_angle(__m256* RESTRICT p0x, __m256* RESTRICT p0y, __m256 *RESTRICT p0z, |
|
__m256 invm0, __m256 *RESTRICT p1x, __m256 *RESTRICT p1y, |
|
__m256 *RESTRICT p1z, __m256 invm1, __m256 *RESTRICT p2x, |
|
__m256 *RESTRICT p2y, __m256 *RESTRICT p2z, __m256 invm2, |
|
__m256 compliance, __m256 *RESTRICT lambda, __m256 target_cos, |
|
__m256 mask, __m256 eps) { |
|
|
|
__m256 e1x = _mm256_sub_ps(*p1x, *p0x); |
|
__m256 e1y = _mm256_sub_ps(*p1y, *p0y); |
|
__m256 e1z = _mm256_sub_ps(*p1z, *p0z); |
|
__m256 l1_sq = _mm256_fmadd_ps(e1x, e1x, _mm256_fmadd_ps(e1y, e1y, _mm256_mul_ps(e1z, e1z))); |
|
__m256 i1 = _mm256_rsqrt_ps(_mm256_max_ps(l1_sq, eps)); |
|
|
|
__m256 e2x = _mm256_sub_ps(*p2x, *p1x); |
|
__m256 e2y = _mm256_sub_ps(*p2y, *p1y); |
|
__m256 e2z = _mm256_sub_ps(*p2z, *p1z); |
|
__m256 l2_sq = _mm256_fmadd_ps(e2x, e2x, _mm256_fmadd_ps(e2y, e2y, _mm256_mul_ps(e2z, e2z))); |
|
__m256 i2 = _mm256_rsqrt_ps(_mm256_max_ps(l2_sq, eps)); |
|
|
|
__m256 n1x = _mm256_mul_ps(e1x, i1); |
|
__m256 n1y = _mm256_mul_ps(e1y, i1); |
|
__m256 n1z = _mm256_mul_ps(e1z, i1); |
|
|
|
__m256 n2x = _mm256_mul_ps(e2x, i2); |
|
__m256 n2y = _mm256_mul_ps(e2y, i2); |
|
__m256 n2z = _mm256_mul_ps(e2z, i2); |
|
|
|
__m256 dot = _mm256_fmadd_ps(n1x, n2x, _mm256_fmadd_ps(n1y, n2y, _mm256_mul_ps(n1z, n2z))); |
|
__m256 C = _mm256_sub_ps(dot, target_cos); |
|
|
|
__m256 gp0x = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1x, n2x), i1); |
|
__m256 gp0y = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1y, n2y), i1); |
|
__m256 gp0z = _mm256_mul_ps(_mm256_fmsub_ps(dot, n1z, n2z), i1); |
|
|
|
__m256 gp2x = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2x, n1x), _mm256_sub_ps(_mm256_setzero_ps(), i2)); |
|
__m256 gp2y = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2y, n1y), _mm256_sub_ps(_mm256_setzero_ps(), i2)); |
|
__m256 gp2z = _mm256_mul_ps(_mm256_fmsub_ps(dot, n2z, n1z), _mm256_sub_ps(_mm256_setzero_ps(), i2)); |
|
|
|
__m256 gp1x = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0x, gp2x)); |
|
__m256 gp1y = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0y, gp2y)); |
|
__m256 gp1z = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0z, gp2z)); |
|
|
|
__m256 w = _mm256_add_ps(_mm256_mul_ps(invm0, _mm256_fmadd_ps(gp0x, gp0x, _mm256_fmadd_ps(gp0y, gp0y, _mm256_mul_ps(gp0z, gp0z)))), |
|
_mm256_add_ps(_mm256_mul_ps(invm1, _mm256_fmadd_ps(gp1x, gp1x, _mm256_fmadd_ps(gp1y, gp1y, _mm256_mul_ps(gp1z, gp1z)))), |
|
_mm256_mul_ps(invm2, _mm256_fmadd_ps(gp2x, gp2x, _mm256_fmadd_ps(gp2y, gp2y, _mm256_mul_ps(gp2z, gp2z)))))); |
|
__m256 dL = _mm256_div_ps(_mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(C, _mm256_mul_ps(compliance, *lambda))), _mm256_add_ps(w, compliance)); |
|
dL = _mm256_and_ps(dL, mask); // CRITICAL: Mask out the delta lambda for the wrap-around lane |
|
|
|
*lambda = _mm256_add_ps(*lambda, dL); |
|
*p0x = _mm256_fmadd_ps(gp0x, _mm256_mul_ps(dL, invm0), *p0x); |
|
*p0y = _mm256_fmadd_ps(gp0y, _mm256_mul_ps(dL, invm0), *p0y); |
|
*p0z = _mm256_fmadd_ps(gp0z, _mm256_mul_ps(dL, invm0), *p0z); |
|
|
|
*p1x = _mm256_fmadd_ps(gp1x, _mm256_mul_ps(dL, invm1), *p1x); |
|
*p1y = _mm256_fmadd_ps(gp1y, _mm256_mul_ps(dL, invm1), *p1y); |
|
*p1z = _mm256_fmadd_ps(gp1z, _mm256_mul_ps(dL, invm1), *p1z); |
|
|
|
*p2x = _mm256_fmadd_ps(gp2x, _mm256_mul_ps(dL, invm2), *p2x); |
|
*p2y = _mm256_fmadd_ps(gp2y, _mm256_mul_ps(dL, invm2), *p2y); |
|
*p2z = _mm256_fmadd_ps(gp2z, _mm256_mul_ps(dL, invm2), *p2z); |
|
} |
|
/* --- Simulation Loop --- */ |
|
extern void |
|
chn_sim_run(struct chn_sim *cns, float dt) { |
|
const __m256 dt2_v = _mm256_set1_ps(dt * dt); |
|
const __m256 eps_v = _mm256_set1_ps(1e-7f); |
|
const __m256 zero_v = _mm256_setzero_ps(); |
|
const __m256i sl = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0); |
|
const __m256i sr = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6); |
|
|
|
ALIGN_VAR(32) float b_fx[8], b_fy[8], b_fz[8]; |
|
ALIGN_VAR(32) float b_vx[8], b_vy[8], b_vz[8]; |
|
ALIGN_VAR(32) float b_qrx[8], b_qry[8], b_qrz[8], b_qrw[8]; |
|
for (int c_blk = 0; c_blk < MAX_CHNS; c_blk += 8) { |
|
/* --- PHASE 1: Batched Anchor Updates --- */ |
|
{ |
|
__m256 qx = _mm256_load_ps(&cns->chn_quat_x[c_blk]); |
|
__m256 qy = _mm256_load_ps(&cns->chn_quat_y[c_blk]); |
|
__m256 qz = _mm256_load_ps(&cns->chn_quat_z[c_blk]); |
|
__m256 qw = _mm256_load_ps(&cns->chn_quat_w[c_blk]); |
|
|
|
__m256 pqx = _mm256_load_ps(&cns->chn_prev_quat_x[c_blk]); |
|
__m256 pqy = _mm256_load_ps(&cns->chn_prev_quat_y[c_blk]); |
|
__m256 pqz = _mm256_load_ps(&cns->chn_prev_quat_z[c_blk]); |
|
__m256 pqw = _mm256_load_ps(&cns->chn_prev_quat_w[c_blk]); |
|
|
|
__m256 cx = _mm256_load_ps(&cns->chn_pos_x[c_blk]); |
|
__m256 cy = _mm256_load_ps(&cns->chn_pos_y[c_blk]); |
|
__m256 cz = _mm256_load_ps(&cns->chn_pos_z[c_blk]); |
|
|
|
__m256 px = _mm256_load_ps(&cns->chn_prev_pos_x[c_blk]); |
|
__m256 py = _mm256_load_ps(&cns->chn_prev_pos_y[c_blk]); |
|
__m256 pz = _mm256_load_ps(&cns->chn_prev_pos_z[c_blk]); |
|
|
|
__m256 fx = _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_x[c_blk]),_mm256_load_ps(&cns->chn_wind_x[c_blk])); |
|
__m256 fy = _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_y[c_blk]),_mm256_load_ps(&cns->chn_wind_y[c_blk])); |
|
__m256 fz = _mm256_add_ps(_mm256_load_ps(&cns->chn_grav_z[c_blk]),_mm256_load_ps(&cns->chn_wind_z[c_blk])); |
|
|
|
__m256 wfx, wfy, wfz; |
|
rot_vec_inv_batch8(&wfx, &wfy, &wfz, qx, qy, qz, qw, fx, fy, fz); |
|
_mm256_store_ps(b_fx, wfx); |
|
_mm256_store_ps(b_fy, wfy); |
|
_mm256_store_ps(b_fz, wfz); |
|
|
|
__m256 lvx, lvy, lvz; |
|
rot_vec_inv_batch8(&lvx, &lvy, &lvz, qx, qy, qz, qw, |
|
_mm256_sub_ps(cx, px), _mm256_sub_ps(cy, py), _mm256_sub_ps(cz, pz)); |
|
|
|
_mm256_store_ps(b_vx, lvx); |
|
_mm256_store_ps(b_vy, lvy); |
|
_mm256_store_ps(b_vz, lvz); |
|
|
|
__m256 qrx, qry, qrz, qrw; |
|
quat_mul_batch8(&qrx, &qry, &qrz, &qrw, |
|
_mm256_sub_ps(zero_v, qx), _mm256_sub_ps(zero_v, qy), _mm256_sub_ps(zero_v, qz), |
|
qw, pqx, pqy, pqz, pqw); |
|
|
|
_mm256_store_ps(b_qrx, qrx); |
|
_mm256_store_ps(b_qry, qry); |
|
_mm256_store_ps(b_qrz, qrz); |
|
_mm256_store_ps(b_qrw, qrw); |
|
|
|
_mm256_store_ps(&cns->chn_prev_pos_x[c_blk], cx); |
|
_mm256_store_ps(&cns->chn_prev_pos_y[c_blk], cy); |
|
_mm256_store_ps(&cns->chn_prev_pos_z[c_blk], cz); |
|
|
|
_mm256_store_ps(&cns->chn_prev_quat_x[c_blk], qx); |
|
_mm256_store_ps(&cns->chn_prev_quat_y[c_blk], qy); |
|
_mm256_store_ps(&cns->chn_prev_quat_z[c_blk], qz); |
|
_mm256_store_ps(&cns->chn_prev_quat_w[c_blk], qw); |
|
} |
|
/* --- PHASE 2: Local Chain Simulation --- */ |
|
for (int i = 0; i < 8; ++i) { |
|
int c = c_blk + i; |
|
int p_base = c * PTC_PER_CHN, r_base = c * CON_PER_CHN; |
|
struct chn_cfg *cfg = &cns->chn_cfg[c]; |
|
|
|
__m256 damp = _mm256_set1_ps(cfg->damping); |
|
__m256 compl = _mm256_div_ps(dt2_v, _mm256_max_ps(_mm256_set1_ps(cfg->stiffness), eps_v)); |
|
__m256 b_compl = _mm256_div_ps(dt2_v, _mm256_max_ps(_mm256_set1_ps(cfg->bend_stiffness), eps_v)); |
|
__m256 t_cos = _mm256_set1_ps(1.0f - (cfg->rest_curvature * 0.5f)); |
|
|
|
__m256 qrx = _mm256_set1_ps(b_qrx[i]); |
|
__m256 qry = _mm256_set1_ps(b_qry[i]); |
|
__m256 qrz = _mm256_set1_ps(b_qrz[i]); |
|
__m256 qrw = _mm256_set1_ps(b_qrw[i]); |
|
__m256 o_sq = _mm256_add_ps(_mm256_mul_ps(qrx, qrx), _mm256_add_ps(_mm256_mul_ps(qry, qry), _mm256_mul_ps(qrz, qrz))); |
|
|
|
__m256 lv_x = _mm256_set1_ps(b_vx[i]); |
|
__m256 lv_y = _mm256_set1_ps(b_vy[i]); |
|
__m256 lv_z = _mm256_set1_ps(b_vz[i]); |
|
|
|
__m256 i_l = _mm256_set1_ps(cfg->linear_inertia); |
|
__m256 i_a = _mm256_set1_ps(cfg->angular_inertia); |
|
__m256 i_c = _mm256_set1_ps(cfg->centrifugal_inertia); |
|
|
|
__m256 p_pos[3][2], p_prev[3][2]; |
|
__m256 inv_m[2], rl[2], lm[2], blm[2], r_pos[3][2], m_rad[2]; |
|
|
|
UNROLL_HINT |
|
for (int k = 0; k < 2; ++k) { |
|
int off = p_base + k*8; |
|
int r_off = r_base + k*8; |
|
|
|
p_pos[0][k] = _mm256_load_ps(&cns->ptc_pos_x[off]); |
|
p_pos[1][k] = _mm256_load_ps(&cns->ptc_pos_y[off]); |
|
p_pos[2][k] = _mm256_load_ps(&cns->ptc_pos_z[off]); |
|
|
|
p_prev[0][k] = _mm256_load_ps(&cns->ptc_prev_pos_x[off]); |
|
p_prev[1][k] = _mm256_load_ps(&cns->ptc_prev_pos_y[off]); |
|
p_prev[2][k] = _mm256_load_ps(&cns->ptc_prev_pos_z[off]); |
|
|
|
inv_m[k] = _mm256_load_ps(&cns->ptc_inv_mass[off]); |
|
rl[k] = _mm256_load_ps(&cns->ptc_rest_len[r_off]); |
|
lm[k] = _mm256_load_ps(&cns->ptc_lambda[r_off]); |
|
blm[k] = _mm256_load_ps(&cns->ptc_bend_lambda[r_off]); |
|
|
|
r_pos[0][k] = _mm256_load_ps(&cns->rest_pos_x[off]); |
|
r_pos[1][k] = _mm256_load_ps(&cns->rest_pos_y[off]); |
|
r_pos[2][k] = _mm256_load_ps(&cns->rest_pos_z[off]); |
|
m_rad[k] = _mm256_load_ps(&cns->motion_radius[off]); |
|
} |
|
/* --- Predict + Inertia --- */ |
|
UNROLL_HINT |
|
for (int k = 0; k < 2; ++k) { |
|
__m256 dyn = _mm256_cmp_ps(inv_m[k], zero_v, _CMP_GT_OQ); |
|
__m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]); |
|
__m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]); |
|
__m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]); |
|
|
|
vx = _mm256_sub_ps(vx, _mm256_and_ps(_mm256_mul_ps(lv_x, i_l), dyn)); |
|
vy = _mm256_sub_ps(vy, _mm256_and_ps(_mm256_mul_ps(lv_y, i_l), dyn)); |
|
vz = _mm256_sub_ps(vz, _mm256_and_ps(_mm256_mul_ps(lv_z, i_l), dyn)); |
|
|
|
__m256 rax, ray, raz; |
|
rot_vec_batch8(&rax, &ray, &raz, qrx, qry, qrz, qrw, p_pos[0][k], p_pos[1][k], p_pos[2][k]); |
|
vx = _mm256_sub_ps(vx, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[0][k], rax), i_a), dyn)); |
|
vy = _mm256_sub_ps(vy, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[1][k], ray), i_a), dyn)); |
|
vz = _mm256_sub_ps(vz, _mm256_and_ps(_mm256_mul_ps(_mm256_sub_ps(p_pos[2][k], raz), i_a), dyn)); |
|
|
|
vx = _mm256_add_ps(vx, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[0][k], o_sq), i_c), dyn)); |
|
vy = _mm256_add_ps(vy, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[1][k], o_sq), i_c), dyn)); |
|
vz = _mm256_add_ps(vz, _mm256_and_ps(_mm256_mul_ps(_mm256_mul_ps(p_pos[2][k], o_sq), i_c), dyn)); |
|
|
|
p_prev[0][k] = p_pos[0][k]; |
|
p_prev[1][k] = p_pos[1][k]; |
|
p_prev[2][k] = p_pos[2][k]; |
|
|
|
__m256 fx = _mm256_set1_ps(b_fx[i]); |
|
__m256 fy = _mm256_set1_ps(b_fy[i]); |
|
__m256 fz = _mm256_set1_ps(b_fz[i]); |
|
|
|
p_pos[0][k] = _mm256_fmadd_ps(_mm256_mul_ps(fx, inv_m[k]), dt2_v, _mm256_fmadd_ps(vx, damp, p_pos[0][k])); |
|
p_pos[1][k] = _mm256_fmadd_ps(_mm256_mul_ps(fy, inv_m[k]), dt2_v, _mm256_fmadd_ps(vy, damp, p_pos[1][k])); |
|
p_pos[2][k] = _mm256_fmadd_ps(_mm256_mul_ps(fz, inv_m[k]), dt2_v, _mm256_fmadd_ps(vz, damp, p_pos[2][k])); |
|
} |
|
/* --- XPBD Solver Iterations --- */ |
|
for (int it = 0; it < MAX_ITR; ++it) { |
|
{ |
|
// Even(i) <-> Odd(i) |
|
__m256 p2x = p_pos[0][1]; |
|
__m256 p2y = p_pos[1][1]; |
|
__m256 p2z = p_pos[2][1]; |
|
__m256 p2m = inv_m[1]; |
|
solve_dist(&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0], |
|
&p2x, &p2y, &p2z, p2m, rl[0], compl, &lm[0], eps_v); |
|
|
|
// Direct write-back for k=0 |
|
p_pos[0][1] = p2x; |
|
p_pos[1][1] = p2y; |
|
p_pos[2][1] = p2z; |
|
} |
|
{ |
|
// Odd(i) <-> Shifted Even(i+1) |
|
__m256 p2x = _mm256_permutevar8x32_ps(p_pos[0][0], sl); |
|
__m256 p2y = _mm256_permutevar8x32_ps(p_pos[1][0], sl); |
|
__m256 p2z = _mm256_permutevar8x32_ps(p_pos[2][0], sl); |
|
__m256 p2m = _mm256_permutevar8x32_ps(inv_m[0], sl); |
|
|
|
__m256 o2x = p2x; |
|
__m256 o2y = p2y; |
|
__m256 o2z = p2z; |
|
solve_dist(&p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1], |
|
&p2x, &p2y, &p2z, p2m, rl[1], compl, &lm[1], eps_v); |
|
|
|
// Shifted accumulation for k=1 |
|
p_pos[0][0] = _mm256_add_ps(p_pos[0][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2x, o2x), sr)); |
|
p_pos[1][0] = _mm256_add_ps(p_pos[1][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2y, o2y), sr)); |
|
p_pos[2][0] = _mm256_add_ps(p_pos[2][0], _mm256_permutevar8x32_ps(_mm256_sub_ps(p2z, o2z), sr)); |
|
} |
|
// Bending (Group 1: Odd Center) |
|
{ |
|
__m256 v_b = _mm256_and_ps(_mm256_cmp_ps(rl[0], zero_v, _CMP_GT_OQ), _mm256_cmp_ps(rl[1], zero_v, _CMP_GT_OQ)); |
|
__m256 rx = _mm256_permutevar8x32_ps(p_pos[0][0], sl); |
|
__m256 ry = _mm256_permutevar8x32_ps(p_pos[1][0], sl); |
|
__m256 rz = _mm256_permutevar8x32_ps(p_pos[2][0], sl); |
|
__m256 rm = _mm256_permutevar8x32_ps(inv_m[0], sl); |
|
|
|
solve_angle(&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0], |
|
&p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1], &rx, &ry, &rz, rm, |
|
b_compl, &blm[1], t_cos, v_b, eps_v); |
|
|
|
p_pos[0][0] = _mm256_blendv_ps(p_pos[0][0], _mm256_permutevar8x32_ps(rx, sr), v_b); |
|
p_pos[1][0] = _mm256_blendv_ps(p_pos[1][0], _mm256_permutevar8x32_ps(ry, sr), v_b); |
|
p_pos[2][0] = _mm256_blendv_ps(p_pos[2][0], _mm256_permutevar8x32_ps(rz, sr), v_b); |
|
} |
|
// Bending (Group 2: Even Center) |
|
{ |
|
__m256 v_b = _mm256_and_ps(_mm256_cmp_ps(_mm256_permutevar8x32_ps(rl[1], sr), zero_v, _CMP_GT_OQ), _mm256_cmp_ps(rl[0], zero_v, _CMP_GT_OQ)); |
|
__m256 lx = _mm256_permutevar8x32_ps(p_pos[0][1], sr); |
|
__m256 ly = _mm256_permutevar8x32_ps(p_pos[1][1], sr); |
|
__m256 lz = _mm256_permutevar8x32_ps(p_pos[2][1], sr); |
|
__m256 lmm = _mm256_permutevar8x32_ps(inv_m[1], sr); |
|
|
|
solve_angle(&lx, &ly, &lz, lmm, |
|
&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_m[0], |
|
&p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_m[1], |
|
b_compl, &blm[0], t_cos, v_b, eps_v); |
|
|
|
p_pos[0][1] = _mm256_blendv_ps(p_pos[0][1], _mm256_permutevar8x32_ps(lx, sl), v_b); |
|
p_pos[1][1] = _mm256_blendv_ps(p_pos[1][1], _mm256_permutevar8x32_ps(ly, sl), v_b); |
|
p_pos[2][1] = _mm256_blendv_ps(p_pos[2][1], _mm256_permutevar8x32_ps(lz, sl), v_b); |
|
} |
|
} |
|
/* --- Collisions --- */ |
|
int s_base = c * SPH_PER_CHN; |
|
const __m256 f_coef = _mm256_set1_ps(cfg->friction); |
|
for (int s = 0; s < SPH_PER_CHN; ++s) { |
|
__m256 sr = _mm256_broadcast_ss(&cns->sph_r[s_base + s]); |
|
__m256 sx = _mm256_broadcast_ss(&cns->sph_x[s_base + s]); |
|
__m256 sy = _mm256_broadcast_ss(&cns->sph_y[s_base + s]); |
|
__m256 sz = _mm256_broadcast_ss(&cns->sph_z[s_base + s]); |
|
|
|
UNROLL_HINT |
|
for (int k = 0; k < 2; ++k) { |
|
__m256 dx = _mm256_sub_ps(p_pos[0][k], sx); |
|
__m256 dy = _mm256_sub_ps(p_pos[1][k], sy); |
|
__m256 dz = _mm256_sub_ps(p_pos[2][k], sz); |
|
__m256 d2 = _mm256_add_ps(_mm256_mul_ps(dx, dx), _mm256_add_ps(_mm256_mul_ps(dy, dy), _mm256_mul_ps(dz, dz))); |
|
|
|
// 1. Calculate safe distance |
|
__m256 id = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps_v)); |
|
__m256 dist = _mm256_mul_ps(d2, id); |
|
|
|
// 2. Only calculate push if there is actual penetration and sr > 0 |
|
__m256 active_sph = _mm256_cmp_ps(sr, eps_v, _CMP_GT_OQ); |
|
__m256 colliding = _mm256_cmp_ps(dist, sr, _CMP_LT_OQ); |
|
__m256 mask = _mm256_and_ps(active_sph, colliding); |
|
|
|
__m256 pen = _mm256_sub_ps(sr, dist); |
|
__m256 push = _mm256_and_ps(pen, mask); |
|
|
|
// 3. Normal vector (zeroed if not colliding to prevent NaN propagation) |
|
__m256 nx = _mm256_mul_ps(dx, id); |
|
__m256 ny = _mm256_mul_ps(dy, id); |
|
__m256 nz = _mm256_mul_ps(dz, id); |
|
|
|
p_pos[0][k] = _mm256_add_ps(p_pos[0][k], _mm256_mul_ps(nx, push)); |
|
p_pos[1][k] = _mm256_add_ps(p_pos[1][k], _mm256_mul_ps(ny, push)); |
|
p_pos[2][k] = _mm256_add_ps(p_pos[2][k], _mm256_mul_ps(nz, push)); |
|
|
|
// 4. Friction (Stable Step-by-Step) |
|
__m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]); |
|
__m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]); |
|
__m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]); |
|
__m256 dot = _mm256_add_ps(_mm256_mul_ps(vx, nx), _mm256_add_ps(_mm256_mul_ps(vy, ny), _mm256_mul_ps(vz, nz))); |
|
|
|
// Tangent Velocity = Total Velocity - Normal Velocity |
|
__m256 tx = _mm256_sub_ps(vx, _mm256_mul_ps(dot, nx)); |
|
__m256 ty = _mm256_sub_ps(vy, _mm256_mul_ps(dot, ny)); |
|
__m256 tz = _mm256_sub_ps(vz, _mm256_mul_ps(dot, nz)); |
|
__m256 f_scale = _mm256_mul_ps(push, f_coef); |
|
|
|
// Apply friction only where colliding |
|
p_pos[0][k] = _mm256_sub_ps(p_pos[0][k], _mm256_and_ps(_mm256_mul_ps(tx, f_scale), mask)); |
|
p_pos[1][k] = _mm256_sub_ps(p_pos[1][k], _mm256_and_ps(_mm256_mul_ps(ty, f_scale), mask)); |
|
p_pos[2][k] = _mm256_sub_ps(p_pos[2][k], _mm256_and_ps(_mm256_mul_ps(tz, f_scale), mask)); |
|
} |
|
} |
|
/* --- Motion Radius Guardrail --- */ |
|
UNROLL_HINT |
|
for (int k = 0; k < 2; ++k) { |
|
__m256 dx = _mm256_sub_ps(p_pos[0][k], r_pos[0][k]); |
|
__m256 dy = _mm256_sub_ps(p_pos[1][k], r_pos[1][k]); |
|
__m256 dz = _mm256_sub_ps(p_pos[2][k], r_pos[2][k]); |
|
|
|
__m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz))); |
|
__m256 dist = _mm256_sqrt_ps(d2); |
|
__m256 mask = _mm256_cmp_ps(dist, m_rad[k], _CMP_GT_OQ); |
|
__m256 scale = _mm256_div_ps(m_rad[k], _mm256_max_ps(dist, eps_v)); |
|
|
|
p_pos[0][k] = _mm256_blendv_ps(p_pos[0][k], _mm256_fmadd_ps(dx, scale, r_pos[0][k]), mask); |
|
p_pos[1][k] = _mm256_blendv_ps(p_pos[1][k], _mm256_fmadd_ps(dy, scale, r_pos[1][k]), mask); |
|
p_pos[2][k] = _mm256_blendv_ps(p_pos[2][k], _mm256_fmadd_ps(dz, scale, r_pos[2][k]), mask); |
|
} |
|
/* --- Final Store --- */ |
|
UNROLL_HINT |
|
for (int k = 0; k < 2; ++k) { |
|
int off = p_base + k*8, r_off = r_base + k*8; |
|
_mm256_store_ps(&cns->ptc_pos_x[off], p_pos[0][k]); |
|
_mm256_store_ps(&cns->ptc_pos_y[off], p_pos[1][k]); |
|
_mm256_store_ps(&cns->ptc_pos_z[off], p_pos[2][k]); |
|
|
|
_mm256_store_ps(&cns->ptc_prev_pos_x[off], p_prev[0][k]); |
|
_mm256_store_ps(&cns->ptc_prev_pos_y[off], p_prev[1][k]); |
|
_mm256_store_ps(&cns->ptc_prev_pos_z[off], p_prev[2][k]); |
|
|
|
_mm256_store_ps(&cns->ptc_lambda[r_off], lm[k]); |
|
_mm256_store_ps(&cns->ptc_bend_lambda[r_off], blm[k]); |
|
} |
|
} |
|
} |
|
} |
|
|