Last active
July 23, 2020 20:16
-
-
Save SeijiEmery/b2e2e3ab41c289e7a9ae038063b659c8 to your computer and use it in GitHub Desktop.
fast sphere collision tests w/ 128bit simd
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// fast sphere collision checks w/ 128-bit SIMD | |
/// (d pseudocode; all these intrinsics -should- exist, however, | |
/// and ofc neon_check_sphere_collision could just be written | |
/// in armv8 assembly or whatever) | |
float dist2 (vec3 a, vec3 b) { | |
return (a - b).dot; | |
} | |
float dot (vec3 a) { | |
return a.x + a.y + a.z; | |
} | |
bool check_sphere_collision(vec3 c, float r, vec3 p) { | |
return dist2(c, p) <= r * r; | |
} | |
neon_vec4 vp_neon_load (vec4*); | |
float vp_neon_dp (neon_vec4 a, neon_vec4 b); | |
neon_vec4 vp_neon_mul (neon_vec4 a, neon_vec4 b); | |
neon_vec4 vp_neon_sub (neon_vec4 a, neon_vec4 b); | |
/// can treat spheres as vec4s!!!!! :)))) | |
struct Sphere { float x, y, z, r; } | |
/// idea: | |
/// - do sphere distance -and- radius check w/ a dot product + sub op | |
/// - subtract to get dist btw two points | |
/// - flip the .r value (adds a mul op w/ a vec4 constant - alternatively could you just do a xor on the .w sign bit???) | |
/// - as .r is now negative (and will be the -only- negative value after a dp), | |
/// can do the rest of this w/ just a dp op + floating point compare | |
/// - need to convert p to a vec4 w/ .w = 0, but... if stack is laid out | |
/// correctly (and we have a 32-bit zero value in the .w position following the vec3), | |
/// then we -should- be able to just load it as a vec4 w/ .w = 0 :) | |
/// - that said if we're just passing in 2 pointers instead of 32 bytes of stack memory, | |
/// then it would probably be prudent to just mask out the .w value w/ neon instructions | |
/// sidenote: | |
/// if we're regularly treating vec3s as masked out vec4s, then vec3s -must- always | |
/// have sufficient padding to NOT potentially cause a segmentation fault. | |
/// for structs, this could be done by always placing vec3 values at the front or center | |
/// of the struct, ie. don't ever place a vec3 as the last member of a struct or class, | |
/// or if you do, add padding | |
bool neon_check_sphere_collision(Sphere s, vec3 p, float __pad = 0) { | |
static const vec4 sign_flip = vec4(1, 1, 1, -1); | |
auto a = (cast(vec4*)(&s)).vp_neon_load; | |
auto b = (cast(vec4*)(&p)).vp_neon_load; | |
assert(b.w == 0); | |
// (s.x - p.x, s.y - p.y, s.z - p.z, s.r) | |
auto d = a.vp_neon_sub(b); | |
// (d.x, d.y, d.z, -d.r) | |
auto d2 = d.vp_neon_mul((&sign_flip).vp_neon_load); | |
// d.x ** 2 + d.y ** 2 + d.z ** 2 - d.r ** 2 <= 0 | |
// ie. dist2(s.xyz, p.xyz) <= s.r * s.r | |
return vp_neon_dp(d, d2) <= 0; | |
} | |
/// somewhat brilliant: this translates perfectly well to sphere-sphere checks as well!! :) | |
bool neon_check_sphere_collision(Sphere s1, Sphere s2) { | |
auto a = s1.vp_neon_load; | |
auto b = s2.vp_neon_load; | |
/// flip .r sign on the second argument so we're adding s1.r + s2.r instead of subtracting | |
b.flip_w_sign(); | |
/// (s1.x - s2.x, s1.y - s2.y, s1.z - s2.z, s1.r + s2.r) | |
auto d = vp_neon_sub(a, b); | |
/// (d.x, d.y, d.z, -d.r) | |
auto d2 = d; | |
d2.flip_w_sign(); | |
/// d.x ** 2 + d.y ** 2 + d.z ** 2 - (s1.r + s2.r) ** 2 <= 0 | |
// e. dist2(s1.xyz, s2.xyz) <= (s1.r + s2.r) ** 2 | |
return vp_neon_dp(d, d2) <= 0; | |
} | |
/// note that in principle this should be quite a bit faster than AABB tests, | |
/// is short-circuiting but -many- more CPU instructions + branches... unless that could | |
/// be accelerated? | |
struct AABB3 { | |
vec3 a; | |
vec3 b; | |
} | |
bool check_aabb_point_intersection(AABB3 bounds, vec3 p) { | |
return !( | |
p.x < bounds.a.x || p.x > bounds.b.x || | |
p.y < bounds.a.y || p.y > bounds.b.y || | |
p.z < bounds.a.z || p.z > bounds.b.z | |
); | |
} | |
bool check_aabb_aabb_intersection(AABB3 b1, AABB b2) { | |
return !( | |
b2.b.x < b1.a.x || b2.a.x > b1.b.x || | |
b2.b.y < b1.a.y || b2.a.y > b1.b.y || | |
b2.b.z < b1.a.z || b2.a.z > b1.b.z | |
); | |
} | |
bool check_aabb_intersection2(vec4 a, vec4 b, vec4 c, vec4 d) { | |
return !( | |
d.x < a.x || b.x < c.x || | |
d.y < a.y || b.y < c.y || | |
d.z < a.z || b.z < c.z | |
); | |
} | |
bool neon_check_aabb_intersection2(vec4 a, vec4 b, vec4 c, vec4 d) { | |
auto r1 = d - a; | |
auto r2 = b - c; | |
return !(r1.any_negative || r2.any_negative); | |
} | |
bool neon_check_aabb_point_intersection(AABB3 bounds, vec3 p) { | |
return !( | |
(p - bounds.a).any_negative || | |
(bounds.b - p).any_negative | |
); | |
} | |
bool neon_check_aabb_aabb_intersection(AABB3 b1, AABB b2) { | |
return !( | |
(b2.b - b1.a).any_negative || | |
(b1.b - b2.a).any_negative | |
); | |
} | |
// so yes, this can also be accelerated :) | |
// (assuming that an instruction to check / count signs exists, anyways...) | |
// edit: ok, if nothing else: | |
// - use neon sub ops to calculate t1 = b2.b - b1.a, t2 = b1.b - b2.a | |
// - store back on the stack | |
// - use mask ops to check for negative signs: | |
// i32 mask; | |
// for (int i = 6, i32* v = cast(i32*)&t1.x; i --> 0; ++v) { | |
// mask |= *v; | |
// } | |
// return !(mask & F32_SIGN_BIT); | |
// | |
// should... probably look up IEEE spec to see if that's even valid / works | |
// (what about +/- infinity, NAN, etc) | |
// ALSO note that w/ 256bit SIMD and popcnt, we can do an AABB-AABB test w/ 3 simd ops!!! | |
// (plus a bunch of memory shuffling to get values laid out correctly, which may / may not make this worth it...) | |
bool check_aabb_aabb_test (AABB3 b1, AABB3 b2) { | |
float[8] v1; | |
float[8] v2; | |
*(cast(vec3*)(&v1[0])) = b2.b; | |
*(cast(vec3*)(&v1[3])) = b1.a; | |
*(cast(vec2*)(&v1[6])) = vec2(0, 0); | |
*(cast(vec3*)(&v2[0])) = b1.b; | |
*(cast(vec3*)(&v2[3])) = b2.a; | |
*(cast(vec2*)(&v2[6])) = vec2(0, 0); | |
auto s = v1.load.sub(v2.load); | |
auto m = load_f8(F32_SIGN_BIT); | |
return !(popcnt(s & m) == 0); | |
} | |
// remaining question: OBBs (and what is the optimal OBB rotation type - | |
// quaternions, euler angles, normalized vectors...?) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment