Skip to content

Instantly share code, notes, and snippets.

@SeijiEmery
Last active July 23, 2020 20:16
Show Gist options
  • Save SeijiEmery/b2e2e3ab41c289e7a9ae038063b659c8 to your computer and use it in GitHub Desktop.
Save SeijiEmery/b2e2e3ab41c289e7a9ae038063b659c8 to your computer and use it in GitHub Desktop.
fast sphere collision tests w/ 128bit simd
/// fast sphere collision checks w/ 128-bit SIMD
/// (d pseudocode; all these intrinsics -should- exist, however,
/// and ofc neon_check_sphere_collision could just be written
/// in armv8 assembly or whatever)
float dist2 (vec3 a, vec3 b) {
return (a - b).dot;
}
float dot (vec3 a) {
return a.x + a.y + a.z;
}
bool check_sphere_collision(vec3 c, float r, vec3 p) {
return dist2(c, p) <= r * r;
}
neon_vec4 vp_neon_load (vec4*);
float vp_neon_dp (neon_vec4 a, neon_vec4 b);
neon_vec4 vp_neon_mul (neon_vec4 a, neon_vec4 b);
neon_vec4 vp_neon_sub (neon_vec4 a, neon_vec4 b);
/// can treat spheres as vec4s!!!!! :))))
struct Sphere { float x, y, z, r; }
/// idea:
/// - do sphere distance -and- radius check w/ a dot product + sub op
/// - subtract to get dist btw two points
/// - flip the .r value (adds a mul op w/ a vec4 constant - alternatively could you just do a xor on the .w sign bit???)
/// - as .r is now negative (and will be the -only- negative value after a dp),
/// can do the rest of this w/ just a dp op + floating point compare
/// - need to convert p to a vec4 w/ .w = 0, but... if stack is laid out
/// correctly (and we have a 32-bit zero value in the .w position following the vec3),
/// then we -should- be able to just load it as a vec4 w/ .w = 0 :)
/// - that said if we're just passing in 2 pointers instead of 32 bytes of stack memory,
/// then it would probably be prudent to just mask out the .w value w/ neon instructions
/// sidenote:
/// if we're regularly treating vec3s as masked out vec4s, then vec3s -must- always
/// have sufficient padding to NOT potentially cause a segmentation fault.
/// for structs, this could be done by always placing vec3 values at the front or center
/// of the struct, ie. don't ever place a vec3 as the last member of a struct or class,
/// or if you do, add padding
bool neon_check_sphere_collision(Sphere s, vec3 p, float __pad = 0) {
static const vec4 sign_flip = vec4(1, 1, 1, -1);
auto a = (cast(vec4*)(&s)).vp_neon_load;
auto b = (cast(vec4*)(&p)).vp_neon_load;
assert(b.w == 0);
// (s.x - p.x, s.y - p.y, s.z - p.z, s.r)
auto d = a.vp_neon_sub(b);
// (d.x, d.y, d.z, -d.r)
auto d2 = d.vp_neon_mul((&sign_flip).vp_neon_load);
// d.x ** 2 + d.y ** 2 + d.z ** 2 - d.r ** 2 <= 0
// ie. dist2(s.xyz, p.xyz) <= s.r * s.r
return vp_neon_dp(d, d2) <= 0;
}
/// somewhat brilliant: this translates perfectly well to sphere-sphere checks as well!! :)
bool neon_check_sphere_collision(Sphere s1, Sphere s2) {
auto a = s1.vp_neon_load;
auto b = s2.vp_neon_load;
/// flip .r sign on the second argument so we're adding s1.r + s2.r instead of subtracting
b.flip_w_sign();
/// (s1.x - s2.x, s1.y - s2.y, s1.z - s2.z, s1.r + s2.r)
auto d = vp_neon_sub(a, b);
/// (d.x, d.y, d.z, -d.r)
auto d2 = d;
d2.flip_w_sign();
/// d.x ** 2 + d.y ** 2 + d.z ** 2 - (s1.r + s2.r) ** 2 <= 0
// e. dist2(s1.xyz, s2.xyz) <= (s1.r + s2.r) ** 2
return vp_neon_dp(d, d2) <= 0;
}
/// note that in principle this should be quite a bit faster than AABB tests,
/// is short-circuiting but -many- more CPU instructions + branches... unless that could
/// be accelerated?
struct AABB3 {
vec3 a;
vec3 b;
}
bool check_aabb_point_intersection(AABB3 bounds, vec3 p) {
return !(
p.x < bounds.a.x || p.x > bounds.b.x ||
p.y < bounds.a.y || p.y > bounds.b.y ||
p.z < bounds.a.z || p.z > bounds.b.z
);
}
bool check_aabb_aabb_intersection(AABB3 b1, AABB b2) {
return !(
b2.b.x < b1.a.x || b2.a.x > b1.b.x ||
b2.b.y < b1.a.y || b2.a.y > b1.b.y ||
b2.b.z < b1.a.z || b2.a.z > b1.b.z
);
}
bool check_aabb_intersection2(vec4 a, vec4 b, vec4 c, vec4 d) {
return !(
d.x < a.x || b.x < c.x ||
d.y < a.y || b.y < c.y ||
d.z < a.z || b.z < c.z
);
}
bool neon_check_aabb_intersection2(vec4 a, vec4 b, vec4 c, vec4 d) {
auto r1 = d - a;
auto r2 = b - c;
return !(r1.any_negative || r2.any_negative);
}
bool neon_check_aabb_point_intersection(AABB3 bounds, vec3 p) {
return !(
(p - bounds.a).any_negative ||
(bounds.b - p).any_negative
);
}
bool neon_check_aabb_aabb_intersection(AABB3 b1, AABB b2) {
return !(
(b2.b - b1.a).any_negative ||
(b1.b - b2.a).any_negative
);
}
// so yes, this can also be accelerated :)
// (assuming that an instruction to check / count signs exists, anyways...)
// edit: ok, if nothing else:
// - use neon sub ops to calculate t1 = b2.b - b1.a, t2 = b1.b - b2.a
// - store back on the stack
// - use mask ops to check for negative signs:
// i32 mask;
// for (int i = 6, i32* v = cast(i32*)&t1.x; i --> 0; ++v) {
// mask |= *v;
// }
// return !(mask & F32_SIGN_BIT);
//
// should... probably look up IEEE spec to see if that's even valid / works
// (what about +/- infinity, NAN, etc)
// ALSO note that w/ 256bit SIMD and popcnt, we can do an AABB-AABB test w/ 3 simd ops!!!
// (plus a bunch of memory shuffling to get values laid out correctly, which may / may not make this worth it...)
bool check_aabb_aabb_test (AABB3 b1, AABB3 b2) {
float[8] v1;
float[8] v2;
*(cast(vec3*)(&v1[0])) = b2.b;
*(cast(vec3*)(&v1[3])) = b1.a;
*(cast(vec2*)(&v1[6])) = vec2(0, 0);
*(cast(vec3*)(&v2[0])) = b1.b;
*(cast(vec3*)(&v2[3])) = b2.a;
*(cast(vec2*)(&v2[6])) = vec2(0, 0);
auto s = v1.load.sub(v2.load);
auto m = load_f8(F32_SIGN_BIT);
return !(popcnt(s & m) == 0);
}
// remaining question: OBBs (and what is the optimal OBB rotation type -
// quaternions, euler angles, normalized vectors...?)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment