SeijiEmery · July 23, 2020 20:16
diff --git a/collision_tests.d b/collision_tests.d
 /// fast sphere collision checks w/ 128-bit SIMD
 /// (d pseudocode; all these intrinsics -should- exist, however,
 ///  and ofc neon_check_sphere_collision could just be written
 ///  in armv8 assembly or whatever)

 float dist2 (vec3 a, vec3 b) {
    return (a - b).dot;
 }
 float dot (vec3 a) {
    return a.x + a.y + a.z;
 }
 bool check_sphere_collision(vec3 c, float r, vec3 p) {
    return dist2(c, p) <= r * r;
 }

 neon_vec4  vp_neon_load  (vec4*);
 float      vp_neon_dp    (neon_vec4 a, neon_vec4 b);
 neon_vec4  vp_neon_mul   (neon_vec4 a, neon_vec4 b);
 neon_vec4  vp_neon_sub   (neon_vec4 a, neon_vec4 b);

 /// can treat spheres as vec4s!!!!! :))))
 struct Sphere { float x, y, z, r; }

 /// idea: 
 ///     - do sphere distance -and- radius check w/ a dot product + sub op
 ///         - subtract to get dist btw two points
 ///         - flip the .r value (adds a mul op w/ a vec4 constant - alternatively could you just do a xor on the .w sign bit???)
 ///         - as .r is now negative (and will be the -only- negative value after a dp),
 ///           can do the rest of this w/ just a dp op + floating point compare
 ///     - need to convert p to a vec4 w/ .w = 0, but... if stack is laid out 
 ///         correctly (and we have a 32-bit zero value in the .w position following the vec3),
 ///         then we -should- be able to just load it as a vec4 w/ .w = 0 :)
 ///     - that said if we're just passing in 2 pointers instead of 32 bytes of stack memory,
 ///         then it would probably be prudent to just mask out the .w value w/ neon instructions
 /// sidenote:
 ///     if we're regularly treating vec3s as masked out vec4s, then vec3s -must- always
 ///     have sufficient padding to NOT potentially cause a segmentation fault.
 ///     for structs, this could be done by always placing vec3 values at the front or center
 ///     of the struct, ie. don't ever place a vec3 as the last member of a struct or class,
 ///     or if you do, add padding
 bool neon_check_sphere_collision(Sphere s, vec3 p, float __pad = 0) {
    static const vec4 sign_flip = vec4(1, 1, 1, -1);
    auto a = (cast(vec4*)(&s)).vp_neon_load;
    auto b = (cast(vec4*)(&p)).vp_neon_load;
    assert(b.w == 0);

    // (s.x - p.x, s.y - p.y, s.z - p.z, s.r)
    auto d = a.vp_neon_sub(b);
    
    // (d.x, d.y, d.z, -d.r)
    auto d2 = d.vp_neon_mul((&sign_flip).vp_neon_load);
    
    // d.x ** 2 + d.y ** 2 + d.z ** 2 - d.r ** 2 <= 0
    // ie. dist2(s.xyz, p.xyz) <= s.r * s.r
    return vp_neon_dp(d, d2) <= 0;
 }

 /// somewhat brilliant: this translates perfectly well to sphere-sphere checks as well!! :)
 bool neon_check_sphere_collision(Sphere s1, Sphere s2) {
    auto a = s1.vp_neon_load;
    auto b = s2.vp_neon_load;

    /// flip .r sign on the second argument so we're adding s1.r + s2.r instead of subtracting
    b.flip_w_sign();

    /// (s1.x - s2.x, s1.y - s2.y, s1.z - s2.z, s1.r + s2.r)
    auto d = vp_neon_sub(a, b);

    /// (d.x, d.y, d.z, -d.r)
    auto d2 = d;
    d2.flip_w_sign();

    /// d.x ** 2 + d.y ** 2 + d.z ** 2 - (s1.r + s2.r) ** 2 <= 0
    // e. dist2(s1.xyz, s2.xyz) <= (s1.r + s2.r) ** 2
    return vp_neon_dp(d, d2) <= 0;
 }

 /// note that in principle this should be quite a bit faster than AABB tests,
 /// is short-circuiting but -many- more CPU instructions + branches... unless that could
 /// be accelerated?

 struct AABB3 {
    vec3 a;
    vec3 b;
 }

 bool check_aabb_point_intersection(AABB3 bounds, vec3 p) {
    return !(
        p.x < bounds.a.x || p.x > bounds.b.x ||
        p.y < bounds.a.y || p.y > bounds.b.y ||
        p.z < bounds.a.z || p.z > bounds.b.z
    );
 }
 bool check_aabb_aabb_intersection(AABB3 b1, AABB b2) {
    return !(
        b2.b.x < b1.a.x || b2.a.x > b1.b.x ||
        b2.b.y < b1.a.y || b2.a.y > b1.b.y ||
        b2.b.z < b1.a.z || b2.a.z > b1.b.z
    );
 }

 bool check_aabb_intersection2(vec4 a, vec4 b, vec4 c, vec4 d) {
    return !(
        d.x < a.x || b.x < c.x ||
        d.y < a.y || b.y < c.y ||
        d.z < a.z || b.z < c.z
    );
 }
 bool neon_check_aabb_intersection2(vec4 a, vec4 b, vec4 c, vec4 d) {
    auto r1 = d - a;
    auto r2 = b - c;
    return !(r1.any_negative || r2.any_negative);
 }

 bool neon_check_aabb_point_intersection(AABB3 bounds, vec3 p) {
    return !(
        (p - bounds.a).any_negative ||
        (bounds.b - p).any_negative
    );
 }
 bool neon_check_aabb_aabb_intersection(AABB3 b1, AABB b2) {
    return !(
        (b2.b - b1.a).any_negative ||
        (b1.b - b2.a).any_negative
    );
 }

 // so yes, this can also be accelerated :)
 // (assuming that an instruction to check / count signs exists, anyways...)

 // edit: ok, if nothing else:
 // - use neon sub ops to calculate t1 = b2.b - b1.a, t2 = b1.b - b2.a
 // - store back on the stack
 // - use mask ops to check for negative signs:
 //  i32 mask;
 //  for (int i = 6, i32* v = cast(i32*)&t1.x; i --> 0; ++v) {
 //      mask |= *v;
 //  }
 //  return !(mask & F32_SIGN_BIT);
 //
 // should... probably look up IEEE spec to see if that's even valid / works
 // (what about +/- infinity, NAN, etc)

 // ALSO note that w/ 256bit SIMD and popcnt, we can do an AABB-AABB test w/ 3 simd ops!!!
 // (plus a bunch of memory shuffling to get values laid out correctly, which may / may not make this worth it...)
 bool check_aabb_aabb_test (AABB3 b1, AABB3 b2) {
    float[8] v1;
    float[8] v2;
    
    *(cast(vec3*)(&v1[0])) = b2.b;
    *(cast(vec3*)(&v1[3])) = b1.a;
    *(cast(vec2*)(&v1[6])) = vec2(0, 0);
    
    *(cast(vec3*)(&v2[0])) = b1.b;
    *(cast(vec3*)(&v2[3])) = b2.a;
    *(cast(vec2*)(&v2[6])) = vec2(0, 0);
    
    auto s = v1.load.sub(v2.load);
    auto m = load_f8(F32_SIGN_BIT);
    
    return !(popcnt(s & m) == 0);
 }


 // remaining question: OBBs (and what is the optimal OBB rotation type -
 //   quaternions, euler angles, normalized vectors...?)
	/// fast sphere collision checks w/ 128-bit SIMD
	/// (d pseudocode; all these intrinsics -should- exist, however,
	/// and ofc neon_check_sphere_collision could just be written
	/// in armv8 assembly or whatever)

	float dist2 (vec3 a, vec3 b) {
	return (a - b).dot;
	}
	float dot (vec3 a) {
	return a.x + a.y + a.z;
	}
	bool check_sphere_collision(vec3 c, float r, vec3 p) {
	return dist2(c, p) <= r * r;
	}

	neon_vec4 vp_neon_load (vec4*);
	float vp_neon_dp (neon_vec4 a, neon_vec4 b);
	neon_vec4 vp_neon_mul (neon_vec4 a, neon_vec4 b);
	neon_vec4 vp_neon_sub (neon_vec4 a, neon_vec4 b);

	/// can treat spheres as vec4s!!!!! :))))
	struct Sphere { float x, y, z, r; }

	/// idea:
	/// - do sphere distance -and- radius check w/ a dot product + sub op
	/// - subtract to get dist btw two points
	/// - flip the .r value (adds a mul op w/ a vec4 constant - alternatively could you just do a xor on the .w sign bit???)
	/// - as .r is now negative (and will be the -only- negative value after a dp),
	/// can do the rest of this w/ just a dp op + floating point compare
	/// - need to convert p to a vec4 w/ .w = 0, but... if stack is laid out
	/// correctly (and we have a 32-bit zero value in the .w position following the vec3),
	/// then we -should- be able to just load it as a vec4 w/ .w = 0 :)
	/// - that said if we're just passing in 2 pointers instead of 32 bytes of stack memory,
	/// then it would probably be prudent to just mask out the .w value w/ neon instructions
	/// sidenote:
	/// if we're regularly treating vec3s as masked out vec4s, then vec3s -must- always
	/// have sufficient padding to NOT potentially cause a segmentation fault.
	/// for structs, this could be done by always placing vec3 values at the front or center
	/// of the struct, ie. don't ever place a vec3 as the last member of a struct or class,
	/// or if you do, add padding
	bool neon_check_sphere_collision(Sphere s, vec3 p, float __pad = 0) {
	static const vec4 sign_flip = vec4(1, 1, 1, -1);
	auto a = (cast(vec4*)(&s)).vp_neon_load;
	auto b = (cast(vec4*)(&p)).vp_neon_load;
	assert(b.w == 0);

	// (s.x - p.x, s.y - p.y, s.z - p.z, s.r)
	auto d = a.vp_neon_sub(b);

	// (d.x, d.y, d.z, -d.r)
	auto d2 = d.vp_neon_mul((&sign_flip).vp_neon_load);

	// d.x 2 + d.y 2 + d.z 2 - d.r 2 <= 0
	// ie. dist2(s.xyz, p.xyz) <= s.r * s.r
	return vp_neon_dp(d, d2) <= 0;
	}

	/// somewhat brilliant: this translates perfectly well to sphere-sphere checks as well!! :)
	bool neon_check_sphere_collision(Sphere s1, Sphere s2) {
	auto a = s1.vp_neon_load;
	auto b = s2.vp_neon_load;

	/// flip .r sign on the second argument so we're adding s1.r + s2.r instead of subtracting
	b.flip_w_sign();

	/// (s1.x - s2.x, s1.y - s2.y, s1.z - s2.z, s1.r + s2.r)
	auto d = vp_neon_sub(a, b);

	/// (d.x, d.y, d.z, -d.r)
	auto d2 = d;
	d2.flip_w_sign();

	/// d.x 2 + d.y 2 + d.z 2 - (s1.r + s2.r) 2 <= 0
	// e. dist2(s1.xyz, s2.xyz) <= (s1.r + s2.r) ** 2
	return vp_neon_dp(d, d2) <= 0;
	}

	/// note that in principle this should be quite a bit faster than AABB tests,
	/// is short-circuiting but -many- more CPU instructions + branches... unless that could
	/// be accelerated?

	struct AABB3 {
	vec3 a;
	vec3 b;
	}

	bool check_aabb_point_intersection(AABB3 bounds, vec3 p) {
	return !(
	p.x < bounds.a.x \|\| p.x > bounds.b.x \|\|
	p.y < bounds.a.y \|\| p.y > bounds.b.y \|\|
	p.z < bounds.a.z \|\| p.z > bounds.b.z
	);
	}
	bool check_aabb_aabb_intersection(AABB3 b1, AABB b2) {
	return !(
	b2.b.x < b1.a.x \|\| b2.a.x > b1.b.x \|\|
	b2.b.y < b1.a.y \|\| b2.a.y > b1.b.y \|\|
	b2.b.z < b1.a.z \|\| b2.a.z > b1.b.z
	);
	}

	bool check_aabb_intersection2(vec4 a, vec4 b, vec4 c, vec4 d) {
	return !(
	d.x < a.x \|\| b.x < c.x \|\|
	d.y < a.y \|\| b.y < c.y \|\|
	d.z < a.z \|\| b.z < c.z
	);
	}
	bool neon_check_aabb_intersection2(vec4 a, vec4 b, vec4 c, vec4 d) {
	auto r1 = d - a;
	auto r2 = b - c;
	return !(r1.any_negative \|\| r2.any_negative);
	}

	bool neon_check_aabb_point_intersection(AABB3 bounds, vec3 p) {
	return !(
	(p - bounds.a).any_negative \|\|
	(bounds.b - p).any_negative
	);
	}
	bool neon_check_aabb_aabb_intersection(AABB3 b1, AABB b2) {
	return !(
	(b2.b - b1.a).any_negative \|\|
	(b1.b - b2.a).any_negative
	);
	}

	// so yes, this can also be accelerated :)
	// (assuming that an instruction to check / count signs exists, anyways...)

	// edit: ok, if nothing else:
	// - use neon sub ops to calculate t1 = b2.b - b1.a, t2 = b1.b - b2.a
	// - store back on the stack
	// - use mask ops to check for negative signs:
	// i32 mask;
	// for (int i = 6, i32* v = cast(i32*)&t1.x; i --> 0; ++v) {
	// mask \|= *v;
	// }
	// return !(mask & F32_SIGN_BIT);
	//
	// should... probably look up IEEE spec to see if that's even valid / works
	// (what about +/- infinity, NAN, etc)

	// ALSO note that w/ 256bit SIMD and popcnt, we can do an AABB-AABB test w/ 3 simd ops!!!
	// (plus a bunch of memory shuffling to get values laid out correctly, which may / may not make this worth it...)
	bool check_aabb_aabb_test (AABB3 b1, AABB3 b2) {
	float[8] v1;
	float[8] v2;

	(cast(vec3)(&v1[0])) = b2.b;
	(cast(vec3)(&v1[3])) = b1.a;
	(cast(vec2)(&v1[6])) = vec2(0, 0);

	(cast(vec3)(&v2[0])) = b1.b;
	(cast(vec3)(&v2[3])) = b2.a;
	(cast(vec2)(&v2[6])) = vec2(0, 0);

	auto s = v1.load.sub(v2.load);
	auto m = load_f8(F32_SIGN_BIT);

	return !(popcnt(s & m) == 0);
	}


	// remaining question: OBBs (and what is the optimal OBB rotation type -
	// quaternions, euler angles, normalized vectors...?)