Skip to content

Instantly share code, notes, and snippets.

@syoyo
Created September 27, 2009 15:13
Show Gist options
  • Save syoyo/194813 to your computer and use it in GitHub Desktop.
Save syoyo/194813 to your computer and use it in GitHub Desktop.
//
// SIMD ray-triangle intersection using ARM NEON instructions.
//
// Compile:
// CC=/Developer/Platforms/iPhoneOS.platform/Developer/usr/bin/arm-apple-darwin9-gcc-4.2.1
// CFLAGS=-march=armv7-a -mtune=cortex-a8 -mfloat-abi=softfp -mfpu=neon
// INCLUDES=-I/Developer/Platforms/iPhoneOS.platform/Developer/usr/lib/gcc/arm-apple-darwin9/4.2.1/include
// OPTFLAGS=-O3
//
// Copyright 2009. Syoyo Fujita.
//
#include <arm_neon.h>
// Short aliases for the 4-lane NEON vector types used throughout this file.
#define float4 float32x4_t
#define int4 int32x4_t
#define uint4 uint32x4_t
// Broadcast a scalar float into all four lanes.
#define vset_f4(x) vdupq_n_f32((x))
// Per-lane absolute value.
#define fabs_f4(x) vabsq_f32((x))
// Approximate per-lane division x / y.
//
// NEON (ARMv7) has no vector divide. vrecpeq_f32 gives a coarse estimate of
// 1/y, and each vrecpsq_f32(y, r) returns (2 - y * r), the correction factor
// of one Newton-Raphson iteration r' = r * (2 - y * r). Two iterations are
// applied before multiplying by the numerator.
//
// Note: vrecpsq_f32 on its own is NOT a division; it is only the refinement
// step, so it must not be called directly as (numerator, denominator).
static inline float4
vdiv_f4_newton(float4 x, float4 y)
{
    float4 r = vrecpeq_f32(y);
    r = vmulq_f32(r, vrecpsq_f32(y, r));
    r = vmulq_f32(r, vrecpsq_f32(y, r));
    return vmulq_f32(x, r);
}
#define vdiv_f4(x, y) vdiv_f4_newton((x), (y))
// A packet of rays stored component-wise: each member is a float4, so one
// lane index across all members describes one ray.
typedef struct _ray4_t {
    // isect4 subtracts the triangle's v0 from these (ray origin x/y/z).
    float4 rox;
    float4 roy;
    float4 roz;
    // isect4 crosses these with the triangle edge e2 (ray direction x/y/z).
    float4 rdx;
    float4 rdy;
    float4 rdz;
    // Upper bound on the hit parameter: isect4 only accepts hits with t < ray.t.
    float4 t;
} ray4_t;
// A packet of triangles stored component-wise: each member is a float4, so
// one lane index across all members describes one triangle.
typedef struct _triangle4_t {
    // Base vertex; isect4 subtracts it from the ray origin.
    float4 v0x;
    float4 v0y;
    float4 v0z;
    // First edge vector (presumably v1 - v0; confirm against the code that
    // fills this struct).
    float4 e1x;
    float4 e1y;
    float4 e1z;
    // Second edge vector (presumably v2 - v0; confirm against the code that
    // fills this struct).
    float4 e2x;
    float4 e2y;
    float4 e2z;
} triangle4_t;
// A single (non-SIMD) triangle: three rows of four floats, one row per
// vertex laid out as (x, y, z, w).
typedef struct _triangle_t {
    float v[3][4]; // (x,y,z,w) * 3
} triangle_t;
// One component of a cross product, evaluated per lane: a * c - b * d.
// The caller picks which vector components to pass for a, b, c and d.
static inline float4 mycross4(float4 a, float4 b, float4 c, float4 d)
{
    return (a * c) - (b * d);
}
// Per-lane 3D dot product of (ax, ay, az) and (bx, by, bz).
static inline float4 mydot4(float4 ax, float4 ay, float4 az,
                            float4 bx, float4 by, float4 bz)
{
    return ax * bx + ay * by + az * bz;
}
//
// Intersect a packet of rays with a packet of triangles, one ray/triangle
// pair per SIMD lane, using the Moller-Trumbore formulation:
//
//   p   = rd x e2          det = e1 . p
//   s   = ro - v0          u   = (s  . p) / det
//   q   = s  x e1          v   = (rd . q) / det
//                          t   = (e2 . q) / det
//
// Parameters:
//   t_out, u_out, v_out  Receive the per-lane hit parameter t and the u, v
//                        values computed above. They are written for every
//                        lane, including lanes that miss; only lanes whose
//                        mask is set hold a meaningful result.
//   ray                  Ray packet; ray.t is the exclusive upper bound on t.
//   tri                  Triangle packet (base vertex v0 and edges e1, e2).
//
// Returns:
//   Per-lane mask, all bits set in a lane when
//     |det| > 1e-6,  u > 0,  v > 0,  u + v < 1,  0 < t < ray.t
//   and zero otherwise. Using |det| means both triangle orientations are
//   accepted.
//
extern uint4
isect4(
    float4 *t_out,
    float4 *u_out,
    float4 *v_out,
    ray4_t ray,
    triangle4_t tri)
{
    // p = rd x e2
    const float4 px = mycross4(tri.e2z, tri.e2y, ray.rdy, ray.rdz);
    const float4 py = mycross4(tri.e2x, tri.e2z, ray.rdz, ray.rdx);
    const float4 pz = mycross4(tri.e2y, tri.e2x, ray.rdx, ray.rdy);

    // s = ro - v0
    const float4 sx = ray.rox - tri.v0x;
    const float4 sy = ray.roy - tri.v0y;
    const float4 sz = ray.roz - tri.v0z;

    const float4 vone = vset_f4(1.0f);
    const float4 vzero = vset_f4(0.0f);
    const float4 veps = vset_f4(1.0e-6f);

    // det = e1 . p; vdiv_f4(vone, det) is expected to give 1 / det per lane.
    const float4 det = mydot4(px, py, pz, tri.e1x, tri.e1y, tri.e1z);
    const float4 invdet = vdiv_f4(vone, det);

    // q = s x e1. The z component is e1y * sx - e1x * sy; it mirrors the
    // argument pattern used for pz above.
    const float4 qx = mycross4(tri.e1z, tri.e1y, sy, sz);
    const float4 qy = mycross4(tri.e1x, tri.e1z, sz, sx);
    const float4 qz = mycross4(tri.e1y, tri.e1x, sx, sy);

    const float4 u = mydot4(sx, sy, sz, px, py, pz) * invdet;
    const float4 v = mydot4(ray.rdx, ray.rdy, ray.rdz, qx, qy, qz) * invdet;
    const float4 t = mydot4(tri.e2x, tri.e2y, tri.e2z, qx, qy, qz) * invdet;

    // Each comparison yields an all-ones lane when true, zero when false.
    uint4 mask0 = vcgtq_f32(fabs_f4(det), veps); // reject near-zero det
    uint4 mask1 = vcltq_f32((u + v), vone);      // u + v < 1
    uint4 mask2 = vcgtq_f32(u, vzero);           // u > 0
    uint4 mask3 = vcgtq_f32(v, vzero);           // v > 0
    uint4 mask4 = vcgtq_f32(t, vzero);           // hit lies in front of origin
    uint4 mask5 = vcltq_f32(t, ray.t);           // hit is nearer than ray.t
    uint4 mask = mask0 & mask1 & mask2 & mask3 & mask4 & mask5;

    (*t_out) = t;
    (*u_out) = u;
    (*v_out) = v;

    return mask;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment