Created
September 27, 2009 15:13
-
-
Save syoyo/194813 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//
// SIMD ray-triangle intersection using ARM NEON instructions.
//
// Compile:
// CC=/Developer/Platforms/iPhoneOS.platform/Developer/usr/bin/arm-apple-darwin9-gcc-4.2.1
// CFLAGS=-march=armv7-a -mtune=cortex-a8 -mfloat-abi=softfp -mfpu=neon
// INCLUDES=-I/Developer/Platforms/iPhoneOS.platform/Developer/usr/lib/gcc/arm-apple-darwin9/4.2.1/include
// OPTFLAGS=-O3
//
// Copyright 2009. Syoyo Fujita.
//
#include <arm_neon.h> | |
// Short aliases for the 4-lane NEON vector types.
#define float4 float32x4_t
#define int4 int32x4_t
#define uint4 uint32x4_t

// Broadcast a scalar float to all four lanes.
#define vset_f4(x) vdupq_n_f32((x))
// Per-lane absolute value.
#define fabs_f4(x) vabsq_f32((x))

// Per-lane approximate division x / y.
//
// ARMv7 NEON has no vector divide, so the quotient is formed as
// x * (1 / y). vrecpeq_f32 supplies a low-precision reciprocal estimate and
// each vrecpsq_f32(y, r) call returns the Newton-Raphson correction factor
// (2 - y * r); two refinement steps bring the result close to single
// precision. The result is not guaranteed to be correctly rounded.
//
// NOTE: vrecpsq_f32 on its own is only the refinement step, not a division.
static inline float32x4_t
vdiv_f4_nr(float32x4_t x, float32x4_t y)
{
    float32x4_t r = vrecpeq_f32(y);
    r = vmulq_f32(r, vrecpsq_f32(y, r));
    r = vmulq_f32(r, vrecpsq_f32(y, r));
    return vmulq_f32(x, r);
}
#define vdiv_f4(x, y) vdiv_f4_nr((x), (y))
// Packet of four rays stored component-wise: each float4 holds the same
// component for four different rays (one ray per SIMD lane).
typedef struct _ray4_t {
    // Ray origin components (presumably "ro" = ray origin).
    float4 rox;
    float4 roy;
    float4 roz;
    // Ray direction components (presumably "rd" = ray direction).
    float4 rdx;
    float4 rdy;
    float4 rdz;
    // Per-ray distance; isect4 accepts a hit only when its t is below this.
    float4 t;
} ray4_t;
// Packet of four triangles stored component-wise (one triangle per SIMD
// lane), described by one vertex and two edge vectors.
typedef struct _triangle4_t {
    // Base vertex v0.
    float4 v0x;
    float4 v0y;
    float4 v0z;
    // First edge vector (presumably v1 - v0; confirm against the code that
    // fills this structure).
    float4 e1x;
    float4 e1y;
    float4 e1z;
    // Second edge vector (presumably v2 - v0; confirm as above).
    float4 e2x;
    float4 e2y;
    float4 e2z;
} triangle4_t;
// Single (scalar) triangle: three vertices, each stored as four floats.
typedef struct _triangle_t {
    float v[3][4]; // 3 vertices, each laid out as (x, y, z, w)
} triangle_t;
// Per-lane (a * c) - (b * d).
//
// This is the building block for one component of a 3D cross product; the
// caller chooses which vector components to pass in.
static inline float4
mycross4(float4 a, float4 b, float4 c, float4 d)
{
    const float4 ac = a * c;
    const float4 bd = b * d;

    return ac - bd;
}
// Per-lane 3D dot product of (ax, ay, az) and (bx, by, bz).
//
// The products are summed in x, y, z order, i.e. (x + y) + z.
static inline float4
mydot4(float4 ax, float4 ay, float4 az, float4 bx, float4 by, float4 bz)
{
    const float4 xx = ax * bx;
    const float4 yy = ay * by;
    const float4 zz = az * bz;

    return (xx + yy) + zz;
}
// Intersect four rays with four triangles, one ray/triangle pair per SIMD
// lane, using the Moller-Trumbore formulation.
//
// Parameters:
//   t_out  receives the per-lane hit distance along the ray.
//   u_out  receives the per-lane first barycentric coordinate.
//   v_out  receives the per-lane second barycentric coordinate.
//   ray    packet of four rays; ray.t is the per-lane upper bound on t.
//   tri    packet of four triangles (base vertex plus two edge vectors).
//
// Returns a per-lane mask that is non-zero only for lanes satisfying
//   |det| > 1e-6, u > 0, v > 0, u + v < 1 and 0 < t < ray.t.
//
// t_out, u_out and v_out are written for every lane, including lanes that
// miss; only lanes selected by the returned mask hold meaningful values.
extern uint4
isect4(
    float4 *t_out,
    float4 *u_out,
    float4 *v_out,
    ray4_t ray,
    triangle4_t tri)
{
    const float4 vone = vset_f4(1.0f);
    const float4 vzero = vset_f4(0.0f);
    const float4 veps = vset_f4(1.0e-6f);

    // p = rd x e2
    const float4 px = mycross4(tri.e2z, tri.e2y, ray.rdy, ray.rdz);
    const float4 py = mycross4(tri.e2x, tri.e2z, ray.rdz, ray.rdx);
    const float4 pz = mycross4(tri.e2y, tri.e2x, ray.rdx, ray.rdy);

    // s = ro - v0
    const float4 sx = ray.rox - tri.v0x;
    const float4 sy = ray.roy - tri.v0y;
    const float4 sz = ray.roz - tri.v0z;

    // det = e1 . p. Lanes with a near-zero determinant yield an unusable
    // reciprocal here; they are rejected below by the |det| > eps test.
    const float4 det = mydot4(px, py, pz, tri.e1x, tri.e1y, tri.e1z);
    const float4 invdet = vdiv_f4(vone, det);

    // q = s x e1
    const float4 qx = mycross4(tri.e1z, tri.e1y, sy, sz);
    const float4 qy = mycross4(tri.e1x, tri.e1z, sz, sx);
    // z component is sx * e1y - sy * e1x (the second operand must be e1x).
    const float4 qz = mycross4(tri.e1y, tri.e1x, sx, sy);

    const float4 u = mydot4(sx, sy, sz, px, py, pz) * invdet;
    const float4 v = mydot4(ray.rdx, ray.rdy, ray.rdz, qx, qy, qz) * invdet;
    const float4 t = mydot4(tri.e2x, tri.e2y, tri.e2z, qx, qy, qz) * invdet;

    // Per-lane acceptance tests; each compare yields an all-ones lane when true.
    const uint4 det_ok = vcgtq_f32(fabs_f4(det), veps); // not parallel/degenerate
    const uint4 uv_ok = vcltq_f32((u + v), vone);       // u + v < 1
    const uint4 u_ok = vcgtq_f32(u, vzero);             // u > 0
    const uint4 v_ok = vcgtq_f32(v, vzero);             // v > 0
    const uint4 t_pos = vcgtq_f32(t, vzero);            // hit in front of origin
    const uint4 t_near = vcltq_f32(t, ray.t);           // closer than current bound

    const uint4 mask = det_ok & uv_ok & u_ok & v_ok & t_pos & t_near;

    (*t_out) = t;
    (*u_out) = u;
    (*v_out) = v;

    return mask;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment