Last active
October 18, 2017 13:10
-
-
Save Triang3l/e594a7cde9acce45db3f4b36ad7abfcf to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <DirectXMath.h>
/// Transforms a 4-component vector by a matrix whose rows are dotted with the vector.
///
/// Lane i of the result is dot(m.r[i], v), i.e. the same value as
/// DirectX::XMVector4Transform(v, DirectX::XMMatrixTranspose(m)).
///
/// @param v Vector to transform.
/// @param m Matrix stored so that each m.r[i] is one row to be dotted with v.
/// @return  (dot(r0, v), dot(r1, v), dot(r2, v), dot(r3, v)).
inline DirectX::XMVECTOR Vector4TransformRM(DirectX::XMVECTOR v, const DirectX::XMMATRIX &m) {
    // XMVector4Transform can't be used directly: it accumulates v.x * r0 + v.y * r1 + ... (mul, mad),
    // while this layout needs a dot product of every row with v.
#if defined(_XM_SSE_INTRINSICS_)
    // XMVector4Transform uses 11 instructions, and XMMatrixTranspose uses 8. This uses 15
    // (8 shuffles, 4 multiplies, 3 adds).
    //
    // _MM_SHUFFLE(3, 2, 1, 0) on (a, b) keeps lanes 0-1 of a and lanes 2-3 of b, so each of the
    // four vectors below mixes the XY half of one row with the ZW half of its neighbor.

    // r0x | r0y | r1z | r1w
    const DirectX::XMVECTOR r0xy_r1zw = _mm_shuffle_ps(m.r[0], m.r[1], _MM_SHUFFLE(3, 2, 1, 0));
    // r1x | r1y | r0z | r0w
    const DirectX::XMVECTOR r1xy_r0zw = _mm_shuffle_ps(m.r[1], m.r[0], _MM_SHUFFLE(3, 2, 1, 0));
    // r2x | r2y | r3z | r3w
    const DirectX::XMVECTOR r2xy_r3zw = _mm_shuffle_ps(m.r[2], m.r[3], _MM_SHUFFLE(3, 2, 1, 0));
    // r3x | r3y | r2z | r2w
    const DirectX::XMVECTOR r3xy_r2zw = _mm_shuffle_ps(m.r[3], m.r[2], _MM_SHUFFLE(3, 2, 1, 0));

    // First half: the XY terms of rows 0 and 2, the ZW terms of rows 1 and 3.
    // vx * r0x | vy * r0y | vz * r1z | vw * r1w
    const DirectX::XMVECTOR prod01 = _mm_mul_ps(v, r0xy_r1zw);
    // vx * r2x | vy * r2y | vz * r3z | vw * r3w
    const DirectX::XMVECTOR prod23 = _mm_mul_ps(v, r2xy_r3zw);
    // vx * r0x | vz * r1z | vx * r2x | vz * r3z
    const DirectX::XMVECTOR evenA = _mm_shuffle_ps(prod01, prod23, _MM_SHUFFLE(2, 0, 2, 0));
    // vy * r0y | vw * r1w | vy * r2y | vw * r3w
    const DirectX::XMVECTOR oddA = _mm_shuffle_ps(prod01, prod23, _MM_SHUFFLE(3, 1, 3, 1));
    // r0 (xy) | r1 (zw) | r2 (xy) | r3 (zw) partial sums.
    const DirectX::XMVECTOR halfA = _mm_add_ps(evenA, oddA);

    // Second half: the complementary terms, shuffled so that they land in the matching row lanes.
    // vx * r1x | vy * r1y | vz * r0z | vw * r0w
    const DirectX::XMVECTOR prod10 = _mm_mul_ps(v, r1xy_r0zw);
    // vx * r3x | vy * r3y | vz * r2z | vw * r2w
    const DirectX::XMVECTOR prod32 = _mm_mul_ps(v, r3xy_r2zw);
    // vz * r0z | vx * r1x | vz * r2z | vx * r3x
    const DirectX::XMVECTOR evenB = _mm_shuffle_ps(prod10, prod32, _MM_SHUFFLE(0, 2, 0, 2));
    // vw * r0w | vy * r1y | vw * r2w | vy * r3y
    const DirectX::XMVECTOR oddB = _mm_shuffle_ps(prod10, prod32, _MM_SHUFFLE(1, 3, 1, 3));
    // r0 (zw) | r1 (xy) | r2 (zw) | r3 (xy) partial sums.
    const DirectX::XMVECTOR halfB = _mm_add_ps(evenB, oddB);

    // Every lane now combines its xy and zw partial sums into the full 4-term dot product.
    return _mm_add_ps(halfA, halfB);
#else
#pragma message("Using XMMatrixTranspose and XMVector4Transform for Vector4Transform on this platform.")
    return DirectX::XMVector4Transform(v, DirectX::XMMatrixTranspose(m));
#endif
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment