Last active
November 1, 2018 08:02
-
-
Save recp/8ccc5ad0d19f5516de55f9bf7b5045b2 to your computer and use it in GitHub Desktop.
mat4_inv AVX
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CGLM_INLINE | |
void | |
glm_mat4_inv_avx(mat4 mat, mat4 dest) { | |
__m256 y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13; | |
__m256 yt0, yt1, yt2; | |
__m256 t0, t1, t2; | |
__m256 r1, r2; | |
__m256 flpsign; | |
__m256i yi1, yi2, yi3; | |
y0 = glmm_load256(mat[0]); /* h g f e d c b a */ | |
y1 = glmm_load256(mat[2]); /* p o n m l k j i */ | |
y2 = _mm256_permute2f128_ps(y1, y1, 0x00); /* l k j i l k j i */ | |
y3 = _mm256_permute2f128_ps(y1, y1, 0x11); /* p o n m p o n m */ | |
y4 = _mm256_permute2f128_ps(y0, y0, 0x03); /* d c b a h g f e */ | |
y13 = _mm256_permute2f128_ps(y4, y4, 0x00); /* h g f e h g f e */ | |
yi1 = _mm256_set_epi32(0, 0, 0, 0, 0, 1, 1, 2); | |
yi2 = _mm256_set_epi32(1, 1, 1, 2, 3, 2, 3, 3); | |
flpsign = _mm256_set_ps(0.f, -0.f, 0.f, -0.f, -0.f, 0.f, -0.f, 0.f); | |
/* i i i i i j j k */ | |
/* n n n o p o p p */ | |
/* m m m m m n n o */ | |
/* j j j k l k l l */ | |
/* e e e e e f f g */ | |
/* f f f g h g h h */ | |
y5 = _mm256_permutevar_ps(y2, yi1); | |
y6 = _mm256_permutevar_ps(y3, yi2); | |
y7 = _mm256_permutevar_ps(y3, yi1); | |
y8 = _mm256_permutevar_ps(y2, yi2); | |
y2 = _mm256_permutevar_ps(y13, yi1); | |
y3 = _mm256_permutevar_ps(y13, yi2); | |
yi1 = _mm256_set_epi32(2, 1, 0, 0, 2, 1, 0, 0); | |
yi2 = _mm256_set_epi32(2, 1, 1, 0, 2, 1, 1, 0); | |
yi3 = _mm256_set_epi32(3, 3, 2, 0, 3, 3, 2, 0); | |
/* | |
t0[0] = k * p - o * l; t1[0] = g * p - o * h; t2[0] = g * l - k * h; | |
t0[1] = j * p - n * l; t1[1] = f * p - n * h; t2[1] = f * l - j * h; | |
t0[2] = j * o - n * k; t1[2] = f * o - n * g; t2[2] = f * k - j * g; | |
t0[3] = i * p - m * l; t1[3] = e * p - m * h; t2[3] = e * l - i * h; | |
t0[4] = i * o - m * k; t1[4] = e * o - m * g; t2[4] = e * k - i * g; | |
t0[5] = i * n - m * j; t1[5] = e * n - m * f; t2[5] = e * j - i * f; | |
*/ | |
yt0 = _mm256_sub_ps(_mm256_mul_ps(y5, y6), _mm256_mul_ps(y7, y8)); | |
yt1 = _mm256_sub_ps(_mm256_mul_ps(y2, y6), _mm256_mul_ps(y7, y3)); | |
yt2 = _mm256_sub_ps(_mm256_mul_ps(y2, y8), _mm256_mul_ps(y5, y3)); | |
/* t3 t2 t1 t0 t3 t2 t1 t0 */ | |
/* t5 t5 t5 t4 t5 t5 t5 t4 */ | |
y9 = _mm256_permute2f128_ps(yt0, yt0, 0x00); | |
y10 = _mm256_permute2f128_ps(yt0, yt0, 0x11); | |
/* t2 t1 t0 t0 t2 t1 t0 t0 */ | |
t0 = _mm256_permutevar_ps(y9, yi1); | |
/* t4 t3 t3 t1 t4 t3 t3 t1 */ | |
y11 = _mm256_shuffle_ps(y9, y10, 0x4D); | |
y12 = _mm256_permutevar_ps(y11, yi2); | |
t1 = _mm256_permute2f128_ps(y12, y9, 0x00); | |
/* t5 t5 t4 t2 t5 t5 t4 t2 */ | |
y11 = _mm256_shuffle_ps(y9, y10, 0x4A); | |
y12 = _mm256_permutevar_ps(y11, yi3); | |
t2 = _mm256_permute2f128_ps(y12, y12, 0x00); | |
/* a a a b e e e f */ | |
/* b b c c f f g g */ | |
/* c d d d g h h h */ | |
y9 = _mm256_permute_ps(y4, 0x01); | |
y10 = _mm256_permute_ps(y4, 0x5A); | |
y11 = _mm256_permute_ps(y4, 0xBF); | |
/* | |
dest[0][0] = f * t[0] - g * t[1] + h * t[2]; | |
dest[1][0] =-(e * t[0] - g * t[3] + h * t[4]); | |
dest[2][0] = e * t[1] - f * t[3] + h * t[5]; | |
dest[3][0] =-(e * t[2] - f * t[4] + g * t[5]); | |
dest[0][1] =-(b * t[0] - c * t[1] + d * t[2]); | |
dest[1][1] = a * t[0] - c * t[3] + d * t[4]; | |
dest[2][1] =-(a * t[1] - b * t[3] + d * t[5]); | |
dest[3][1] = a * t[2] - b * t[4] + c * t[5]; | |
*/ | |
r1 = _mm256_xor_ps(_mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(y9, t0), | |
_mm256_mul_ps(y10, t1)), | |
_mm256_mul_ps(y11, t2)), | |
flpsign); | |
/* d c b a d c b a */ | |
y2 = _mm256_permute2f128_ps(y0, y0, 0x0); | |
/* a a a b a a a b */ | |
/* b b c c b b c c */ | |
/* c d d d c d d d */ | |
y3 = _mm256_permutevar_ps(y2, _mm256_set_epi32(0, 0, 0, 1, 0, 0, 0, 1)); | |
y4 = _mm256_permutevar_ps(y2, _mm256_set_epi32(1, 1, 2, 2, 1, 1, 2, 2)); | |
y5 = _mm256_permutevar_ps(y2, _mm256_set_epi32(2, 3, 3, 3, 2, 3, 3, 3)); | |
/* t2[3] t2[2] t2[1] t2[0] t1[3] t1[2] t1[1] t1[0] */ | |
/* t2[5] t2[5] t2[5] t2[4] t1[5] t1[5] t1[5] t1[4] */ | |
y6 = _mm256_permute2f128_ps(yt1, yt2, 0x20); | |
y7 = _mm256_permute2f128_ps(yt1, yt2, 0x31); | |
/* t2[2] t2[1] t2[0] t2[0] t1[2] t1[1] t1[0] t1[0] */ | |
t0 = _mm256_permutevar_ps(y6, yi1); | |
/* t1[4] t1[3] t1[3] t1[1] t1[4] t1[3] t1[3] t1[1] */ | |
/* t1[4] t1[3] t1[3] t1[1] t1[4] t1[3] t1[3] t1[1] */ | |
y11 = _mm256_shuffle_ps(y6, y7, 0x4D); | |
t1 = _mm256_permutevar_ps(y11, yi2); | |
/* t2[5] t2[5] t2[4] t2[2] t1[5] t1[5] t1[4] t1[2] */ | |
y11 = _mm256_shuffle_ps(y6, y7, 0x4A); | |
t2 = _mm256_permutevar_ps(y11, yi3); | |
/* | |
dest[0][2] = b * t1[0] - c * t1[1] + d * t1[2]; | |
dest[1][2] =-(a * t1[0] - c * t1[3] + d * t1[4]); | |
dest[2][2] = a * t1[1] - b * t1[3] + d * t1[5]; | |
dest[3][2] =-(a * t1[2] - b * t1[4] + c * t1[5]); | |
dest[0][3] =-(b * t2[0] - c * t2[1] + d * t2[2]); | |
dest[1][3] = a * t2[0] - c * t2[3] + d * t2[4]; | |
dest[2][3] =-(a * t2[1] - b * t2[3] + d * t2[5]); | |
dest[3][3] = a * t2[2] - b * t2[4] + c * t2[5]; | |
*/ | |
r2 = _mm256_xor_ps(_mm256_add_ps(_mm256_sub_ps(_mm256_mul_ps(y3, t0), | |
_mm256_mul_ps(y4, t1)), | |
_mm256_mul_ps(y5, t2)), | |
flpsign); | |
/* determinant */ | |
y4 = _mm256_dp_ps(y0, r1, 0xff); | |
y4 = _mm256_permute2f128_ps(y4, y4, 0x00); | |
y5 = _mm256_div_ps(_mm256_set1_ps(1.0f), y4); | |
r1 = _mm256_mul_ps(r1, y5); | |
r2 = _mm256_mul_ps(r2, y5); | |
/* transpose */ | |
/* d c b a h g f e */ | |
/* l k j i p o n m */ | |
y0 = _mm256_permute2f128_ps(r1, r1, 0x03); | |
y1 = _mm256_permute2f128_ps(r2, r2, 0x03); | |
/* b a f e f e b a */ | |
/* j i n m n m j i */ | |
/* i m a e m i e a */ | |
/* j n b f n j f b */ | |
/* n j f b m i e a */ | |
y2 = _mm256_shuffle_ps(r1, y0, 0x44); | |
y3 = _mm256_shuffle_ps(r2, y1, 0x44); | |
y4 = _mm256_shuffle_ps(y2, y3, 0x88); | |
y5 = _mm256_shuffle_ps(y2, y3, 0xDD); | |
y6 = _mm256_permute2f128_ps(y4, y5, 0x20); | |
/* d c h g h g d c */ | |
/* l k p o p o l k */ | |
/* k o c g o k g c */ | |
/* l p d h p l h d */ | |
/* p l h d o k g c */ | |
y2 = _mm256_shuffle_ps(r1, y0, 0xEE); | |
y3 = _mm256_shuffle_ps(r2, y1, 0xEE); | |
y4 = _mm256_shuffle_ps(y2, y3, 0x88); | |
y5 = _mm256_shuffle_ps(y2, y3, 0xDD); | |
y7 = _mm256_permute2f128_ps(y4, y5, 0x20); | |
glmm_store256(dest[0], y6); | |
glmm_store256(dest[2], y7); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment