Last active
July 31, 2016 08:11
-
-
Save chikuzen/e48222ad603d08270ea21e2ee7dfa3d4 to your computer and use it in GitHub Desktop.
Why is there no _MM_TRANSPOSE_8_PS macro?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <immintrin.h>
/**
 * Transpose an 8x8 block of floats using AVX.
 *
 * The source is read as eight consecutive rows of eight floats each
 * (srcp[0..63]); the result is written in the same layout to dstp[0..63],
 * so that dstp[r * 8 + c] == srcp[c * 8 + r].
 *
 * @param srcp  Source block of 64 floats. Every row is read with
 *              _mm256_load_ps, so the pointer must be 32-byte aligned.
 * @param dstp  Destination block of 64 floats. Every row is written with
 *              _mm256_store_ps, so the pointer must be 32-byte aligned.
 *
 * All eight rows are loaded before the first store is issued, so passing the
 * same buffer as both srcp and dstp performs an in-place transpose.
 */
void transpose_8x8_avx(const float* srcp, float* dstp) noexcept
{
    // Load the eight source rows (a..h), one 256-bit register per row.
    __m256 a = _mm256_load_ps(srcp +  0);   // a0 a1 a2 a3 a4 a5 a6 a7
    __m256 b = _mm256_load_ps(srcp +  8);   // b0 b1 b2 b3 b4 b5 b6 b7
    __m256 c = _mm256_load_ps(srcp + 16);   // c0 c1 c2 c3 c4 c5 c6 c7
    __m256 d = _mm256_load_ps(srcp + 24);   // d0 d1 d2 d3 d4 d5 d6 d7
    __m256 e = _mm256_load_ps(srcp + 32);   // e0 e1 e2 e3 e4 e5 e6 e7
    __m256 f = _mm256_load_ps(srcp + 40);   // f0 f1 f2 f3 f4 f5 f6 f7
    __m256 g = _mm256_load_ps(srcp + 48);   // g0 g1 g2 g3 g4 g5 g6 g7
    __m256 h = _mm256_load_ps(srcp + 56);   // h0 h1 h2 h3 h4 h5 h6 h7

    // Stage 1: interleave rows two apart (a/c, b/d, e/g, f/h). The unpack
    // instructions work independently on each 128-bit half, which is why
    // elements 0..3 and 4..7 stay in their own halves here.
    __m256 ac0145 = _mm256_unpacklo_ps(a, c);   // a0 c0 a1 c1 a4 c4 a5 c5
    __m256 ac2367 = _mm256_unpackhi_ps(a, c);   // a2 c2 a3 c3 a6 c6 a7 c7
    __m256 bd0145 = _mm256_unpacklo_ps(b, d);   // b0 d0 b1 d1 b4 d4 b5 d5
    __m256 bd2367 = _mm256_unpackhi_ps(b, d);   // b2 d2 b3 d3 b6 d6 b7 d7
    __m256 eg0145 = _mm256_unpacklo_ps(e, g);   // e0 g0 e1 g1 e4 g4 e5 g5
    __m256 eg2367 = _mm256_unpackhi_ps(e, g);   // e2 g2 e3 g3 e6 g6 e7 g7
    __m256 fh0145 = _mm256_unpacklo_ps(f, h);   // f0 h0 f1 h1 f4 h4 f5 h5
    __m256 fh2367 = _mm256_unpackhi_ps(f, h);   // f2 h2 f3 h3 f6 h6 f7 h7

    // Stage 2: interleave the stage-1 results so that each 128-bit half holds
    // one column of four consecutive rows (a..d or e..h).
    __m256 abcd04 = _mm256_unpacklo_ps(ac0145, bd0145);   // a0 b0 c0 d0 a4 b4 c4 d4
    __m256 abcd15 = _mm256_unpackhi_ps(ac0145, bd0145);   // a1 b1 c1 d1 a5 b5 c5 d5
    __m256 abcd26 = _mm256_unpacklo_ps(ac2367, bd2367);   // a2 b2 c2 d2 a6 b6 c6 d6
    __m256 abcd37 = _mm256_unpackhi_ps(ac2367, bd2367);   // a3 b3 c3 d3 a7 b7 c7 d7
    __m256 efgh04 = _mm256_unpacklo_ps(eg0145, fh0145);   // e0 f0 g0 h0 e4 f4 g4 h4
    __m256 efgh15 = _mm256_unpackhi_ps(eg0145, fh0145);   // e1 f1 g1 h1 e5 f5 g5 h5
    __m256 efgh26 = _mm256_unpacklo_ps(eg2367, fh2367);   // e2 f2 g2 h2 e6 f6 g6 h6
    __m256 efgh37 = _mm256_unpackhi_ps(eg2367, fh2367);   // e3 f3 g3 h3 e7 f7 g7 h7

    // Stage 3: combine 128-bit halves across registers. Selector
    // (2 << 4) | 0 takes the low half of each operand (columns 0..3);
    // (3 << 4) | 1 takes the high half of each operand (columns 4..7).
    __m256 abcdefgh0 = _mm256_permute2f128_ps(abcd04, efgh04, (2 << 4) | 0);
    __m256 abcdefgh4 = _mm256_permute2f128_ps(abcd04, efgh04, (3 << 4) | 1);
    __m256 abcdefgh1 = _mm256_permute2f128_ps(abcd15, efgh15, (2 << 4) | 0);
    __m256 abcdefgh5 = _mm256_permute2f128_ps(abcd15, efgh15, (3 << 4) | 1);
    __m256 abcdefgh2 = _mm256_permute2f128_ps(abcd26, efgh26, (2 << 4) | 0);
    __m256 abcdefgh6 = _mm256_permute2f128_ps(abcd26, efgh26, (3 << 4) | 1);
    __m256 abcdefgh3 = _mm256_permute2f128_ps(abcd37, efgh37, (2 << 4) | 0);
    __m256 abcdefgh7 = _mm256_permute2f128_ps(abcd37, efgh37, (3 << 4) | 1);

    // Store the transposed rows: output row k holds source column k.
    _mm256_store_ps(dstp +  0, abcdefgh0);
    _mm256_store_ps(dstp +  8, abcdefgh1);
    _mm256_store_ps(dstp + 16, abcdefgh2);
    _mm256_store_ps(dstp + 24, abcdefgh3);
    _mm256_store_ps(dstp + 32, abcdefgh4);
    _mm256_store_ps(dstp + 40, abcdefgh5);
    _mm256_store_ps(dstp + 48, abcdefgh6);
    _mm256_store_ps(dstp + 56, abcdefgh7);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment