Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save crazyguitar/88592a48b62f97c0ec1d7a6fb2ba7d6e to your computer and use it in GitHub Desktop.
Save crazyguitar/88592a48b62f97c0ec1d7a6fb2ba7d6e to your computer and use it in GitHub Desktop.
[AVX2 matrix transpose] A double-precision 4x4 matrix transpose kernel implemented with AVX2 intrinsics. Compile this code with the -mavx2 flag. #NLA
#include <iostream>
#include <immintrin.h>
/// Transposes one 4x4 tile of doubles using 256-bit AVX registers.
///
/// Reads four rows of four doubles starting at A (row r begins at A + r * lda)
/// and writes the transposed tile to B (row r begins at B + r * ldb).
///
/// @param A   pointer to the first element of the source tile
/// @param lda distance, in doubles, between consecutive source rows
/// @param B   pointer to the first element of the destination tile
/// @param ldb distance, in doubles, between consecutive destination rows
///
/// Unaligned load/store intrinsics are used so that A, B and the row strides
/// do not have to be 32-byte aligned; the aligned variants fault on addresses
/// that are not multiples of 32 bytes.
///
/// All four rows are loaded before anything is stored, so calling this with
/// B == A (in-place transpose) is safe.
inline void transpose_kernel(double *A, int lda, double *B, int ldb) {
  // Source rows, named a..d so the lane comments below are easy to follow.
  const __m256d a = _mm256_loadu_pd(A + 0 * lda);  // a0 a1 a2 a3
  const __m256d b = _mm256_loadu_pd(A + 1 * lda);  // b0 b1 b2 b3
  const __m256d c = _mm256_loadu_pd(A + 2 * lda);  // c0 c1 c2 c3
  const __m256d d = _mm256_loadu_pd(A + 3 * lda);  // d0 d1 d2 d3

  // Step 1: interleave pairs of rows inside each 128-bit lane.
  // Mask 0x0 picks the even element of every lane, 0xF picks the odd one.
  const __m256d ab_even = _mm256_shuffle_pd(a, b, 0x0);  // a0 b0 a2 b2
  const __m256d ab_odd  = _mm256_shuffle_pd(a, b, 0xF);  // a1 b1 a3 b3
  const __m256d cd_even = _mm256_shuffle_pd(c, d, 0x0);  // c0 d0 c2 d2
  const __m256d cd_odd  = _mm256_shuffle_pd(c, d, 0xF);  // c1 d1 c3 d3

  // Step 2: recombine 128-bit lanes across registers.
  // 0x20 joins the two low lanes, 0x31 joins the two high lanes.
  const __m256d col0 = _mm256_permute2f128_pd(ab_even, cd_even, 0x20);  // a0 b0 c0 d0
  const __m256d col1 = _mm256_permute2f128_pd(ab_odd,  cd_odd,  0x20);  // a1 b1 c1 d1
  const __m256d col2 = _mm256_permute2f128_pd(ab_even, cd_even, 0x31);  // a2 b2 c2 d2
  const __m256d col3 = _mm256_permute2f128_pd(ab_odd,  cd_odd,  0x31);  // a3 b3 c3 d3

  // Column k of the source becomes row k of the destination.
  _mm256_storeu_pd(B + 0 * ldb, col0);
  _mm256_storeu_pd(B + 1 * ldb, col1);
  _mm256_storeu_pd(B + 2 * ldb, col2);
  _mm256_storeu_pd(B + 3 * ldb, col3);
}
/// Demo driver: fills a 4x4 row-major matrix with 1..16, transposes it in
/// place with transpose_kernel, and prints the result as tab-separated rows.
int main() {
  // Align to 32 bytes so the array is safe to access with aligned 256-bit
  // AVX loads and stores; a plain double array only guarantees 8-byte
  // alignment.
  alignas(32) double A[16];
  for (int i = 0; i < 16; ++i) {
    A[i] = i + 1;
  }

  // Source and destination are the same buffer: the transpose is in place.
  transpose_kernel(A, 4, A, 4);

  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 4; ++j) {
      std::cout << A[i * 4 + j] << "\t";
    }
    // '\n' instead of std::endl: same output without flushing on every row.
    std::cout << '\n';
  }
  return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment