Last active
October 14, 2023 22:57
-
-
Save degski/b5fbac1ec6c8200d1d8ad102f89df89f to your computer and use it in GitHub Desktop.
lane-crossing shift and rotate instructions in AVX2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// MIT License
//
// Copyright (c) 2018 degski
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#include "shift_rotate_avx2.hpp" | |
#ifdef __AVX2__ | |
// Shift the 256-bit value `a` left by `n` bits, for 0 <= n <= 63.
// Every 64-bit lane is shifted in place and the bits pushed out of the top of
// lane i are OR-ed into the bottom of lane i + 1.
// Original author's annotation on this function: "6" (meaning not stated in
// the source; possibly an instruction count).
__m256i left_shift_000_063 ( __m256i a, int n ) {
    // Per-lane shift; the top n bits of each lane are dropped here.
    const __m256i shifted = _mm256_slli_epi64 ( a, n );
    // The dropped bits, moved to the bottom of their own lane. For n == 0 the
    // count is 64, for which the intrinsic yields zero, i.e. no carry.
    const __m256i spill = _mm256_srli_epi64 ( a, 64 - n );
    // Move every spill one lane up: lane 1 <- 0, lane 2 <- 1, lane 3 <- 2.
    const __m256i moved = _mm256_permute4x64_epi64 ( spill, _MM_SHUFFLE ( 2, 1, 0, 0 ) );
    // Blend mask is one bit per 32-bit element: 0xFC keeps lanes 1..3 and
    // zeroes lane 0, which has no lower neighbour to receive a carry from.
    const __m256i carry = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), moved, 0xFC );
    return _mm256_or_si256 ( shifted, carry );
}
// Shift the 256-bit value `a` left by 64 + n bits, for 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Result lanes: [0] = 0, [1] = a0 << n, [2] = a1 << n | a0 >> (64 - n),
// [3] = a2 << n | a1 >> (64 - n).
// Original author's annotations on this function: "7" and "6" (meaning not
// stated in the source; possibly instruction counts).
__m256i left_shift_064_127 ( __m256i a, int n ) {
    const __m256i zero = _mm256_setzero_si256 ( );
    const __m256i high = _mm256_slli_epi64 ( a, n );
    // For n == 0 the count is 64 and the intrinsic yields zero (no carry).
    const __m256i low = _mm256_srli_epi64 ( a, 64 - n );
    // Shifted lanes move one lane up: lane 1 <- 0, 2 <- 1, 3 <- 2.
    const __m256i high_up = _mm256_permute4x64_epi64 ( high, _MM_SHUFFLE ( 2, 1, 0, 0 ) );
    // Carried bits move two lanes up: lane 2 <- 0, 3 <- 1.
    const __m256i low_up = _mm256_permute4x64_epi64 ( low, _MM_SHUFFLE ( 1, 0, 0, 0 ) );
    // Blend masks are one bit per 32-bit element.
    const __m256i body = _mm256_blend_epi32 ( zero, high_up, 0xFC );  // keep lanes 1..3
    const __m256i carry = _mm256_blend_epi32 ( zero, low_up, 0xF0 );  // keep lanes 2..3
    return _mm256_or_si256 ( body, carry );
}
// Shift the 256-bit value `a` left by 128 + n bits, for 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Result lanes: [0] = 0, [1] = 0, [2] = a0 << n,
// [3] = a1 << n | a0 >> (64 - n).
// Original author's annotation on this function: "7" (meaning not stated in
// the source; possibly an instruction count).
__m256i left_shift_128_191 ( __m256i a, int n ) {
    const __m256i zero = _mm256_setzero_si256 ( );
    const __m256i high = _mm256_slli_epi64 ( a, n );
    // For n == 0 the count is 64 and the intrinsic yields zero (no carry).
    const __m256i low = _mm256_srli_epi64 ( a, 64 - n );
    // Shifted lanes move two lanes up: lane 2 <- 0, 3 <- 1.
    const __m256i high_up = _mm256_permute4x64_epi64 ( high, _MM_SHUFFLE ( 1, 0, 0, 0 ) );
    // Only lane 3 receives a carry, and it comes from lane 0. (Selecting
    // lane 1 here would OR a1's top bits into the result instead of a0's.)
    const __m256i low_up = _mm256_permute4x64_epi64 ( low, _MM_SHUFFLE ( 0, 0, 0, 0 ) );
    // Blend masks are one bit per 32-bit element.
    const __m256i body = _mm256_blend_epi32 ( zero, high_up, 0xF0 );  // keep lanes 2..3
    const __m256i carry = _mm256_blend_epi32 ( zero, low_up, 0xC0 );  // keep lane 3
    return _mm256_or_si256 ( body, carry );
}
// Shift the 256-bit value `a` left by 192 + n bits, for 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Only lane 0 survives: result lane 3 = a0 << n, all other lanes are zero.
// Original author's annotation on this function: "5" (meaning not stated in
// the source; possibly an instruction count).
__m256i left_shift_192_255 ( __m256i a, int n ) {
    // Broadcast lane 0 into every lane so it is present in lane 3.
    const __m256i lane0 = _mm256_permute4x64_epi64 ( a, _MM_SHUFFLE ( 0, 0, 0, 0 ) );
    const __m256i shifted = _mm256_slli_epi64 ( lane0, n );
    // Blend mask is one bit per 32-bit element: 0xC0 keeps lane 3 only.
    return _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), shifted, 0xC0 );
}
__m256i _mm256_sli_si256 ( __m256i a, int n ) { | |
if ( n < 128 ) return n < 64 ? left_shift_000_063 ( a, n ) : left_shift_064_127 ( a, n % 64 ); | |
else return n < 192 ? left_shift_128_191 ( a, n % 64 ) : left_shift_192_255 ( a, n % 64 ); | |
} | |
// Logical right shift of the 256-bit value `a` by `n` bits, for 0 <= n <= 63.
// Every 64-bit lane is shifted in place and the bits pushed out of the bottom
// of lane i + 1 are OR-ed into the top of lane i.
// Original author's annotation on this function: "6" (meaning not stated in
// the source; possibly an instruction count).
__m256i right_shift_000_063 ( __m256i a, int n ) {
    // Per-lane shift; the bottom n bits of each lane are dropped here.
    const __m256i shifted = _mm256_srli_epi64 ( a, n );
    // The dropped bits, moved to the top of their own lane. For n == 0 the
    // count is 64, for which the intrinsic yields zero, i.e. no carry.
    const __m256i spill = _mm256_slli_epi64 ( a, 64 - n );
    // Move every spill one lane down: lane 0 <- 1, lane 1 <- 2, lane 2 <- 3.
    const __m256i moved = _mm256_permute4x64_epi64 ( spill, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    // Blend mask is one bit per 32-bit element: 0x3F keeps lanes 0..2 and
    // zeroes lane 3, which has no upper neighbour to receive a carry from.
    const __m256i carry = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), moved, 0x3F );
    return _mm256_or_si256 ( shifted, carry );
}
// Logical right shift of the 256-bit value `a` by 64 + n bits, 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Result lanes: [0] = a1 >> n | a2 << (64 - n), [1] = a2 >> n | a3 << (64 - n),
// [2] = a3 >> n, [3] = 0.
// Original author's annotation on this function: "7" (meaning not stated in
// the source; possibly an instruction count).
__m256i right_shift_064_127 ( __m256i a, int n ) {
    const __m256i zero = _mm256_setzero_si256 ( );
    const __m256i low = _mm256_srli_epi64 ( a, n );
    // For n == 0 the count is 64 and the intrinsic yields zero (no carry).
    const __m256i high = _mm256_slli_epi64 ( a, 64 - n );
    // Shifted lanes move one lane down: lane 0 <- 1, 1 <- 2, 2 <- 3.
    const __m256i low_down = _mm256_permute4x64_epi64 ( low, _MM_SHUFFLE ( 3, 3, 2, 1 ) );
    // Carried bits move two lanes down: lane 0 <- 2, 1 <- 3.
    const __m256i high_down = _mm256_permute4x64_epi64 ( high, _MM_SHUFFLE ( 3, 3, 3, 2 ) );
    // Blend masks are one bit per 32-bit element.
    const __m256i body = _mm256_blend_epi32 ( zero, low_down, 0x3F );    // keep lanes 0..2
    const __m256i carry = _mm256_blend_epi32 ( zero, high_down, 0x0F );  // keep lanes 0..1
    return _mm256_or_si256 ( body, carry );
}
// Logical right shift of the 256-bit value `a` by 128 + n bits, 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Result lanes: [0] = a2 >> n | a3 << (64 - n), [1] = a3 >> n, [2] = [3] = 0.
// Original author's annotation on this function: "7" (meaning not stated in
// the source; possibly an instruction count).
__m256i right_shift_128_191 ( __m256i a, int n ) {
    const __m256i zero = _mm256_setzero_si256 ( );
    const __m256i low = _mm256_srli_epi64 ( a, n );
    // For n == 0 the count is 64 and the intrinsic yields zero (no carry).
    const __m256i high = _mm256_slli_epi64 ( a, 64 - n );
    // Shifted lanes move two lanes down: lane 0 <- 2, 1 <- 3.
    const __m256i low_down = _mm256_permute4x64_epi64 ( low, _MM_SHUFFLE ( 3, 2, 3, 2 ) );
    // Only lane 0 receives a carry, taken from lane 3.
    const __m256i high_down = _mm256_permute4x64_epi64 ( high, _MM_SHUFFLE ( 3, 2, 1, 3 ) );
    // Blend masks are one bit per 32-bit element.
    const __m256i body = _mm256_blend_epi32 ( zero, low_down, 0x0F );    // keep lanes 0..1
    const __m256i carry = _mm256_blend_epi32 ( zero, high_down, 0x03 );  // keep lane 0
    return _mm256_or_si256 ( body, carry );
}
// Logical right shift of the 256-bit value `a` by 192 + n bits, 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Only lane 3 survives: result lane 0 = a3 >> n, all other lanes are zero.
// Original author's annotation on this function: "5" (meaning not stated in
// the source; possibly an instruction count).
__m256i right_shift_192_255 ( __m256i a, int n ) {
    // Bring lane 3 down to lane 0; the other lanes are discarded by the blend.
    const __m256i lane3 = _mm256_permute4x64_epi64 ( a, _MM_SHUFFLE ( 0, 0, 0, 3 ) );
    const __m256i shifted = _mm256_srli_epi64 ( lane3, n );
    // Blend mask is one bit per 32-bit element: 0x03 keeps lane 0 only.
    return _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), shifted, 0x03 );
}
__m256i _mm256_sri_si256 ( __m256i a, int n ) { | |
if ( n < 128 ) return n < 64 ? right_shift_000_063 ( a, n ) : right_shift_064_127 ( a, n % 64 ); | |
else return n < 192 ? right_shift_128_191 ( a, n % 64 ) : right_shift_192_255 ( a, n % 64 ); | |
} | |
// Rotate the 256-bit value `a` left by `n` bits, for 0 <= n <= 63.
// Every 64-bit lane is shifted in place; the bits pushed out of the top of
// lane i re-enter at the bottom of lane (i + 1) mod 4.
// Original author's annotation on this function: "5" (meaning not stated in
// the source; possibly an instruction count).
__m256i left_rotate_000_063 ( __m256i a, int n ) {
    const __m256i shifted = _mm256_slli_epi64 ( a, n );
    // For n == 0 the count is 64 and the intrinsic yields zero (no carry).
    const __m256i spill = _mm256_srli_epi64 ( a, 64 - n );
    // Rotate the spills one lane up: lane 0 <- 3, 1 <- 0, 2 <- 1, 3 <- 2.
    const __m256i wrapped = _mm256_permute4x64_epi64 ( spill, _MM_SHUFFLE ( 2, 1, 0, 3 ) );
    return _mm256_or_si256 ( shifted, wrapped );
}
// Rotate the 256-bit value `a` left by 64 + n bits, for 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Result lane i = a[i - 1] << n | a[i - 2] >> (64 - n), lane indices mod 4.
// Original author's annotation on this function: "6" (meaning not stated in
// the source; possibly an instruction count).
__m256i left_rotate_064_127 ( __m256i a, int n ) {
    const __m256i high = _mm256_slli_epi64 ( a, n );
    // For n == 0 the count is 64 and the intrinsic yields zero (no carry).
    const __m256i low = _mm256_srli_epi64 ( a, 64 - n );
    // Shifted lanes rotate one lane up: lane 0 <- 3, 1 <- 0, 2 <- 1, 3 <- 2.
    const __m256i high_rot = _mm256_permute4x64_epi64 ( high, _MM_SHUFFLE ( 2, 1, 0, 3 ) );
    // Carried bits rotate two lanes up: lane 0 <- 2, 1 <- 3, 2 <- 0, 3 <- 1.
    const __m256i low_rot = _mm256_permute4x64_epi64 ( low, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    return _mm256_or_si256 ( high_rot, low_rot );
}
// Rotate the 256-bit value `a` left by 128 + n bits, for 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Result lane i = a[i - 2] << n | a[i - 3] >> (64 - n), lane indices mod 4.
// Original author's annotation on this function: "6" (meaning not stated in
// the source; possibly an instruction count).
__m256i left_rotate_128_191 ( __m256i a, int n ) {
    const __m256i high = _mm256_slli_epi64 ( a, n );
    // For n == 0 the count is 64 and the intrinsic yields zero (no carry).
    const __m256i low = _mm256_srli_epi64 ( a, 64 - n );
    // Shifted lanes rotate two lanes up: lane 0 <- 2, 1 <- 3, 2 <- 0, 3 <- 1.
    const __m256i high_rot = _mm256_permute4x64_epi64 ( high, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    // Carried bits rotate three lanes up: lane 0 <- 1, 1 <- 2, 2 <- 3, 3 <- 0.
    const __m256i low_rot = _mm256_permute4x64_epi64 ( low, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    return _mm256_or_si256 ( high_rot, low_rot );
}
// Rotate the 256-bit value `a` left by 192 + n bits, for 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Result lane i = a[i - 3] << n | a[i] >> (64 - n), lane indices mod 4.
// Original author's annotation on this function: "5" (meaning not stated in
// the source; possibly an instruction count).
__m256i left_rotate_192_255 ( __m256i a, int n ) {
    const __m256i high = _mm256_slli_epi64 ( a, n );
    // After four lane moves the carried bits are back in their own lane, so
    // they need no permute. For n == 0 the count is 64 and this is zero.
    const __m256i low = _mm256_srli_epi64 ( a, 64 - n );
    // Shifted lanes rotate three lanes up: lane 0 <- 1, 1 <- 2, 2 <- 3, 3 <- 0.
    const __m256i high_rot = _mm256_permute4x64_epi64 ( high, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    return _mm256_or_si256 ( low, high_rot );
}
__m256i _mm256_rli_si256 ( __m256i a, int n ) { | |
if ( n < 128 ) return n < 64 ? left_rotate_000_063 ( a, n ) : left_rotate_064_127 ( a, n % 64 ); | |
else return n < 192 ? left_rotate_128_191 ( a, n % 64 ) : left_rotate_192_255 ( a, n % 64 ); | |
} | |
// Rotate the 256-bit value `a` right by `n` bits, for 0 <= n <= 63.
// Every 64-bit lane is shifted in place; the bits pushed out of the bottom of
// lane i re-enter at the top of lane (i - 1) mod 4.
// Original author's annotation on this function: "5" (meaning not stated in
// the source; possibly an instruction count).
__m256i right_rotate_000_063 ( __m256i a, int n ) {
    const __m256i shifted = _mm256_srli_epi64 ( a, n );
    // For n == 0 the count is 64 and the intrinsic yields zero (no carry).
    const __m256i spill = _mm256_slli_epi64 ( a, 64 - n );
    // Rotate the spills one lane down: lane 0 <- 1, 1 <- 2, 2 <- 3, 3 <- 0.
    const __m256i wrapped = _mm256_permute4x64_epi64 ( spill, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    return _mm256_or_si256 ( shifted, wrapped );
}
// Rotate the 256-bit value `a` right by 64 + n bits, for 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Result lane i = a[i + 1] >> n | a[i + 2] << (64 - n), lane indices mod 4.
// Original author's annotation on this function: "6" (meaning not stated in
// the source; possibly an instruction count).
__m256i right_rotate_064_127 ( __m256i a, int n ) {
    const __m256i low = _mm256_srli_epi64 ( a, n );
    // For n == 0 the count is 64 and the intrinsic yields zero (no carry).
    const __m256i high = _mm256_slli_epi64 ( a, 64 - n );
    // Shifted lanes rotate one lane down: lane 0 <- 1, 1 <- 2, 2 <- 3, 3 <- 0.
    const __m256i low_rot = _mm256_permute4x64_epi64 ( low, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    // Carried bits rotate two lanes down: lane 0 <- 2, 1 <- 3, 2 <- 0, 3 <- 1.
    const __m256i high_rot = _mm256_permute4x64_epi64 ( high, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    return _mm256_or_si256 ( low_rot, high_rot );
}
// Rotate the 256-bit value `a` right by 128 + n bits, for 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Result lane i = a[i + 2] >> n | a[i + 3] << (64 - n), lane indices mod 4.
// Original author's annotation on this function: "6" (meaning not stated in
// the source; possibly an instruction count).
__m256i right_rotate_128_191 ( __m256i a, int n ) {
    const __m256i low = _mm256_srli_epi64 ( a, n );
    // For n == 0 the count is 64 and the intrinsic yields zero (no carry).
    const __m256i high = _mm256_slli_epi64 ( a, 64 - n );
    // Shifted lanes rotate two lanes down: lane 0 <- 2, 1 <- 3, 2 <- 0, 3 <- 1.
    const __m256i low_rot = _mm256_permute4x64_epi64 ( low, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    // Carried bits rotate three lanes down: lane 0 <- 3, 1 <- 0, 2 <- 1, 3 <- 2.
    const __m256i high_rot = _mm256_permute4x64_epi64 ( high, _MM_SHUFFLE ( 2, 1, 0, 3 ) );
    return _mm256_or_si256 ( low_rot, high_rot );
}
// Rotate the 256-bit value `a` right by 192 + n bits, for 0 <= n <= 63.
// `n` is the residual bit count; the dispatcher in this file passes n % 64.
// Result lane i = a[i + 3] >> n | a[i] << (64 - n), lane indices mod 4.
// Original author's annotation on this function: "5" (meaning not stated in
// the source; possibly an instruction count).
__m256i right_rotate_192_255 ( __m256i a, int n ) {
    const __m256i low = _mm256_srli_epi64 ( a, n );
    // After four lane moves the carried bits are back in their own lane, so
    // they need no permute. For n == 0 the count is 64 and this is zero.
    const __m256i high = _mm256_slli_epi64 ( a, 64 - n );
    // Shifted lanes rotate three lanes down: lane 0 <- 3, 1 <- 0, 2 <- 1, 3 <- 2.
    const __m256i low_rot = _mm256_permute4x64_epi64 ( low, _MM_SHUFFLE ( 2, 1, 0, 3 ) );
    return _mm256_or_si256 ( high, low_rot );
}
__m256i _mm256_rri_si256 ( __m256i a, int n ) { | |
if ( n < 128 ) return n < 64 ? right_rotate_000_063 ( a, n ) : right_rotate_064_127 ( a, n % 64 ); | |
else return n < 192 ? right_rotate_128_191 ( a, n % 64 ) : right_rotate_192_255 ( a, n % 64 ); | |
} | |
#endif // __AVX2__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// MIT License
//
// Copyright (c) 2018 degski
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#pragma once
#include <immintrin.h>
#ifdef __AVX2__
// Lane-crossing shifts and rotates that treat a __m256i as one 256-bit
// integer. In each function `a` is the value and `n` the bit count; the
// implementation dispatches on the ranges 0-63, 64-127, 128-191 and 192-255,
// so `n` is expected to lie in 0..255.

// Logical left shift of `a` by `n` bits (zeros enter at bit 0).
__m256i _mm256_sli_si256 ( __m256i a, int n );
// Logical right shift of `a` by `n` bits (zeros enter at bit 255).
__m256i _mm256_sri_si256 ( __m256i a, int n );
// Rotate `a` left by `n` bits (bits leaving bit 255 re-enter at bit 0).
__m256i _mm256_rli_si256 ( __m256i a, int n );
// Rotate `a` right by `n` bits (bits leaving bit 0 re-enter at bit 255).
__m256i _mm256_rri_si256 ( __m256i a, int n );
#endif // __AVX2__
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment