Counting the trailing zero bit count (TZCNT) can be done by isolating the lowest bit, then depositing this into the appropriate locations for the count. The leading zero bit count (LZCNT) can be done by reversing bits, then computing the TZCNT.
__m128i _mm_tzcnt_epi8(__m128i a) {
    // Per-byte trailing zero count.
    // Keep only the lowest set bit of each byte: a & ~(a - 1)
    __m128i lowest = _mm_andnot_si128(_mm_sub_epi8(a, _mm_set1_epi8(1)), a);
    // Map that single bit to its index via an affine transform; the 0x08
    // affine constant flips bit 3, so a zero byte yields 8 (like TZCNT).
    return _mm_gf2p8affine_epi64_epi8(lowest, _mm_set1_epi64x(0xaaccf0ff00000000ULL), 8);
    /*
    pcmpeqb xmm1, xmm1
    paddb xmm1, xmm0
    pandn xmm1, xmm0
    gf2p8affineqb xmm1, [magic], 0x8
    */
}
__m128i _mm_lzcnt_epi8(__m128i a) {
    // Per-byte leading zero count: bit-reverse each byte so leading zeroes
    // become trailing zeroes, then reuse the TZCNT routine.
    __m128i reversed = _mm_gf2p8affine_epi64_epi8(a, _mm_set1_epi64x(0x8040201008040201ULL), 0);
    return _mm_tzcnt_epi8(reversed);
}
// bonus: count leading sign bits
__m128i _mm_lscnt_epi8(__m128i a) {
    // Per-byte leading sign-bit count: bit-reverse AND complement each byte
    // in one affine op (constant 0xff flips all bits), then count trailing zeroes.
    __m128i flipped = _mm_gf2p8affine_epi64_epi8(a, _mm_set1_epi64x(0x8040201008040201ULL), 0xff);
    return _mm_tzcnt_epi8(flipped);
}
// bonus: inverted-index version of lzcnt (like the BSR instruction)
__m128i _mm_bsr_epi8(__m128i a) {
    // Per-byte bit-scan-reverse (index of highest set bit).
    // Bit-reverse each byte so the highest set bit becomes the lowest.
    __m128i rev = _mm_gf2p8affine_epi64_epi8(a, _mm_set1_epi64x(0x8040201008040201ULL), 0);
    // Keep only the lowest set bit: rev & ~(rev - 1)
    rev = _mm_andnot_si128(_mm_sub_epi8(rev, _mm_set1_epi8(1)), rev);
    // Translate that bit into its inverted index (7 - position in the
    // reversed byte = original bit index); zero bytes give 8.
    return _mm_gf2p8affine_epi64_epi8(rev, _mm_set1_epi64x(0x55330fff00000000ULL), 8);
    /*
    gf2p8affineqb xmm0, [reverse], 0
    pcmpeqb xmm1, xmm1
    paddb xmm1, xmm0
    pandn xmm1, xmm0
    gf2p8affineqb xmm1, [magic], 0x8
    */
}
This might not seem so out-of-band, and you might even point out that there’s a GF2P8MULB
instruction in the same extension, however this use case may not be so obvious, and has its benefits.
A key problem of the GF2P8MULB
instruction is that it only applies to GF(2^8) with the polynomial 0x11B. Multiplying via affine, however, does not have this limitation.
The limitations of multiplying via affine are that all values in an 8-byte group must be multiplied by the same coefficient, and you’ll need to compute the required matrices (or have them pre-computed).
Example: multiply a vector of bytes by a constant b
in GF(2^8) with polynomial 0x11d (commonly used in error correction, e.g. RAID6):
// Pre-computed GF2P8AFFINEQB matrices: entry b is the 8x8 bit-matrix that
// multiplies a byte by the constant b in GF(2^8) with polynomial 0x11d.
// Note the matrices are XOR-linear in b: matrices[b1 ^ b2] == matrices[b1] ^ matrices[b2].
// (Fix: two literals were split across line breaks in the original text,
// which made the initializer invalid C; the tokens have been rejoined.)
static const uint64_t gf2p8_11d_mul_matrices[256] = {
0,0x102040810204080ULL,0x8001828488102040ULL,0x8103868c983060c0ULL,0x408041c2c4881020ULL,0x418245cad4a850a0ULL,0xc081c3464c983060ULL,0xc183c74e5cb870e0ULL,0x2040a061e2c48810ULL,0x2142a469f2e4c890ULL,0xa04122e56ad4a850ULL,0xa14326ed7af4e8d0ULL,0x60c0e1a3264c9830ULL,0x61c2e5ab366cd8b0ULL,0xe0c16327ae5cb870ULL,0xe1c3672fbe7cf8f0ULL,0x102050b071e2c488ULL,0x112254b861c28408ULL,0x9021d234f9f2e4c8ULL,0x9123d63ce9d2a448ULL,0x50a01172b56ad4a8ULL,0x51a2157aa54a9428ULL,0xd0a193f63d7af4e8ULL,0xd1a397fe2d5ab468ULL,0x3060f0d193264c98ULL,0x3162f4d983060c18ULL,0xb06172551b366cd8ULL,0xb163765d0b162c58ULL,0x70e0b11357ae5cb8ULL,0x71e2b51b478e1c38ULL,0xf0e13397dfbe7cf8ULL,0xf1e3379fcf9e3c78ULL,0x8810a8d83871e2c4ULL,0x8912acd02851a244ULL,0x8112a5cb061c284ULL,0x9132e54a0418204ULL,0xc890e91afcf9f2e4ULL,0xc992ed12ecd9b264ULL,0x48916b9e74e9d2a4ULL,0x49936f9664c99224ULL,0xa85008b9dab56ad4ULL,0xa9520cb1ca952a54ULL,0x28518a3d52a54a94ULL,0x29538e3542850a14ULL,0xe8d0497b1e3d7af4ULL,0xe9d24d730e1d3a74ULL,0x68d1cbff962d5ab4ULL,0x69d3cff7860d1a34ULL,0x9830f8684993264cULL,0x9932fc6059b366ccULL,0x18317aecc183060cULL,0x19337ee4d1a3468cULL,0xd8b0b9aa8d1b366cULL,0xd9b2bda29d3b76ecULL,0x58b13b2e050b162cULL,0x59b33f26152b56acULL,0xb8705809ab57ae5cULL,0xb9725c01bb77eedcULL,0x3871da8d23478e1cULL,0x3973de853367ce9cULL,0xf8f019cb6fdfbe7cULL,0xf9f21dc37ffffefcULL,0x78f19b4fe7cf9e3cULL,0x79f39f47f7efdebcULL,0xc488d46c1c3871e2ULL,0xc58ad0640c183162ULL,0x448956e8942851a2ULL,0x458b52e084081122ULL,0x840895aed8b061c2ULL,0x850a91a6c8902142ULL,0x409172a50a04182ULL,0x50b132240800102ULL,0xe4c8740dfefcf9f2ULL,0xe5ca7005eedcb972ULL,0x64c9f68976ecd9b2ULL,0x65cbf28166cc9932ULL,0xa44835cf3a74e9d2ULL,0xa54a31c72a54a952ULL,0x2449b74bb264c992ULL,0x254bb343a2448912ULL,0xd4a884dc6ddab56aULL,0xd5aa80d47dfaf5eaULL,0x54a90658e5ca952aULL,0x55ab0250f5ead5aaULL,0x9428c51ea952a54aULL,0x952ac116b972e5caULL,0x1429479a2142850aULL,0x152b43923162c58aULL,0xf4e824bd8f1e3d7aULL,0xf5ea20b59f3e7dfaULL,0x74e9a639070e1d3aULL,0x75eba231172e5dbaULL,0xb468657f4b962d5aULL,0xb56a61775bb66ddaULL,0x3469e7fbc3860d1aULL,0x356be3f3d3a64d9aULL,0x4c987cb424499326ULL,0x4d9a78bc3469d3a6ULL,0xcc99fe30ac59b366ULL,0xcd9bfa38bc79f3e6ULL,0xc183d76e0c18306ULL,0xd1a397ef0e1c386ULL,0x8c19bff268d1a346ULL,0x8d1bbbfa78f1e3c6ULL,0x6cd8dcd5c68d1b36ULL,0x6ddad8ddd6ad5bb6ULL,0xecd95e514e9d3b76ULL,0xeddb5a595ebd7bf6ULL,0x2c589d1702050b16ULL,0x2d5a991f12254b96ULL,0xac591f938a152b56ULL,0xad5b1b9b9a356bd6ULL,0x5cb82c0455ab57aeULL,0x5dba280c458b172eULL,0xdcb9ae80ddbb77eeULL,0xddbbaa88cd9b376eULL,0x1c386dc69123478eULL,0x1d3a69ce8103070eULL,0x9c39ef42193367ceULL,0x9d3beb4a0913274eULL,0x7cf88c65b76fdfbeULL,0x7dfa886da74f9f3eULL,0xfcf90ee13f7ffffeULL,0xfdfb0ae92f5fbf7eULL,0x3c78cda773e7cf9eULL,0x3d7ac9af63c78f1eULL,0xbc794f23fbf7efdeULL,0xbd7b4b2bebd7af5eULL,0xe2c46a368e1c3871ULL,0xe3c66e3e9e3c78f1ULL,0x62c5e8b2060c1831ULL,0x63c7ecba162c58b1ULL,0xa2442bf44a942851ULL,0xa3462ffc5ab468d1ULL,0x2245a970c2840811ULL,0x2347ad78d2a44891ULL,0xc284ca576cd8b061ULL,0xc386ce5f7cf8f0e1ULL,0x428548d3e4c89021ULL,0x43874cdbf4e8d0a1ULL,0x82048b95a850a041ULL,0x83068f9db870e0c1ULL,0x205091120408001ULL,0x3070d193060c081ULL,0xf2e43a86fffefcf9ULL,0xf3e63e8eefdebc79ULL,0x72e5b80277eedcb9ULL,0x73e7bc0a67ce9c39ULL,0xb2647b443b76ecd9ULL,0xb3667f4c2b56ac59ULL,0x3265f9c0b366cc99ULL,0x3367fdc8a3468c19ULL,0xd2a49ae71d3a74e9ULL,0xd3a69eef0d1a3469ULL,0x52a51863952a54a9ULL,0x53a71c6b850a1429ULL,0x9224db25d9b264c9ULL,0x9326df2dc9922449ULL,0x122559a151a24489ULL,0x13275da941820409ULL,0x6ad4c2eeb66ddab5ULL,0x6bd6c6e6a64d9a35ULL,0xead5406a3e7dfaf5ULL,0xebd744622e5dba75ULL,0x2a54832c72e5ca95ULL,0x2b56872462c58a15ULL,0xaa5501a8faf5ead5ULL,0xab5705a0ead5aa55ULL,0x4a94628f54a952a5ULL,0x4b96668744891225ULL,0xca95e00bdcb972e5ULL,0xcb97e403cc993265ULL,0xa14234d90214285ULL,0xb16274580010205ULL,0x8a15a1c9183162c5ULL,0x8b17a5c108112245ULL,0x7af4925ec78f1e3dULL,0x7bf69656d7af5ebdULL,0xfaf510da4f9f3e7dULL,0xfbf714d25fbf7efdULL,0x3a74d39c03070e1dULL,0x3b76d79413274e9dULL,0xba7551188b172e5dULL,0xbb7755109b376eddULL,0x5ab4323f254b962dULL,0x5bb63637356bd6adULL,0xdab5b0bbad5bb66dULL,0xdbb7b4b3bd7bf6edULL,0x1a3473fde1c3860dULL,0x1b3677f5f1e3c68dULL,0x9a35f17969d3a64dULL,0x9b37f57179f3e6cdULL,0x264cbe5a92244993ULL,0x274eba5282040913ULL,0xa64d3cde1a3469d3ULL,0xa74f38d60a142953ULL,0x66ccff9856ac59b3ULL,0x67cefb90468c1933ULL,0xe6cd7d1cdebc79f3ULL,0xe7cf7914ce9c3973ULL,0x60c1e3b70e0c183ULL,0x70e1a3360c08103ULL,0x860d9cbff8f0e1c3ULL,0x870f98b7e8d0a143ULL,0x468c5ff9b468d1a3ULL,0x478e5bf1a4489123ULL,0xc68ddd7d3c78f1e3ULL,0xc78fd9752c58b163ULL,0x366ceeeae3c68d1bULL,0x376eeae2f3e6cd9bULL,0xb66d6c6e6bd6ad5bULL,0xb76f68667bf6eddbULL,0x76ecaf28274e9d3bULL,0x77eeab20376eddbbULL,0xf6ed2dacaf5ebd7bULL,0xf7ef29a4bf7efdfbULL,0x162c4e8b0102050bULL,0x172e4a831122458bULL,0x962dcc0f8912254bULL,0x972fc807993265cbULL,0x56ac0f49c58a152bULL,0x57ae0b41d5aa55abULL,0xd6ad8dcd4d9a356bULL,0xd7af89c55dba75ebULL,0xae5c1682aa55ab57ULL,0xaf5e128aba75ebd7ULL,0x2e5d940622458b17ULL,0x2f5f900e3265cb97ULL,0xeedc57406eddbb77ULL,0xefde53487efdfbf7ULL,0x6eddd5c4e6cd9b37ULL,0x6fdfd1ccf6eddbb7ULL,0x8e1cb6e348912347ULL,0x8f1eb2eb58b163c7ULL,0xe1d3467c0810307ULL,0xf1f306fd0a14387ULL,0xce9cf7218c193367ULL,0xcf9ef3299c3973e7ULL,0x4e9d75a504091327ULL,0x4f9f71ad142953a7ULL,0xbe7c4632dbb76fdfULL,0xbf7e423acb972f5fULL,0x3e7dc4b653a74f9fULL,0x3f7fc0be43870f1fULL,0xfefc07f01f3f7fffULL,0xfffe03f80f1f3f7fULL,0x7efd8574972f5fbfULL,0x7fff817c870f1f3fULL,0x9e3ce6533973e7cfULL,0x9f3ee25b2953a74fULL,0x1e3d64d7b163c78fULL,0x1f3f60dfa143870fULL,0xdebca791fdfbf7efULL,0xdfbea399eddbb76fULL,0x5ebd251575ebd7afULL,0x5fbf211d65cb972fULL
};
__m128i _mm_gf2p8mul_11d_epi8(__m128i a, uint8_t b) {
    // Multiply every byte of a by the constant b in GF(2^8), polynomial 0x11d:
    // look up the pre-computed bit-matrix for b, broadcast it, apply it.
    const uint64_t matrix = gf2p8_11d_mul_matrices[b];
    return _mm_gf2p8affine_epi64_epi8(a, _mm_set1_epi64x((long long)matrix), 0);
    /*
    movddup xmm1, [gf2p8_11d_mul_matrices + b*8]
    gf2p8affineqb xmm0, xmm1, 0
    # OR, with EVEX broadcast:
    vgf2p8affineqb xmm0, [gf2p8_11d_mul_matrices + b*8]{1to2}, 0
    */
}
GF(2^16) multiplication would require a 16x16 bit matrix; however, this can be constructed from four 8x8 bit matrices. As such, this technique can expand to pretty much any field size, and is currently the most performant solution for large region constant multiplication.
If you’ve got packed 2-bit integers, you can do some basic arithmetic with constants without needing to unpack.
__m128i _mm_add1_epi2(__m128i a) { // same as sub3
    // Add 1 to each packed 2-bit lane (mod 4), without unpacking:
    // affine matrix plus constant 0x55 implements the +1 carry pattern.
    return _mm_gf2p8affine_epi64_epi8(a, _mm_set1_epi64x(0x0103040c103040c0ULL), 0x55);
}
__m128i _mm_add2_epi2(__m128i a) { // same as sub2
    // Adding 2 mod 4 just toggles the high bit of every 2-bit lane.
    const __m128i high_bits = _mm_set1_epi8((char)0xaa);
    return _mm_xor_si128(a, high_bits);
}
__m128i _mm_add3_epi2(__m128i a) { // same as sub1
    // Add 3 to each packed 2-bit lane (mod 4): the +1 matrix with all
    // result bits flipped via the 0xff affine constant.
    return _mm_gf2p8affine_epi64_epi8(a, _mm_set1_epi64x(0x0103040c103040c0ULL), 0xff);
}
__m128i _mm_1sub_epi2(__m128i a) { // 1-x
    // 1 - x mod 4 is equivalent to toggling the low bit of every 2-bit lane.
    const __m128i low_bits = _mm_set1_epi8(0x55);
    return _mm_xor_si128(a, low_bits);
}
__m128i _mm_2sub_epi2(__m128i a) { // 2-x
    // 2 - x per 2-bit lane: negate via the affine matrix, then add 2 by
    // flipping the high bits (affine constant 0xaa).
    return _mm_gf2p8affine_epi64_epi8(a, _mm_set1_epi64x(0x0103040c103040c0ULL), 0xaa);
}
__m128i _mm_3sub_epi2(__m128i a) { // 3-x
    // 3 - x mod 4 is a bitwise NOT of each 2-bit lane.
    const __m128i all_ones = _mm_set1_epi8(-1);
    return _mm_xor_si128(a, all_ones);
}
__m128i _mm_mul2_epi2(__m128i a) {
    // Multiply each packed 2-bit lane by 2 (mod 4): shift the low bit of
    // each lane into the high bit position via an affine transform.
    return _mm_gf2p8affine_epi64_epi8(a, _mm_set1_epi64x(0x0001000400100080ULL), 0);
}
__m128i _mm_mul3_epi2(__m128i a) { // same as 0-x
    // Multiply each packed 2-bit lane by 3 (mod 4), i.e. negate it.
    return _mm_gf2p8affine_epi64_epi8(a, _mm_set1_epi64x(0x0103040c103040c0ULL), 0);
}
Although it doesn’t use the affine instruction, the GF2P8MULB instruction can be used for variable per-byte left-shifts:
__m128i _mm_sllv_epi8(__m128i a, __m128i count) {
    // Per-byte variable left shift using GF2P8MULB as a carry-less multiply.
    // Mask off the bits that would be shifted out (pshufb also zeroes bytes
    // whose count has bit 7 set, and counts 8..15 index the zero half).
    __m128i keep = _mm_shuffle_epi8(_mm_set_epi32(0, 0, 0x0103070f, 0x1f3f7fff), count);
    // Look up 1 << count as the GF multiplier; with the overflow bits
    // masked away, no polynomial reduction occurs, so this is a pure shift.
    __m128i mult = _mm_shuffle_epi8(_mm_set_epi32(0, 0, 0x80402010, 0x08040201), count);
    return _mm_gf2p8mul_epi8(_mm_and_si128(a, keep), mult);
    /*
    movdqa xmm2, [mask_tbl]
    pshufb xmm2, xmm1
    pand xmm0, xmm2
    movdqa xmm2, [mult_tbl]
    pshufb xmm2, xmm1
    gf2p8mulb xmm0, xmm2
    */
}
__m128i _mm_srlv_epi8(__m128i a, __m128i count) {
    // Per-byte variable right shift: bit-reverse each byte, do a variable
    // left shift, then reverse back.
    // I can't think of a faster way than reversing the bits twice :(
    a = _mm_gf2p8affine_epi64_epi8(a, _mm_set_epi32(0x80402010, 0x08040201, 0x80402010, 0x08040201), 0);
    a = _mm_sllv_epi8(a, count);
    a = _mm_gf2p8affine_epi64_epi8(a, _mm_set_epi32(0x80402010, 0x08040201, 0x80402010, 0x08040201), 0);
    return a;
#if 0 // alternative: if AVX512 is available, this is probably better
      // (was unreachable code after the return above; kept for reference)
    __m128i mask = _mm_set1_epi16(0xff); // alternatively, this can be implemented using a mask register instead
    __m128i lo = _mm_srlv_epi16(_mm_and_si128(a, mask), _mm_and_si128(count, mask));
    __m128i hi = _mm_srlv_epi16(a, _mm_srli_epi16(count, 8));
    return _mm_ternarylogic_epi32(lo, hi, mask, 0xe4); // same as, but generally faster than, _mm_blendv_epi8(hi, lo, mask)
    /*
    vpcmpeqw xmm2, xmm2, xmm2
    vpsrlw xmm3, xmm1, 8
    vpsrlw xmm2, xmm2, 8
    vpsrlvw xmm3, xmm0, xmm3
    vpand xmm0, xmm0, xmm2
    vpand xmm1, xmm1, xmm2
    vpsrlvw xmm0, xmm0, xmm1
    vpternlogd xmm0, xmm3, xmm2, 0xe4
    */
#endif
}
// bonus: variable rotates using AVX512
__m128i _mm_rorv_epi8(__m128i a, __m128i count) {
    // Per-byte variable rotate-right (AVX512BW + GFNI): decompose the rotate
    // amount into its 4/2/1 bits and conditionally apply a fixed rotate for
    // each bit via a masked affine transform.
    // rotate by 4
    a = _mm_mask_gf2p8affine_epi64_epi8(
        a, _mm_test_epi8_mask(count, _mm_set1_epi8(4)),
        a, _mm_set_epi32(0x10204080, 0x01020408, 0x10204080, 0x01020408),
        0
    );
    // rotate by 2
    a = _mm_mask_gf2p8affine_epi64_epi8(
        a, _mm_test_epi8_mask(count, _mm_set1_epi8(2)),
        a, _mm_set_epi32(0x04081020, 0x40800102, 0x04081020, 0x40800102),
        0
    );
    // rotate by 1
    a = _mm_mask_gf2p8affine_epi64_epi8(
        a, _mm_test_epi8_mask(count, _mm_set1_epi8(1)),
        a, _mm_set_epi32(0x02040810, 0x20408001, 0x02040810, 0x20408001),
        0
    );
    return a;
    /*
    vptestmb k1, xmm1, [vec4]
    vgf2p8affineqb xmm0 {k1}, xmm0, [rot4], 0
    vptestmb k1, xmm1, [vec2]
    vgf2p8affineqb xmm0 {k1}, xmm0, [rot2], 0
    vptestmb k1, xmm1, [vec1]
    vgf2p8affineqb xmm0 {k1}, xmm0, [rot1], 0
    */
#if 0 // alternative using VPMULTISHIFTQB -- is this better?
      // (was unreachable code after the return above; kept for reference)
    __m128i lo = _mm_shuffle_epi8(a, _mm_set_epi32(0x0e0e0c0c, 0x0a0a0808, 0x06060404, 0x02020000));
    __m128i hi = _mm_shuffle_epi8(a, _mm_set_epi32(0x0f0d0d0b, 0x0b09090f, 0x07050503, 0x03010107));
    count = _mm_ternarylogic_epi32(count, _mm_set1_epi8(7), _mm_set_epi32(0x38302820, 0x18100800, 0x38302820, 0x18100800), 0xea); // (count & 7) | magic
    // if count is always <= 7, you can use the following line instead of the above
    //count = _mm_or_si128(count, _mm_set_epi32(0x38302820, 0x18100800, 0x38302820, 0x18100800));
    __m128i result = _mm_multishift_epi64_epi8(count, lo);
    return _mm_mask_multishift_epi64_epi8(result, 0xaaaa, count, hi);
    /*
    mov r8w, 0xaaaa
    kmovw k1, r8w
    vpshufb xmm2, xmm0, [shuf1]
    vmovdqa xmm3, [vec7]
    vpternlogd xmm1, xmm3, [positions]
    vpmultishiftqb xmm2, xmm1, xmm2
    vpshufb xmm0, xmm0, [shuf2]
    vpmultishiftqb xmm2 {k1}, xmm1, xmm0
    */
#endif
}
__m128i _mm_rolv_epi8(__m128i a, __m128i count) {
    // Per-byte variable rotate-left: conditionally apply fixed rotates of
    // 1, 2 and 4 wherever the corresponding bit of count is set.
    // (The steps commute, so the order doesn't matter.)
    a = _mm_mask_gf2p8affine_epi64_epi8(
        a, _mm_test_epi8_mask(count, _mm_set1_epi8(1)),
        a, _mm_set1_epi64x(0x8001020408102040ULL), 0);  // rotate by 1
    a = _mm_mask_gf2p8affine_epi64_epi8(
        a, _mm_test_epi8_mask(count, _mm_set1_epi8(2)),
        a, _mm_set1_epi64x(0x4080010204081020ULL), 0);  // rotate by 2
    a = _mm_mask_gf2p8affine_epi64_epi8(
        a, _mm_test_epi8_mask(count, _mm_set1_epi8(4)),
        a, _mm_set1_epi64x(0x1020408001020408ULL), 0);  // rotate by 4
    return a;
}
@animetosho Thanks a bunch! Looking through it!