Your task is to convert a given C code SIMD to WASM SIMD. Here is an example of another function:
void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q4_K * restrict x = vx;
const block_q8_K * restrict y = vy;
const int nb = n / QK_K;
static const uint32_t kmask1 = 0x3f3f3f3f;
static const uint32_t kmask2 = 0x0f0f0f0f;
static const uint32_t kmask3 = 0x03030303;
uint32_t utmp[4];
#ifdef __ARM_NEON
const uint8x16_t m4b = vdupq_n_u8(0xf);
const int32x4_t mzero = vdupq_n_s32(0);
ggml_int8x16x2_t q4bytes;
ggml_int8x16x2_t q8bytes;
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const int16x8_t q8sums = vpaddq_s16(vld1q_s16(y[i].bsums), vld1q_s16(y[i].bsums + 8));
memcpy(utmp, x[i].scales, 12);
uint32x2_t mins8 = { 0 };
mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[0] &= kmask1;
const int16x8_t mins = vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(mins8)));
const int32x4_t prod = vaddq_s32(vmull_s16(vget_low_s16 (q8sums), vget_low_s16 (mins)),
vmull_s16(vget_high_s16(q8sums), vget_high_s16(mins)));
sumf -= dmin * vaddvq_s32(prod);
const uint8_t * scales = (const uint8_t *)utmp;
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
int32_t sumi1 = 0;
int32_t sumi2 = 0;
for (int j = 0; j < QK_K/64; ++j) {
const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
sumi1 += vaddvq_s32(p1) * scales[2*j+0];
q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
sumi2 += vaddvq_s32(p2) * scales[2*j+1];
}
sumf += d * (sumi1 + sumi2);
}
*s = sumf;
#elif defined(__wasm_simd128__)
// WASM SIMD128 implementation
const uint8_t * scales = (const uint8_t*)&utmp[0];
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = y[i].d * GGML_FP16_TO_FP32(x[i].dmin); // Corrected sign
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
// Process scales and mins
memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
const uint32_t uaux = utmp[1] & kmask1;
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[2] = uaux;
utmp[0] &= kmask1;
// Sum mins * q8sums
int32_t sumi = 0;
const int16_t * restrict q8sums = y[i].bsums;
const uint8_t * m = (const uint8_t *)&utmp[2];
for (int j = 0; j < 16; j += 2) {
sumi += (q8sums[j] + q8sums[j+1]) * m[j/2];
}
sumf -= dmin * sumi;
int32_t sumi1 = 0;
int32_t sumi2 = 0;
for (int j = 0; j < QK_K/64; ++j) {
// Load 64 4-bit weights (32 bytes)
const v128_t q4x0 = wasm_v128_load(q4);
const v128_t q4x1 = wasm_v128_load(q4 + 16);
q4 += 32;
// Split into low/high nibbles
const v128_t q4l0 = wasm_v128_and(q4x0, wasm_i8x16_splat(0x0F));
const v128_t q4h0 = wasm_u8x16_shr(q4x0, 4);
const v128_t q4l1 = wasm_v128_and(q4x1, wasm_i8x16_splat(0x0F));
const v128_t q4h1 = wasm_u8x16_shr(q4x1, 4);
// Load 64 8-bit values (64 bytes)
const v128_t q8x0 = wasm_v128_load(q8);
const v128_t q8x1 = wasm_v128_load(q8 + 16);
const v128_t q8x2 = wasm_v128_load(q8 + 32);
const v128_t q8x3 = wasm_v128_load(q8 + 48);
q8 += 64;
// Low nibble products
v128_t vacc1 = wasm_i32x4_dot_i16x8(
wasm_i16x8_extend_low_i8x16(q4l0),
wasm_i16x8_extend_low_i8x16(q8x0)
);
vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
wasm_i16x8_extend_high_i8x16(q4l0),
wasm_i16x8_extend_high_i8x16(q8x0)
));
vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
wasm_i16x8_extend_low_i8x16(q4l1),
wasm_i16x8_extend_low_i8x16(q8x1)
));
vacc1 = wasm_i32x4_add(vacc1, wasm_i32x4_dot_i16x8(
wasm_i16x8_extend_high_i8x16(q4l1),
wasm_i16x8_extend_high_i8x16(q8x1)
));
// High nibble products
v128_t vacc2 = wasm_i32x4_dot_i16x8(
wasm_i16x8_extend_low_i8x16(q4h0),
wasm_i16x8_extend_low_i8x16(q8x2)
);
vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
wasm_i16x8_extend_high_i8x16(q4h0),
wasm_i16x8_extend_high_i8x16(q8x2)
));
vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
wasm_i16x8_extend_low_i8x16(q4h1),
wasm_i16x8_extend_low_i8x16(q8x3)
));
vacc2 = wasm_i32x4_add(vacc2, wasm_i32x4_dot_i16x8(
wasm_i16x8_extend_high_i8x16(q4h1),
wasm_i16x8_extend_high_i8x16(q8x3)
));
// Accumulate scaled results
int32_t vacc1_sum = wasm_i32x4_extract_lane(vacc1, 0) + wasm_i32x4_extract_lane(vacc1, 1) +
wasm_i32x4_extract_lane(vacc1, 2) + wasm_i32x4_extract_lane(vacc1, 3);
sumi1 += vacc1_sum * scales[2*j];
int32_t vacc2_sum = wasm_i32x4_extract_lane(vacc2, 0) + wasm_i32x4_extract_lane(vacc2, 1) +
wasm_i32x4_extract_lane(vacc2, 2) + wasm_i32x4_extract_lane(vacc2, 3);
sumi2 += vacc2_sum * scales[2*j+1];
}
sumf += d * (sumi1 + sumi2);
}
*s = sumf;
#elif defined __AVX__
const __m128i m4 = _mm_set1_epi8(0xF);
const __m128i m2 = _mm_set1_epi8(0x2);
__m256 acc = _mm256_setzero_ps();
__m128 acc_m = _mm_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
const uint32_t uaux = utmp[1] & kmask1;
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[2] = uaux;
utmp[0] &= kmask1;
const __m128i utmps = _mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0]);
const __m128i scales = _mm_cvtepu8_epi16(utmps);
const __m128i mins = _mm_cvtepu8_epi16(_mm_unpackhi_epi64(utmps, utmps));
const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)&y[i].bsums[0]);
const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)&y[i].bsums[8]);
const __m128i q8s = _mm_hadd_epi16(q8sums_0, q8sums_1);
const __m128i prod = _mm_madd_epi16(mins, q8s);
acc_m = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(dmin), _mm_cvtepi32_ps(prod)), acc_m);
__m128i sumi_0 = _mm_setzero_si128();
__m128i sumi_1 = _mm_setzero_si128();
__m128i shuffle = _mm_set1_epi16(0x0100);
for (int j = 0; j < QK_K/64; ++j) {
const __m128i scale_l = _mm_shuffle_epi8(scales, shuffle);
shuffle = _mm_add_epi16(shuffle, m2);
const __m128i scale_h = _mm_shuffle_epi8(scales, shuffle);
shuffle = _mm_add_epi16(shuffle, m2);
__m128i q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
const __m128i q4l_0 = _mm_and_si128(q4bits, m4);
const __m128i q4h_0 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
q4bits = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
const __m128i q4l_1 = _mm_and_si128(q4bits, m4);
const __m128i q4h_1 = _mm_and_si128(_mm_srli_epi16(q4bits, 4), m4);
const __m128i q8l_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
__m128i p16l = _mm_maddubs_epi16(q4l_0, q8l_0);
p16l = _mm_madd_epi16(scale_l, p16l);
sumi_0 = _mm_add_epi32(sumi_0, p16l);
const __m128i q8l_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
p16l = _mm_maddubs_epi16(q4l_1, q8l_1);
p16l = _mm_madd_epi16(scale_l, p16l);
sumi_1 = _mm_add_epi32(sumi_1, p16l);
const __m128i q8h_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
__m128i p16h = _mm_maddubs_epi16(q4h_0, q8h_0);
p16h = _mm_madd_epi16(scale_h, p16h);
sumi_0 = _mm_add_epi32(sumi_0, p16h);
const __m128i q8h_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
p16h = _mm_maddubs_epi16(q4h_1, q8h_1);
p16h = _mm_madd_epi16(scale_h, p16h);
sumi_1 = _mm_add_epi32(sumi_1, p16h);
}
__m256 vd = _mm256_set1_ps(d);
__m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
acc = _mm256_add_ps(_mm256_mul_ps(vd, _mm256_cvtepi32_ps(sumi)), acc);
}
acc_m = _mm_add_ps(acc_m, _mm_movehl_ps(acc_m, acc_m));
acc_m = _mm_add_ss(acc_m, _mm_movehdup_ps(acc_m));
*s = hsum_float_8(acc) + _mm_cvtss_f32(acc_m);
#else
const uint8_t * scales = (const uint8_t*)&utmp[0];
const uint8_t * mins = (const uint8_t*)&utmp[2];
int8_t aux8[QK_K];
int16_t aux16[8];
float sums [8];
int32_t aux32[8];
memset(sums, 0, 8*sizeof(float));
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const uint8_t * restrict q4 = x[i].qs;
const int8_t * restrict q8 = y[i].qs;
memset(aux32, 0, 8*sizeof(int32_t));
int8_t * restrict a = aux8;
for (int j = 0; j < QK_K/64; ++j) {
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF);
a += 32;
for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] >> 4);
a += 32; q4 += 32;
}
memcpy(utmp, x[i].scales, 12);
utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
const uint32_t uaux = utmp[1] & kmask1;
utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
utmp[2] = uaux;
utmp[0] &= kmask1;
int sumi = 0;
for (int j = 0; j < QK_K/16; ++j) sumi += y[i].bsums[j] * mins[j/2];
a = aux8;
int is = 0;
for (int j = 0; j < QK_K/32; ++j) {
int32_t scale = scales[is++];
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d;
sumf -= dmin * sumi;
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
*s = sumf;
#endif
}
Here is a function. You need to convert it to WASM SIMD.
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
assert(n % QK_K == 0);
assert(nrc == 1);
UNUSED(nrc);
UNUSED(bx);
UNUSED(by);
UNUSED(bs);
const block_q6_K * restrict x = vx;
const block_q8_K * restrict y = vy;
const int nb = n / QK_K;
#ifdef __ARM_NEON
float sum = 0;
const uint8x16_t m4b = vdupq_n_u8(0xF);
const int32x4_t vzero = vdupq_n_s32(0);
//const int8x16_t m32s = vdupq_n_s8(32);
const uint8x16_t mone = vdupq_n_u8(3);
ggml_int8x16x4_t q6bytes;
ggml_uint8x16x4_t q6h;
for (int i = 0; i < nb; ++i) {
const float d_all = GGML_FP16_TO_FP32(x[i].d);
const uint8_t * restrict q6 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
const int8_t * restrict q8 = y[i].qs;
const int8_t * restrict scale = x[i].scales;
const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
const int8x16_t scales = vld1q_s8(scale);
const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};
const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[1]), vget_low_s16 (q6scales.val[1])),
vmull_s16(vget_high_s16(q8sums.val[1]), vget_high_s16(q6scales.val[1]))));
int32_t isum_mins = vaddvq_s32(prod);
int32_t isum = 0;
for (int j = 0; j < QK_K/128; ++j) {
ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); qh += 32;
ggml_uint8x16x4_t q6bits = ggml_vld1q_u8_x4(q6); q6 += 64;
ggml_int8x16x4_t q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
q6h.val[0] = vshlq_n_u8(vandq_u8(mone, qhbits.val[0]), 4);
q6h.val[1] = vshlq_n_u8(vandq_u8(mone, qhbits.val[1]), 4);
uint8x16_t shifted = vshrq_n_u8(qhbits.val[0], 2);
q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
shifted = vshrq_n_u8(qhbits.val[1], 2);
q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[0], m4b), q6h.val[0]));
q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[1], m4b), q6h.val[1]));
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
scale += 4;
q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;
shifted = vshrq_n_u8(qhbits.val[0], 4);
q6h.val[0] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
shifted = vshrq_n_u8(qhbits.val[1], 4);
q6h.val[1] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
shifted = vshrq_n_u8(qhbits.val[0], 6);
q6h.val[2] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
shifted = vshrq_n_u8(qhbits.val[1], 6);
q6h.val[3] = vshlq_n_u8(vandq_u8(mone, shifted), 4);
q6bytes.val[0] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[0]));
q6bytes.val[1] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[1]));
q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
scale += 4;
}
//sum += isum * d_all * y[i].d;
sum += d_all * y[i].d * (isum - 32 * isum_mins);
}
*s = sum;
#elif defined __AVX__
const __m128i m3 = _mm_set1_epi8(3);
const __m128i m15 = _mm_set1_epi8(15);
__m256 acc = _mm256_setzero_ps();
for (int i = 0; i < nb; ++i) {
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
const uint8_t * restrict q4 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
const int8_t * restrict q8 = y[i].qs;
// handle the q6_k -32 offset separately using bsums
const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5);
const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
__m128i sumi_0 = _mm_setzero_si128();
__m128i sumi_1 = _mm_setzero_si128();
int is = 0;
for (int j = 0; j < QK_K/128; ++j) {
const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
const __m128i q4bitsH_1 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
__m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
__m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
__m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
__m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
__m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
__m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
__m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
__m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
is += 4;
p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
}
sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
}
*s = hsum_float_8(acc);
#else
int8_t aux8[QK_K];
int16_t aux16[8];
float sums [8];
int32_t aux32[8];
memset(sums, 0, 8*sizeof(float));
float sumf = 0;
for (int i = 0; i < nb; ++i) {
const uint8_t * restrict q4 = x[i].ql;
const uint8_t * restrict qh = x[i].qh;
const int8_t * restrict q8 = y[i].qs;
memset(aux32, 0, 8*sizeof(int32_t));
int8_t * restrict a = aux8;
for (int j = 0; j < QK_K; j += 128) {
for (int l = 0; l < 32; ++l) {
a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32;
a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32;
a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32;
a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32;
}
a += 128;
q4 += 64;
qh += 32;
}
a = aux8;
int is = 0;
for (int j = 0; j < QK_K/16; ++j) {
int scale = x[i].scales[is++];
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l];
for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l];
q8 += 8; a += 8;
}
const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d;
for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l];
}
for (int l = 0; l < 8; ++l) sumf += sums[l];
*s = sumf;
#endif
}
You must start your code with #elif defined(__wasm_simd128__)
To think about it, you need to take into account both the refenrence code from ARM NEON and AVX implementation.
DeepSeek evaluation of the converted code.
The performance gains from converting the ggml_vec_dot_q8_0_q8_0 function to use WASM SIMD (versus a scalar fallback in WebAssembly) are expected to be significant, with potential improvements of 4–8× for the SIMD-optimized portion of the code. Here’s a breakdown of the factors and trade-offs:
Key Performance Drivers
SIMD Parallelism:
Each 128-bit WASM SIMD register processes 16 int8 elements at once, reducing the number of instructions needed for arithmetic.
The i32x4.dot_i16x8 instruction computes 4 accumulated 32-bit dot products in parallel, replacing 32 scalar operations per block of 16 elements.
Memory Efficiency:
Contiguous 128-bit loads (wasm_v128_load) align with SIMD-friendly memory access patterns, minimizing overhead compared to scalar byte-wise loads.
Reduced Loop Overhead:
The SIMD version processes 32 elements per iteration (two 128-bit registers for x and y), drastically reducing loop iterations compared to scalar code (which processes 1 element per iteration).
Comparison to ARM NEON
NEON (original code): Uses vdotq_s32 for direct 8-bit dot products and unrolls loops (2 blocks per iteration), achieving high throughput.
WASM SIMD (converted code):
Requires 8-bit → 16-bit extension (extend_low/high_i8x16) before using dot_i16x8, adding slight overhead.
Processes 1 block per iteration (vs. 2 in NEON), increasing loop overhead slightly.
Despite this, WASM SIMD still achieves close-to-NEON performance in practice for WebAssembly targets.
Expected Gains
Scalar Fallback (WebAssembly):
Without SIMD, scalar code requires 32 multiply-adds + loop overhead per block.
SIMD reduces this to ~4–6 vector instructions per block, yielding 4–8× speedups for the SIMD-optimized loop.
Real-World Impact:
For large tensors (common in LLMs), the SIMD-optimized loop will dominate runtime, leading to near-linear scaling with SIMD width.
Example: A 512-element dot product would see ~16× fewer loop iterations in SIMD vs. scalar.
Trade-Offs
Loop Unrolling:
The NEON version unrolls loops (2 blocks/iteration), but the WASM code processes 1 block/iteration. This slightly increases loop overhead but simplifies code maintenance.
8→16-Bit Extension:
WASM lacks direct 8-bit dot product instructions, requiring extension to 16-bit. This adds instructions but is still far faster than scalar.
Code-Specific Optimizations
The provided WASM SIMD code:
Uses wasm_f32x4_mul + wasm_f32x4_add to mimic NEON’s vmlaq_n_f32 (fused multiply-add).
Efficiently reduces partial sums with wasm_f32x4_extract_lane, minimizing horizontal addition overhead.
Conclusion
The WASM SIMD version should deliver 4–8× faster execution compared to the scalar fallback in WebAssembly, depending on runtime optimizations (e.g., V8 TurboFan). While slightly less efficient than ARM NEON due to 8→16-bit extension and loop unrolling differences, it brings WebAssembly performance much closer to native code for quantized LLM inference. For best results, ensure the runtime enables SIMD support (e.g., --experimental-wasm-simd in Node.js).