-
-
Save ochafik/aeda893dec73c2181a88752022f129e5 to your computer and use it in GitHub Desktop.
llama.cpp + RPi OpenCL (clvk --> Vulkan)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| TODO | |
| - Check in the unmodified .cl file | |
| - Update all const int to const size_t (w/ modulos if needed) | |
| - Polyfill for cl_khr_fp16 (vload_half, half defines) tested on MacOS X | |
| - https://gist.github.com/milhidaka/95863906fe828198f47991c813dbe233 | |
| - https://github.com/AcademySoftwareFoundation/Imath/blob/490be7e74e713b119895547a9d78a616ed479ceb/src/Imath/half.h#L285 | |
| - https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion/60047308#60047308 | |
| - FAST MATH detection for faster conversion! | |
| - Functions to get all block_* fields out of array of uint32_t (for RPi), and turn vload_half into HALF_TO_FLOAT(uint32_t) | |
| - Bump https://github.com/ggerganov/llama.cpp/issues/1571 | |
| - File bug for clspv: '__kernel void bug(__global ushort* x, __global float* y) { y[get_global_id(0)] = x[get_global_id(0)]; }' | |
| */ | |
| // #define CLSPV_WORKAROUNDS 1 | |
| // #ifdef cl_khr_fp16 | |
| // #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |
| // #else | |
| // #define half uint16_t | |
| // #endif // cl_khr_fp16 | |
| typedef char int8_t; | |
| typedef uchar uint8_t; | |
| typedef short int16_t; | |
| typedef ushort uint16_t; | |
| typedef int int32_t; | |
| typedef uint uint32_t; | |
| // https://github.com/AcademySoftwareFoundation/Imath/blob/490be7e74e713b119895547a9d78a616ed479ceb/src/Imath/half.h#L285 | |
| // https://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion/60047308#60047308 | |
| static float half_to_float(uint32_t h) { | |
| union { uint32_t i; float f; } v; | |
| uint32_t hexpmant = ((uint32_t) (h) << 17) >> 4; | |
| v.i = ((uint32_t) (h >> 15)) << 31; | |
| if (hexpmant >= 0x00800000) | |
| { | |
| v.i |= hexpmant; | |
| // either we are a normal number, in which case add in the bias difference | |
| // otherwise make sure all exponent bits are set | |
| if (hexpmant < 0x0f800000) | |
| v.i += 0x38000000; | |
| else | |
| v.i |= 0x7f800000; | |
| } | |
| else if (hexpmant != 0) | |
| { | |
| // exponent is 0 because we're denormal, don't have to extract | |
| // the mantissa, can just use as is | |
| uint32_t lc = clz(hexpmant) - 8; | |
| // so nominally we want to remove that extra bit we shifted | |
| // up, but we are going to add that bit back in, then subtract | |
| // from it with the 0x38800000 - (lc << 23).... | |
| // | |
| // by combining, this allows us to skip the & operation (and | |
| // remove a constant) | |
| // | |
| // hexpmant &= ~0x00800000; | |
| v.i |= 0x38800000; | |
| // lc is now x, where the desired exponent is then | |
| // -14 - lc | |
| // + 127 -> new exponent | |
| v.i |= (hexpmant << lc); | |
| v.i -= (lc << 23); | |
| } | |
| // if (true) return 12.3f; | |
| return v.f; | |
| } | |
| #ifdef CLSPV_WORKAROUNDS | |
// Helpers for pulling bytes / bit-fields out of 32-bit words so that packed
// data can be read through a uint32_t pointer.
//
// "FIRST" bits are the ones at the lowest byte address of the word, "LAST"
// bits the ones at the highest byte address. Which end of the integer value
// that corresponds to depends on the device endianness.
//
// UCHAR_FROM_UINT(x, offset)   byte at byte-offset 0..3 inside word x
// USHORT_FROM_UINT(x, offset)  16-bit value at byte-offset 0..2 inside x
//                              (offset 3 straddles two words and yields 0)
// FIRST_BITS_FROM_UINT(x, n)   the n bits of x with the lowest addresses (1 <= n <= 32)
// LAST_BITS_FROM_UINT(x, n)    the n bits of x with the highest addresses (1 <= n <= 32)
// CONCAT_BITS(x, xbits, y, ybits)
//                              value whose memory image is x (xbits wide)
//                              followed by y (ybits wide)

// Unsigned mask with the `bits` low bits set; avoids shifting a signed 1 and
// the undefined 32-bit shift.
#define CLSPV_LOW_MASK(bits) ((bits) >= 32 ? 0xFFFFFFFFu : ((1u << (bits)) - 1u))

#ifdef __ENDIAN_LITTLE__
#define UCHAR_FROM_UINT(x, offset) ((offset) == 0 ? ((x) & 0xFF) : (offset) == 1 ? (((x) >> 8) & 0xFF) : (offset) == 2 ? (((x) >> 16) & 0xFF) : (((x) >> 24)))
#define USHORT_FROM_UINT(x, offset) ((offset) == 0 ? ((x) & 0xFFFF) : (offset) == 1 ? (((x) >> 8) & 0xFFFF) : (offset) == 2 ? ((x) >> 16) : 0)
// Lowest addresses are the least significant bits.
#define FIRST_BITS_FROM_UINT(x, bits) ((x) & CLSPV_LOW_MASK(bits))
// Highest addresses are the most significant bits: shift them down.
#define LAST_BITS_FROM_UINT(x, bits) (((x) >> (32 - (bits))) & CLSPV_LOW_MASK(bits))
// x comes first in memory, so it is the low part; y sits above x's xbits.
#define CONCAT_BITS(x, xbits, y, ybits) ((x) | ((y) << (xbits)))
#else
#define UCHAR_FROM_UINT(x, offset) ((offset) == 0 ? (((x) >> 24)) : (offset) == 1 ? (((x) >> 16) & 0xFF) : (offset) == 2 ? (((x) >> 8) & 0xFF) : ((x) & 0xFF))
#define USHORT_FROM_UINT(x, offset) ((offset) == 0 ? ((x) >> 16) : (offset) == 1 ? (((x) >> 8) & 0xFFFF) : (offset) == 2 ? ((x) & 0xFFFF) : 0)
// Highest addresses are the least significant bits.
#define LAST_BITS_FROM_UINT(x, bits) ((x) & CLSPV_LOW_MASK(bits))
// Lowest addresses are the most significant bits: shift them down.
#define FIRST_BITS_FROM_UINT(x, bits) (((x) >> (32 - (bits))) & CLSPV_LOW_MASK(bits))
// x comes first in memory, so it is the high part; make room for y's ybits.
#define CONCAT_BITS(x, xbits, y, ybits) (((x) << (ybits)) | (y))
#endif
| static uint32_t get_uint32(__global const uint32_t* x, size_t offset) { | |
| const uint8_t m = offset % 4; | |
| const size_t i = offset / 4; | |
| const uint32_t x0 = x[i]; | |
| if (m == 0) { | |
| return x0; | |
| } | |
| const uint32_t x1 = x[i + 1]; | |
| const uint8_t b1 = m * 8; | |
| const uint8_t b0 = 32 - b1; | |
| return CONCAT_BITS(LAST_BITS_FROM_UINT(x0, b0), b0, FIRST_BITS_FROM_UINT(x1, b1), b1); | |
| } | |
| static uint16_t get_uint16(__global const uint32_t* x, size_t offset) { | |
| const uint8_t m = offset % 4; | |
| const size_t i = offset / 4; | |
| const uint32_t x0 = x[i]; | |
| if (m < 3) { | |
| return USHORT_FROM_UINT(x0, m); | |
| } | |
| const uint32_t x1 = x[i + 1]; | |
| return CONCAT_BITS(LAST_BITS_FROM_UINT(x0, 8), 8, FIRST_BITS_FROM_UINT(x1, 8), 8); | |
| } | |
| static float get_half(__global const uint32_t* x, size_t offset) { | |
| return half_to_float(get_uint16(x, offset)); | |
| } | |
| static uint8_t get_uint8(__global const uint32_t* x, size_t offset) { | |
| const uint8_t m = offset % 4; | |
| const size_t i = offset / 4; | |
| const uint32_t x0 = x[i]; | |
| return UCHAR_FROM_UINT(x0, m); | |
| } | |
| static int8_t get_int8(__global const uint32_t* x, size_t offset) { | |
| return (int8_t)get_uint8(x, offset); | |
| } | |
| #endif // CLSPV_WORKAROUNDS | |
| struct __attribute__ ((packed)) block_q4_0 | |
| { | |
| half d; | |
| uint8_t qs[QK4_0 / 2]; | |
| }; | |
| #ifndef CLSPV_WORKAROUNDS | |
| #define BLOCK_Q4_0_D(p, i) vload_half(0, &p[i].d) | |
| #define BLOCK_Q4_0_QS(p, i, iQ) p[i].qs[iQ] | |
| #else | |
| #define BLOCK_Q4_0_D(p, i) get_half (p, (i) * sizeof(struct block_q4_0)) | |
| #define BLOCK_Q4_0_QS(p, i, iQ) get_uint8 (p, (i) * sizeof(struct block_q4_0) + 2 + (iQ)) | |
| #endif | |
| struct __attribute__ ((packed)) block_q4_1 | |
| { | |
| half d; | |
| half m; | |
| uint8_t qs[QK4_1 / 2]; | |
| }; | |
| #ifndef CLSPV_WORKAROUNDS | |
| #define BLOCK_Q4_1_D(p, i) vload_half(0, &p[i].d) | |
| #define BLOCK_Q4_1_M(p, i) vload_half(0, &p[i].m) | |
| #define BLOCK_Q4_1_QS(p, i, iQ) p[i].qs[iQ] | |
| #else | |
| #define BLOCK_Q4_1_D(p, i) get_half (p, (i) * sizeof(struct block_q4_1)) | |
| #define BLOCK_Q4_1_M(p, i) get_half (p, (i) * sizeof(struct block_q4_1) + 2) | |
| #define BLOCK_Q4_1_QS(p, i, iQ) get_uint8 (p, (i) * sizeof(struct block_q4_1) + 4 + (iQ)) | |
| #endif | |
| struct __attribute__ ((packed)) block_q5_0 | |
| { | |
| half d; | |
| uint32_t qh; | |
| uint8_t qs[QK5_0 / 2]; | |
| }; | |
| #ifndef CLSPV_WORKAROUNDS | |
| #define BLOCK_Q5_0_D(p, i) vload_half(0, &p[i].d) | |
| #define BLOCK_Q5_0_QH(p, i) p[i].qh | |
| #define BLOCK_Q5_0_QS(p, i, iQ) p[i].qs[iQ] | |
| #else | |
| #define BLOCK_Q5_0_D(p, i) get_half (p, (i) * sizeof(struct block_q5_0)) | |
| #define BLOCK_Q5_0_QH(p, i) get_uint32(p, (i) * sizeof(struct block_q5_0) + 2) | |
| #define BLOCK_Q5_0_QS(p, i, iQ) get_uint8 (p, (i) * sizeof(struct block_q5_0) + 2 + 4 + (iQ)) | |
| #endif | |
| struct __attribute__ ((packed)) block_q5_1 | |
| { | |
| half d; | |
| half m; | |
| uint32_t qh; | |
| uint8_t qs[QK5_1 / 2]; | |
| }; | |
| #ifndef CLSPV_WORKAROUNDS | |
| #define BLOCK_Q5_1_D(p, i) vload_half(0, &p[i].d) | |
| #define BLOCK_Q5_1_M(p, i) vload_half(0, &p[i].m) | |
| #define BLOCK_Q5_1_QH(p, i) p[i].qh | |
| #define BLOCK_Q5_1_QS(p, i, iQ) p[i].qs[iQ] | |
| #else | |
| #define BLOCK_Q5_1_D(p, i) get_half (p, (i) * sizeof(struct block_q5_1)) | |
| #define BLOCK_Q5_1_M(p, i) get_half (p, (i) * sizeof(struct block_q5_1) + 2) | |
| #define BLOCK_Q5_1_QH(p, i) get_uint32(p, (i) * sizeof(struct block_q5_1) + 2 + 2) | |
| #define BLOCK_Q5_1_QS(p, i, iQ) get_uint8 (p, (i) * sizeof(struct block_q5_1) + 2 + 2 + 4, (iQ)) | |
| #endif | |
| struct __attribute__ ((packed)) block_q8_0 | |
| { | |
| half d; | |
| int8_t qs[QK8_0]; | |
| }; | |
| #ifndef CLSPV_WORKAROUNDS | |
| #define BLOCK_Q8_0_D(p, i) vload_half(0, &p[i].d) | |
| #define BLOCK_Q8_0_QS(p, i, iQ) p[i].qs[iQ] | |
| #else | |
| #define BLOCK_Q8_0_D(p, i) get_half (p, (i) * sizeof(struct block_q8_0)) | |
| #define BLOCK_Q8_0_QS(p, i, iQ) get_int8 (p, (i) * sizeof(struct block_q8_0) + 2 + (iQ)) | |
| #endif | |
| struct __attribute__((packed)) block_q2_K | |
| { | |
| uint8_t scales[16]; | |
| uint8_t qs[64]; | |
| half d; | |
| half dmin; | |
| }; | |
| #ifndef CLSPV_WORKAROUNDS | |
| #define BLOCK_Q2_K_SCALES(p, i, iScale) p[i].scales[iScale] | |
| #define BLOCK_Q2_K_QS(p, i, iQ) p[i].qs[iQ] | |
| #define BLOCK_Q2_K_D(p, i) vload_half(0, &p[i].d) | |
| #define BLOCK_Q2_K_DMIN(p, i) vload_half(0, &p[i].dmin) | |
| #else | |
| #define BLOCK_Q2_K_SCALES(p, i, iScale) get_uint8 (p, (i) * sizeof(struct block_q2_K) + (iScale)) | |
| #define BLOCK_Q2_K_QS(p, i, iQ) get_uint8 (p, (i) * sizeof(struct block_q2_K) + 16 + (iQ)) | |
| #define BLOCK_Q2_K_D(p, i) get_half (p, (i) * sizeof(struct block_q2_K) + 16 + 64) | |
| #define BLOCK_Q2_K_DMIN(p, i) get_half (p, (i) * sizeof(struct block_q2_K) + 16 + 64 + 2) | |
| #endif | |
| struct __attribute__((packed)) block_q3_K | |
| { | |
| uint8_t hmask[32]; | |
| uint8_t qs[64]; | |
| uint8_t scales[12]; | |
| half d; | |
| }; | |
| #ifndef CLSPV_WORKAROUNDS | |
| #define BLOCK_Q3_K_HMASK(p, i, iM) p[i].hmask[iM] | |
| #define BLOCK_Q3_K_QS(p, i, iQ) p[i].qs[iQ] | |
| #define BLOCK_Q3_K_SCALES(p, i, iScale) p[i].scales[iScale] | |
| #define BLOCK_Q3_K_D(p, i) vload_half(0, &p[i].d) | |
| #else | |
| #define BLOCK_Q3_K_HMASK(p, i, iM) get_uint8 (p, (i) * sizeof(struct block_q3_K) + iM) | |
| #define BLOCK_Q3_K_QS(p, i, iQ) get_uint8 (p, (i) * sizeof(struct block_q3_K) + 32 + (iQ)) | |
| #define BLOCK_Q3_K_SCALES(p, i, iScale) get_uint8 (p, (i) * sizeof(struct block_q3_K) + 32 + 64 + (iScale)) | |
| #define BLOCK_Q3_K_D(p, i) get_half (p, (i) * sizeof(struct block_q3_K) + 32 + 64 + 12) | |
| #endif | |
| struct __attribute__((packed)) block_q4_K | |
| { | |
| half d; | |
| half dmin; | |
| uint8_t scales[12]; | |
| uint8_t qs[128]; | |
| }; | |
| #ifndef CLSPV_WORKAROUNDS | |
| #define BLOCK_Q4_K_D(p, i) vload_half(0, &p[i].d) | |
| #define BLOCK_Q4_K_DMIN(p, i) vload_half(0, &p[i].dmin) | |
| #define BLOCK_Q4_K_SCALES(p, i, iScale) p[i].scales[iScale] | |
| #define BLOCK_Q4_K_QS(p, i, iQ) p[i].qs[iQ] | |
| #else | |
| #define BLOCK_Q4_K_D(p, i) get_half (p, (i) * sizeof(struct block_q4_K)) | |
| #define BLOCK_Q4_K_DMIN(p, i) get_half (p, (i) * sizeof(struct block_q4_K) + 2) | |
| #define BLOCK_Q4_K_SCALES(p, i, iScale) get_uint8 (p, (i) * sizeof(struct block_q4_K) + 2 + 2 + (iScale)) | |
| #define BLOCK_Q4_K_QS(p, i, iQ) get_uint8 (p, (i) * sizeof(struct block_q4_K) + 2 + 2 + 12 + (iQ)) | |
| #endif | |
| struct __attribute__((packed)) block_q5_K | |
| { | |
| half d; | |
| half dmin; | |
| uint8_t scales[12]; | |
| uint8_t qh[32]; | |
| uint8_t qs[128]; | |
| }; | |
| #ifndef CLSPV_WORKAROUNDS | |
| #define BLOCK_Q5_K_D(p, i) vload_half(0, &p[i].d) | |
| #define BLOCK_Q5_K_DMIN(p, i) vload_half(0, &p[i].dmin) | |
| #define BLOCK_Q5_K_SCALES(p, i, iScale) p[i].scales[iScale] | |
| #define BLOCK_Q5_K_QH(p, i, iQ) p[i].qh[iQ] | |
| #define BLOCK_Q5_K_QS(p, i, iQ) p[i].qs[iQ] | |
| #else | |
| #define BLOCK_Q5_K_D(p, i) get_half (p, (i) * sizeof(struct block_q5_K)) | |
| #define BLOCK_Q5_K_DMIN(p, i) get_half (p, (i) * sizeof(struct block_q5_K) + 2) | |
| #define BLOCK_Q5_K_SCALES(p, i, iScale) get_uint8 (p, (i) * sizeof(struct block_q5_K) + 2 + 2 + (iScale)) | |
| #define BLOCK_Q5_K_QH(p, i, iQ) get_uint8 (p, (i) * sizeof(struct block_q5_K) + 2 + 2 + 12 + (iQ)) | |
| #define BLOCK_Q5_K_QS(p, i, iQ) get_uint8 (p, (i) * sizeof(struct block_q5_K) + 2 + 2 + 12 + 32 + (iQ)) | |
| #endif | |
| struct __attribute__((packed)) block_q6_K | |
| { | |
| uint8_t ql[128]; | |
| uint8_t qh[64]; | |
| int8_t scales[16]; | |
| half d; | |
| }; | |
| #ifndef CLSPV_WORKAROUNDS | |
| #define BLOCK_Q6_K_QL(p, i, iQ) p[i].ql[iQ] | |
| #define BLOCK_Q6_K_QH(p, i, iQ) p[i].qh[iQ] | |
| #define BLOCK_Q6_K_SCALES(p, i, iScale) p[i].scales[iScale] | |
| #define BLOCK_Q6_K_D(p, i) vload_half(0, &p[i].d) | |
| #else | |
| #define BLOCK_Q6_K_QL(p, i, iQ) get_uint8 (p, (i) * sizeof(struct block_q6_K) + (iQ)) | |
| #define BLOCK_Q6_K_QH(p, i, iQ) get_uint8 (p, (i) * sizeof(struct block_q6_K) + 128 + (iQ)) | |
| #define BLOCK_Q6_K_SCALES(p, i, iScale) get_int8 (p, (i) * sizeof(struct block_q6_K) + 128 + 64 + (iScale)) | |
| #define BLOCK_Q6_K_D(p, i) get_half (p, (i) * sizeof(struct block_q6_K) + 128 + 64 + 16) | |
| #endif | |
| #ifndef CLSPV_WORKAROUNDS | |
| __kernel void convert_fp16_to_fp32(__global half* x, __global float* y) { | |
| const uint i = get_global_id(0); | |
| y[i] = vload_half(0, &x[i]); | |
| } | |
| #else | |
| __kernel void convert_fp16_to_fp32(__global uint32_t* x, __global float* y) { | |
| const uint i = get_global_id(0); | |
| y[i] = get_half(x, i * 2); | |
| } | |
| #endif | |
| static void dequantize_q4_0(__global const struct block_q4_0* x, const int ib, const int iqs, float* v0, float* v1) { | |
| const float d = BLOCK_Q4_0_D(x, ib); | |
| const uint8_t vui = BLOCK_Q4_0_QS(x, ib, iqs); | |
| const int8_t vi0 = vui & 0xF; | |
| const int8_t vi1 = vui >> 4; | |
| *v0 = (vi0 - 8)*d; | |
| *v1 = (vi1 - 8)*d; | |
| } | |
| static void dequantize_q4_1(__global const struct block_q4_1* x, const int ib, const int iqs, float* v0, float* v1) { | |
| const float d = BLOCK_Q4_1_D(x, ib); | |
| const float m = BLOCK_Q4_1_M(x, ib); | |
| const uint8_t vui = BLOCK_Q4_1_QS(x, ib, iqs); | |
| const int8_t vi0 = vui & 0xF; | |
| const int8_t vi1 = vui >> 4; | |
| *v0 = vi0*d + m; | |
| *v1 = vi1*d + m; | |
| } | |
| static void dequantize_q5_0(__global const struct block_q5_0* x, const int ib, const int iqs, float* v0, float* v1) { | |
| const float d = BLOCK_Q5_0_D(x, ib); | |
| uint32_t qh = BLOCK_Q5_0_QH(x, ib); | |
| const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; | |
| const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; | |
| const int32_t x0 = ((BLOCK_Q5_0_QS(x, ib, iqs) & 0xf) | xh_0) - 16; | |
| const int32_t x1 = ((BLOCK_Q5_0_QS(x, ib, iqs) >> 4) | xh_1) - 16; | |
| *v0 = x0*d; | |
| *v1 = x1*d; | |
| } | |
| static void dequantize_q5_1(__global const struct block_q5_1* x, const int ib, const int iqs, float* v0, float* v1) { | |
| const float d = BLOCK_Q5_1_D(x, ib); | |
| const float m = BLOCK_Q5_1_M(x, ib); | |
| uint32_t qh = BLOCK_Q5_1_QH(x, ib); | |
| const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; | |
| const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10; | |
| const int32_t x0 = ((BLOCK_Q5_1_QS(x, ib, iqs) & 0xf) | xh_0); | |
| const int32_t x1 = ((BLOCK_Q5_1_QS(x, ib, iqs) >> 4) | xh_1); | |
| *v0 = x0*d + m; | |
| *v1 = x1*d + m; | |
| } | |
| static void dequantize_q8_0(__global const struct block_q8_0* x, const int ib, const int iqs, float* v0, float* v1) { | |
| const float d = BLOCK_Q8_0_D(x, ib); | |
| const int8_t vi0 = x[ib].qs[iqs + 0]; | |
| const int8_t vi1 = x[ib].qs[iqs + 1]; | |
| *v0 = vi0*d; | |
| *v1 = vi1*d; | |
| } | |
| #ifndef CLSPV_WORKAROUNDS | |
| inline convert_f16(__global half* x, const int ib, const int iqs, float* v0, float* v1){ | |
| *v0 = vload_half(0, &x[ib + 0]); | |
| *v1 = vload_half(0, &x[ib + 1]); | |
| } | |
| #else | |
| inline void convert_f16(__global uint32_t* x, const int ib, const int iqs, float* v0, float* v1){ | |
| *v0 = get_half(x, 2 * (ib + 0)); | |
| *v1 = get_half(x, 2 * (ib + 1)); | |
| } | |
| #endif | |
| inline void get_scale_min_k4(int j, const __global uint8_t *q, uint8_t *d, uint8_t *m) | |
| { | |
| if (j < 4) | |
| { | |
| *d = q[j] & 63; | |
| *m = q[j + 4] & 63; | |
| } | |
| else | |
| { | |
| *d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); | |
| *m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); | |
| } | |
| } | |
| __kernel void dequantize_block_q2_K(__global const struct block_q2_K *x, __global float *yy) | |
| { | |
| const int i = get_group_id(0) + get_global_offset(0); | |
| const int tid = get_local_id(0); | |
| const int n = tid / 32; | |
| const int l = tid - 32 * n; | |
| const int is = 8 * n + l / 16; | |
| const uint8_t q = BLOCK_Q2_K_QS(x, i, 32 * n + l); | |
| __global float *y = yy + get_group_id(0) * QK_K + 128 * n; | |
| const float dall = BLOCK_Q2_K_D(x, i); | |
| const float dmin = BLOCK_Q2_K_DMIN(x, i); | |
| y[l + 0] = dall * (BLOCK_Q2_K_SCALES(x, i, is + 0) & 0xF) * ((q >> 0) & 3) - dmin * (BLOCK_Q2_K_SCALES(x, i, is + 0) >> 4); | |
| y[l + 32] = dall * (BLOCK_Q2_K_SCALES(x, i, is + 2) & 0xF) * ((q >> 2) & 3) - dmin * (BLOCK_Q2_K_SCALES(x, i, is + 2) >> 4); | |
| y[l + 64] = dall * (BLOCK_Q2_K_SCALES(x, i, is + 4) & 0xF) * ((q >> 4) & 3) - dmin * (BLOCK_Q2_K_SCALES(x, i, is + 4) >> 4); | |
| y[l + 96] = dall * (BLOCK_Q2_K_SCALES(x, i, is + 6) & 0xF) * ((q >> 6) & 3) - dmin * (BLOCK_Q2_K_SCALES(x, i, is + 6) >> 4); | |
| } | |
| __kernel void dequantize_block_q3_K(__global const struct block_q3_K *x, __global float *yy) | |
| { | |
| int r = get_local_id(0) / 4; | |
| int i = get_group_id(0) + get_global_offset(0); | |
| int tid = r / 2; | |
| int is0 = r % 2; | |
| int l0 = 16 * is0 + 4 * (get_local_id(0) % 4); | |
| int n = tid / 4; | |
| int j = tid - 4 * n; | |
| uint8_t m = 1 << (4 * n + j); | |
| int is = 8 * n + 2 * j + is0; | |
| int shift = 2 * j; | |
| int8_t us = is < 4 ? (BLOCK_Q3_K_SCALES(x, i, is - 0) & 0xF) | (((BLOCK_Q3_K_SCALES(x, i, is + 8) >> 0) & 3) << 4) | |
| : is < 8 ? (BLOCK_Q3_K_SCALES(x, i, is - 0) & 0xF) | (((BLOCK_Q3_K_SCALES(x, i, is + 4) >> 2) & 3) << 4) | |
| : is < 12 ? (BLOCK_Q3_K_SCALES(x, i, is - 8) >> 4) | (((BLOCK_Q3_K_SCALES(x, i, is + 0) >> 4) & 3) << 4) | |
| : (BLOCK_Q3_K_SCALES(x, i, is - 8) >> 4) | (((BLOCK_Q3_K_SCALES(x, i, is - 4) >> 6) & 3) << 4); | |
| float d_all = BLOCK_Q3_K_D(x, i); | |
| float dl = d_all * (us - 32); | |
| __global float *y = yy + get_group_id(0) * QK_K + 128 * n + 32 * j; | |
| const size_t qs_offset = 32 * n; | |
| for (int l = l0; l < l0 + 4; ++l) | |
| y[l] = dl * ((int8_t)((BLOCK_Q3_K_QS(x, i, qs_offset + l) >> shift) & 3) - ((BLOCK_Q3_K_HMASK(x, i, l) & m) ? 0 : 4)); | |
| } | |
| __kernel void dequantize_block_q4_K(__global const struct block_q4_K *x, __global float *yy) | |
| { | |
| const int i = get_group_id(0) + get_global_offset(0); | |
| const int tid = get_local_id(0); | |
| const int il = tid / 8; | |
| const int ir = tid % 8; | |
| const int is = 2 * il; | |
| const int n = 4; | |
| __global float *y = yy + get_group_id(0) * QK_K + 64 * il + n * ir; | |
| const float dall = BLOCK_Q4_K_D(x, i); | |
| const float dmin = BLOCK_Q4_K_DMIN(x, i); | |
| const int qs_offset = 32 * il + n * ir; | |
| uint8_t sc, m; | |
| get_scale_min_k4(is + 0, x[i].scales, &sc, &m); | |
| float d1 = dall * sc; | |
| float m1 = dmin * m; | |
| get_scale_min_k4(is + 1, x[i].scales, &sc, &m); | |
| float d2 = dall * sc; | |
| float m2 = dmin * m; | |
| for (int l = 0; l < n; ++l) | |
| { | |
| uint8_t q = BLOCK_Q4_K_QS(x, i, qs_offset + l); | |
| y[l + 0] = d1 * (q & 0xF) - m1; | |
| y[l + 32] = d2 * (q >> 4) - m2; | |
| } | |
| } | |
| __kernel void dequantize_block_q5_K(__global const struct block_q5_K *x, __global float *yy) | |
| { | |
| const int i = get_group_id(0) + get_global_offset(0); | |
| const int tid = get_local_id(0); | |
| const int il = tid / 16; | |
| const int ir = tid % 16; | |
| const int is = 2 * il; | |
| __global float *y = yy + get_group_id(0) * QK_K + 64 * il + 2 * ir; | |
| const float dall = BLOCK_Q5_K_D(x, i); | |
| const float dmin = BLOCK_Q5_K_DMIN(x, i); | |
| const int ql_offset = 32 * il + 2 * ir; | |
| const int qh_offset = 2 * ir; | |
| uint8_t sc, m; | |
| get_scale_min_k4(is + 0, x[i].scales, &sc, &m); | |
| const float d1 = dall * sc; | |
| const float m1 = dmin * m; | |
| get_scale_min_k4(is + 1, x[i].scales, &sc, &m); | |
| const float d2 = dall * sc; | |
| const float m2 = dmin * m; | |
| uint8_t hm = 1 << (2 * il); | |
| uint8_t ql0 = BLOCK_Q5_K_QS(x, i, ql_offset + 0); | |
| uint8_t ql1 = BLOCK_Q5_K_QS(x, i, ql_offset + 1); | |
| uint8_t qh0 = BLOCK_Q5_K_QH(x, i, qh_offset + 0); | |
| uint8_t qh1 = BLOCK_Q5_K_QH(x, i, qh_offset + 1); | |
| y[0] = d1 * ((ql0 & 0xF) + (qh0 & hm ? 16 : 0)) - m1; | |
| y[1] = d1 * ((ql1 & 0xF) + (qh1 & hm ? 16 : 0)) - m1; | |
| hm <<= 1; | |
| y[32] = d2 * ((ql0 >> 4) + (qh0 & hm ? 16 : 0)) - m2; | |
| y[33] = d2 * ((ql1 >> 4) + (qh1 & hm ? 16 : 0)) - m2; | |
| } | |
| __kernel void dequantize_block_q6_K(__global const struct block_q6_K *x, __global float *yy) | |
| { | |
| const int i = get_group_id(0) + get_global_offset(0); | |
| const int tid = get_local_id(0); | |
| const int ip = tid / 32; | |
| const int il = tid - 32 * ip; | |
| const int is = 8 * ip + il / 16; | |
| __global float *y = yy + get_group_id(0) * QK_K + 128 * ip + il; | |
| const float d = BLOCK_Q6_K_D(x, i); | |
| const int ql_offset = 64 * ip + il; | |
| const uint8_t qh = BLOCK_Q6_K_QH(x, i, 32 * ip + il); | |
| y[0] = d * BLOCK_Q6_K_SCALES(x, i, is + 0) * ((int8_t)((BLOCK_Q6_K_QL(x, i, ql_offset + 0) & 0xF) | (((qh >> 0) & 3) << 4)) - 32); | |
| y[32] = d * BLOCK_Q6_K_SCALES(x, i, is + 2) * ((int8_t)((BLOCK_Q6_K_QL(x, i, ql_offset + 32) & 0xF) | (((qh >> 2) & 3) << 4)) - 32); | |
| y[64] = d * BLOCK_Q6_K_SCALES(x, i, is + 4) * ((int8_t)((BLOCK_Q6_K_QL(x, i, ql_offset + 0) >> 4) | (((qh >> 4) & 3) << 4)) - 32); | |
| y[96] = d * BLOCK_Q6_K_SCALES(x, i, is + 6) * ((int8_t)((BLOCK_Q6_K_QL(x, i, ql_offset + 32) >> 4) | (((qh >> 6) & 3) << 4)) - 32); | |
| } | |
| __kernel void dequantize_mul_mat_vec_q2_K(__global const struct block_q2_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { | |
| const int row = get_group_id(0); | |
| const int num_blocks_per_row = ncols / QK_K; | |
| const int ib0 = row*num_blocks_per_row + get_global_offset(0); | |
| __global const struct block_q2_K * x = xx + ib0; | |
| const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 | |
| const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 | |
| const int step = 16/K_QUANTS_PER_ITERATION; | |
| const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... | |
| const int in = tid - step*im; // 0...15 or 0...7 | |
| const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 | |
| const int q_offset = 32*im + l0; | |
| const int s_offset = 8*im; | |
| const int y_offset = 128*im + l0; | |
| tmp[16 * ix + tid] = 0; | |
| uint32_t aux[4]; | |
| const uint8_t * d = (const uint8_t *)aux; | |
| const uint8_t * m = (const uint8_t *)(aux + 2); | |
| for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { | |
| __global const float * y = yy + i * QK_K + y_offset; | |
| // __global const uint8_t * q = BLOCK_Q2_K_QS(x, i, q_offset; | |
| const float dall = BLOCK_Q2_K_D(x, i); | |
| const float dmin = BLOCK_Q2_K_DMIN(x, i); | |
| __global const uint32_t * a = (__global const uint32_t *)(x[i].scales + s_offset); | |
| aux[0] = a[0] & 0x0f0f0f0f; | |
| aux[1] = a[1] & 0x0f0f0f0f; | |
| aux[2] = (a[0] >> 4) & 0x0f0f0f0f; | |
| aux[3] = (a[1] >> 4) & 0x0f0f0f0f; | |
| float sum1 = 0, sum2 = 0; | |
| for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { | |
| sum1 += y[l+ 0] * d[0] * ((BLOCK_Q2_K_QS(x, i, q_offset + l+ 0) >> 0) & 3) | |
| + y[l+32] * d[2] * ((BLOCK_Q2_K_QS(x, i, q_offset + l+ 0) >> 2) & 3) | |
| + y[l+64] * d[4] * ((BLOCK_Q2_K_QS(x, i, q_offset + l+ 0) >> 4) & 3) | |
| + y[l+96] * d[6] * ((BLOCK_Q2_K_QS(x, i, q_offset + l+ 0) >> 6) & 3) | |
| + y[l+16] * d[1] * ((BLOCK_Q2_K_QS(x, i, q_offset + l+16) >> 0) & 3) | |
| + y[l+48] * d[3] * ((BLOCK_Q2_K_QS(x, i, q_offset + l+16) >> 2) & 3) | |
| + y[l+80] * d[5] * ((BLOCK_Q2_K_QS(x, i, q_offset + l+16) >> 4) & 3) | |
| +y[l+112] * d[7] * ((BLOCK_Q2_K_QS(x, i, q_offset + l+16) >> 6) & 3); | |
| sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] | |
| + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; | |
| } | |
| tmp[16 * ix + tid] += dall * sum1 - dmin * sum2; | |
| } | |
| // sum up partial sums and write back result | |
| barrier(CLK_LOCAL_MEM_FENCE); | |
| for (int s=16; s>0; s>>=1) { | |
| if (tid < s) { | |
| tmp[tid] += tmp[tid + s]; | |
| } | |
| barrier(CLK_LOCAL_MEM_FENCE); | |
| } | |
| if (tid == 0) { | |
| dst[row] = tmp[0]; | |
| } | |
| } | |
| __kernel void dequantize_mul_mat_vec_q3_K(__global const struct block_q3_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { | |
| const uint16_t kmask1 = 0x0303; | |
| const uint16_t kmask2 = 0x0f0f; | |
| const int row = get_group_id(0); | |
| const int num_blocks_per_row = ncols / QK_K; | |
| const int ib0 = row*num_blocks_per_row + get_global_offset(0); | |
| __global const struct block_q3_K * x = xx + ib0; | |
| const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 | |
| const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; // 0 or 0,1 | |
| const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop | |
| const int step = 16/K_QUANTS_PER_ITERATION; | |
| const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... | |
| const int in = tid - step*im; // 0....15 or 0...7 | |
| const uint8_t m = 1 << (4*im); | |
| const int l0 = n*in; // 0...15 or 0...14 in steps of 2 | |
| const int q_offset = 32*im + l0; | |
| const int y_offset = 128*im + l0; | |
| uint16_t utmp[4]; | |
| const int8_t * s = (const int8_t *)utmp; | |
| const uint16_t s_shift = 4*im; | |
| tmp[16 * ix + tid] = 0; | |
| for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { | |
| __global const float * y = yy + i * QK_K + y_offset; | |
| __global const uint16_t * a = (__global const uint16_t *)x[i].scales; | |
| utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); | |
| utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); | |
| utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); | |
| utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); | |
| const float d = BLOCK_Q2_K_D(x, i); | |
| float sum = 0; | |
| for (int l = 0; l < n; ++l) { | |
| sum += y[l+ 0] * (s[0] - 32) * (((BLOCK_Q3_K_QS(x, i, q_offset + l) >> 0) & 3) - (BLOCK_Q3_K_HMASK(x, i, l0 + l) & (m << 0) ? 0 : 4)) | |
| + y[l+32] * (s[2] - 32) * (((BLOCK_Q3_K_QS(x, i, q_offset + l) >> 2) & 3) - (BLOCK_Q3_K_HMASK(x, i, l0 + l) & (m << 1) ? 0 : 4)) | |
| + y[l+64] * (s[4] - 32) * (((BLOCK_Q3_K_QS(x, i, q_offset + l) >> 4) & 3) - (BLOCK_Q3_K_HMASK(x, i, l0 + l) & (m << 2) ? 0 : 4)) | |
| + y[l+96] * (s[6] - 32) * (((BLOCK_Q3_K_QS(x, i, q_offset + l) >> 6) & 3) - (BLOCK_Q3_K_HMASK(x, i, l0 + l) & (m << 3) ? 0 : 4)); | |
| sum += y[l+16] * (s[1] - 32) * (((BLOCK_Q3_K_QS(x, i, q_offset + l+16) >> 0) & 3) - (BLOCK_Q3_K_HMASK(x, i, l0 + l+16) & (m << 0) ? 0 : 4)) | |
| + y[l+48] * (s[3] - 32) * (((BLOCK_Q3_K_QS(x, i, q_offset + l+16) >> 2) & 3) - (BLOCK_Q3_K_HMASK(x, i, l0 + l+16) & (m << 1) ? 0 : 4)) | |
| + y[l+80] * (s[5] - 32) * (((BLOCK_Q3_K_QS(x, i, q_offset + l+16) >> 4) & 3) - (BLOCK_Q3_K_HMASK(x, i, l0 + l+16) & (m << 2) ? 0 : 4)) | |
| + y[l+112] * (s[7] - 32) * (((BLOCK_Q3_K_QS(x, i, q_offset + l+16) >> 6) & 3) - (BLOCK_Q3_K_HMASK(x, i, l0 + l+16) & (m << 3) ? 0 : 4)); | |
| } | |
| tmp[16 * ix + tid] += d * sum; | |
| } | |
| // sum up partial sums and write back result | |
| barrier(CLK_LOCAL_MEM_FENCE); | |
| for (int s=16; s>0; s>>=1) { | |
| if (tid < s) { | |
| tmp[tid] += tmp[tid + s]; | |
| } | |
| barrier(CLK_LOCAL_MEM_FENCE); | |
| } | |
| if (tid == 0) { | |
| dst[row] = tmp[0]; | |
| } | |
| } | |
| __kernel void dequantize_mul_mat_vec_q4_K(__global const struct block_q4_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { | |
| //to rename it later, just to test now | |
| const uint16_t kmask1 = 0x3f3f; | |
| const uint16_t kmask2 = 0x0f0f; | |
| const uint16_t kmask3 = 0xc0c0; | |
| const int row = get_group_id(0); | |
| const int num_blocks_per_row = ncols / QK_K; | |
| const int ib0 = row*num_blocks_per_row + get_global_offset(0); | |
| const int tid = get_local_id(0)/K_QUANTS_PER_ITERATION; // 0...15 | |
| const int ix = get_local_id(0)%K_QUANTS_PER_ITERATION; | |
| const int step = 8/K_QUANTS_PER_ITERATION; | |
| const int il = tid/step; // 0...3 | |
| const int ir = tid - step*il;// 0...3 | |
| const int n = 2*K_QUANTS_PER_ITERATION; | |
| const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 | |
| const int in = il%2; | |
| const int l0 = n*(2*ir + in); | |
| const int q_offset = 32*im + l0; | |
| const int y_offset = 64*im + l0; | |
| uint16_t aux[4]; | |
| const uint8_t * sc = (const uint8_t *)aux; | |
| __global const struct block_q4_K * x = xx + ib0; | |
| tmp[16 * ix + tid] = 0; | |
| for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { | |
| __global const float * y1 = yy + i*QK_K + y_offset; | |
| __global const float * y2 = y1 + 128; | |
| const float dall = BLOCK_Q4_K_D(x, i); | |
| const float dmin = BLOCK_Q4_K_DMIN(x, i); | |
| __global const uint16_t * a = (__global const uint16_t *)x[i].scales; | |
| aux[0] = a[im+0] & kmask1; | |
| aux[1] = a[im+2] & kmask1; | |
| aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); | |
| aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); | |
| float4 s = (float4)(0.f); | |
| float smin = 0; | |
| for (int l = 0; l < n; ++l) { | |
| uint8_t q1 = BLOCK_Q4_K_QS(x, i, q_offset + l); | |
| uint8_t q2 = BLOCK_Q4_K_QS(x, i, q_offset + 64 + l); | |
| s.x += y1[l] * (q1 & 0xF); s.y += y1[l+32] * (q1 >> 4); | |
| s.z += y2[l] * (q2 & 0xF); s.w += y2[l+32] * (q2 >> 4); | |
| smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; | |
| } | |
| tmp[16 * ix + tid] += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; | |
| } | |
| // sum up partial sums and write back result | |
| barrier(CLK_LOCAL_MEM_FENCE); | |
| for (int s=16; s>0; s>>=1) { | |
| if (tid < s) { | |
| tmp[tid] += tmp[tid + s]; | |
| } | |
| barrier(CLK_LOCAL_MEM_FENCE); | |
| } | |
| if (tid == 0) { | |
| dst[row] = tmp[0]; | |
| } | |
| } | |
| // Matrix-vector product for Q5_K-quantized rows: each work-group produces one element of dst. | |
| // xx : quantized blocks of the matrix (the launch's global offset is added to the block index) | |
| // tmp : __local scratch; entries 0..31 hold one partial sum per work-item | |
| // yy : dense input vector | |
| // dst : output, dst[row] with row = get_group_id(0) | |
| // ncols : number of columns; a row spans ncols / QK_K blocks | |
| // The indexing assumes a work-group of 32 work-items: tid = local_id/2 selects the slice of a block, | |
| // ix = local_id%2 selects even or odd blocks of the row. | |
| __kernel void dequantize_mul_mat_vec_q5_K(__global const struct block_q5_K * xx, __local float* tmp, __global float* yy, __global float* dst, const int ncols) { | |
| const uint16_t kmask1 = 0x3f3f; | |
| const uint16_t kmask2 = 0x0f0f; | |
| const uint16_t kmask3 = 0xc0c0; | |
| const int row = get_group_id(0); | |
| const int num_blocks_per_row = ncols / QK_K; | |
| const int ib0 = row*num_blocks_per_row + get_global_offset(0); | |
| const int lid = get_local_id(0); // raw local id, used for the final reduction | |
| const int tid = lid/2; // 0...15 | |
| const int ix = lid%2; | |
| const int il = tid/4; // 0...3 | |
| const int ir = tid - 4*il;// 0...3 | |
| const int n = 2; | |
| const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 | |
| const int in = il%2; | |
| const int l0 = n*(2*ir + in); | |
| const int q_offset = 32*im + l0; | |
| const int y_offset = 64*im + l0; | |
| const uint8_t hm1 = 1 << (2*im); | |
| const uint8_t hm2 = hm1 << 4; | |
| uint16_t aux[4]; | |
| const uint8_t * sc = (const uint8_t *)aux; | |
| __global const struct block_q5_K * x = xx + ib0; | |
| tmp[16 * ix + tid] = 0; | |
| for (int i = ix; i < num_blocks_per_row; i += 2) { | |
| __global const float * y1 = yy + i*QK_K + y_offset; | |
| __global const float * y2 = y1 + 128; | |
| const float dall = BLOCK_Q5_K_D(x, i); | |
| const float dmin = BLOCK_Q5_K_DMIN(x, i); | |
| // unpack the packed scale bytes into aux; sc views aux as 8 bytes | |
| __global const uint16_t * a = (__global const uint16_t *)x[i].scales; | |
| aux[0] = a[im+0] & kmask1; | |
| aux[1] = a[im+2] & kmask1; | |
| aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); | |
| aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); | |
| float4 sum = (float4)(0.f); | |
| float smin = 0; | |
| for (int l = 0; l < n; ++l) { | |
| sum.x += y1[l+ 0] * ((BLOCK_Q5_K_QS(x, i, q_offset + l+ 0) & 0xF) + (BLOCK_Q5_K_QH(x, i, l0 + l+ 0) & (hm1 << 0) ? 16 : 0)) | |
| + y1[l+16] * ((BLOCK_Q5_K_QS(x, i, q_offset + l+16) & 0xF) + (BLOCK_Q5_K_QH(x, i, l0 + l+16) & (hm1 << 0) ? 16 : 0)); | |
| sum.y += y1[l+32] * ((BLOCK_Q5_K_QS(x, i, q_offset + l+ 0) >> 4) + (BLOCK_Q5_K_QH(x, i, l0 + l+ 0) & (hm1 << 1) ? 16 : 0)) | |
| + y1[l+48] * ((BLOCK_Q5_K_QS(x, i, q_offset + l+16) >> 4) + (BLOCK_Q5_K_QH(x, i, l0 + l+16) & (hm1 << 1) ? 16 : 0)); | |
| sum.z += y2[l+ 0] * ((BLOCK_Q5_K_QS(x, i, q_offset + 64 + l+ 0) & 0xF) + (BLOCK_Q5_K_QH(x, i, l0 + l+ 0) & (hm2 << 0) ? 16 : 0)) | |
| + y2[l+16] * ((BLOCK_Q5_K_QS(x, i, q_offset + 64 + l+16) & 0xF) + (BLOCK_Q5_K_QH(x, i, l0 + l+16) & (hm2 << 0) ? 16 : 0)); | |
| sum.w += y2[l+32] * ((BLOCK_Q5_K_QS(x, i, q_offset + 64 + l+ 0) >> 4) + (BLOCK_Q5_K_QH(x, i, l0 + l+ 0) & (hm2 << 1) ? 16 : 0)) | |
| + y2[l+48] * ((BLOCK_Q5_K_QS(x, i, q_offset + 64 + l+16) >> 4) + (BLOCK_Q5_K_QH(x, i, l0 + l+16) & (hm2 << 1) ? 16 : 0)); | |
| smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] | |
| + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; | |
| } | |
| tmp[16 * ix + tid] += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; | |
| } | |
| // sum up partial sums and write back result | |
| // The reduction is indexed by the raw local id: tid is shared by two work-items (ix = 0 and 1), | |
| // so indexing by tid would let both of them update the same tmp slot between two barriers. | |
| barrier(CLK_LOCAL_MEM_FENCE); | |
| for (int s=16; s>0; s>>=1) { | |
| if (lid < s) { | |
| tmp[lid] += tmp[lid + s]; | |
| } | |
| barrier(CLK_LOCAL_MEM_FENCE); | |
| } | |
| if (lid == 0) { | |
| dst[row] = tmp[0]; | |
| } | |
| } | |
| // Matrix-vector product for Q6_K-quantized rows: each work-group produces one element of dst. | |
| // x : quantized blocks, addressed through the BLOCK_Q6_K_* accessor macros | |
| // tmp : __local scratch; entries 0..31 hold one partial sum per work-item | |
| // yy : dense input vector | |
| // dst : output, dst[row] with row = get_group_id(0) | |
| // ncols : number of columns; a row spans ncols / QK_K blocks | |
| // K_QUANTS_PER_ITERATION (1 or 2) decides how the work-items are split between block slices (tid) | |
| // and interleaved blocks (ix). The indexing assumes a work-group of 32 work-items. | |
| __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * x, __local float* tmp, __global const float * yy, __global float * dst, const int ncols) { | |
| const int row = get_group_id(0); | |
| const int num_blocks_per_row = ncols / QK_K; | |
| const int ib0 = row*num_blocks_per_row + get_global_offset(0); | |
| const int lid = get_local_id(0); // raw local id, used for the final reduction | |
| const int tid = lid/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 | |
| const int ix = lid%K_QUANTS_PER_ITERATION; // 0 or 0, 1 | |
| const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 | |
| const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... | |
| const int in = tid - step*im; // 0...15 or 0...7 | |
| #if K_QUANTS_PER_ITERATION == 1 | |
| const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 | |
| const int is = 0; | |
| #else | |
| const int l0 = 4 * in; // 0, 4, 8, ..., 28 | |
| const int is = in / 4; | |
| #endif | |
| const int ql_offset = 64*im + l0; | |
| const int qh_offset = 32*im + l0; | |
| const int s_offset = 8*im + is; | |
| const int y_offset = 128*im + l0; | |
| tmp[16 * ix + tid] = 0; // partial sum for thread in warp | |
| for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { | |
| __global const float * y = yy + i * QK_K + y_offset; | |
| int ii = ib0 + i; | |
| const float d = BLOCK_Q6_K_D(x, ii); | |
| #if K_QUANTS_PER_ITERATION == 1 | |
| float sum = y[ 0] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 0) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + 0) & 0xF) | ((BLOCK_Q6_K_QH(x, ii, qh_offset + 0) & 0x03) << 4)) - 32) | |
| + y[16] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 1) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + 16) & 0xF) | ((BLOCK_Q6_K_QH(x, ii, qh_offset + 16) & 0x03) << 4)) - 32) | |
| + y[32] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 2) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + 32) & 0xF) | ((BLOCK_Q6_K_QH(x, ii, qh_offset + 0) & 0x0c) << 2)) - 32) | |
| + y[48] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 3) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + 48) & 0xF) | ((BLOCK_Q6_K_QH(x, ii, qh_offset + 16) & 0x0c) << 2)) - 32) | |
| + y[64] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 4) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + 0) >> 4) | ((BLOCK_Q6_K_QH(x, ii, qh_offset + 0) & 0x30) >> 0)) - 32) | |
| + y[80] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 5) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + 16) >> 4) | ((BLOCK_Q6_K_QH(x, ii, qh_offset + 16) & 0x30) >> 0)) - 32) | |
| + y[96] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 6) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + 32) >> 4) | ((BLOCK_Q6_K_QH(x, ii, qh_offset + 0) & 0xc0) >> 2)) - 32) | |
| +y[112] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 7) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + 48) >> 4) | ((BLOCK_Q6_K_QH(x, ii, qh_offset + 16) & 0xc0) >> 2)) - 32); | |
| tmp[16 * ix + tid] += sum; | |
| #else | |
| float sum = 0; | |
| for (int l = 0; l < 4; ++l) { | |
| sum += y[l+ 0] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 0) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + l + 0) & 0xF) | (((BLOCK_Q6_K_QH(x, ii, qh_offset + l) >> 0) & 3) << 4)) - 32) | |
| + y[l+32] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 2) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + l + 32) & 0xF) | (((BLOCK_Q6_K_QH(x, ii, qh_offset + l) >> 2) & 3) << 4)) - 32) | |
| + y[l+64] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 4) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + l + 0) >> 4) | (((BLOCK_Q6_K_QH(x, ii, qh_offset + l) >> 4) & 3) << 4)) - 32) | |
| + y[l+96] * BLOCK_Q6_K_SCALES(x, ii, s_offset + 6) * d * ((int8_t)((BLOCK_Q6_K_QL(x, ii, ql_offset + l + 32) >> 4) | (((BLOCK_Q6_K_QH(x, ii, qh_offset + l) >> 6) & 3) << 4)) - 32); | |
| } | |
| tmp[16 * ix + tid] += sum; | |
| #endif | |
| } | |
| // sum up partial sums and write back result | |
| // Indexed by the raw local id: with K_QUANTS_PER_ITERATION == 2 each tid is shared by two | |
| // work-items, so indexing by tid would let both update the same tmp slot between two barriers. | |
| barrier(CLK_LOCAL_MEM_FENCE); | |
| for (int s=16; s>0; s>>=1) { | |
| if (lid < s) { | |
| tmp[lid] += tmp[lid + s]; | |
| } | |
| barrier(CLK_LOCAL_MEM_FENCE); | |
| } | |
| if (lid == 0) { | |
| dst[row] = tmp[0]; | |
| } | |
| } | |
// DEQUANT(KERNEL_NAME, X_TYPE, QUANT_K, QUANT_R, DEQUANT_FUNC) defines a kernel KERNEL_NAME(x, y) that
// expands quantized input x into floats y, two output values per work-item.
//   i    = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; work-items with i >= get_global_size(0) return
//   ib   = i/QUANT_K + get_global_offset(0)   block index handed to DEQUANT_FUNC
//   iqs  = (i%QUANT_K)/QUANT_R                quant index inside the block
//   iybs = i - i%QUANT_K                      index in y where the block's outputs start
// DEQUANT_FUNC(x, ib, iqs, &v0, &v1) yields two values: v0 is stored at y[iybs + iqs] and v1 at
// y[iybs + iqs + y_offset], where y_offset is 1 when QUANT_R == 1 and QUANT_K/2 otherwise.
| #define DEQUANT(KERNEL_NAME, X_TYPE, QUANT_K, QUANT_R, DEQUANT_FUNC) \ | |
| __kernel void KERNEL_NAME(__global X_TYPE* x, __global float* y) { \ | |
| const size_t i = get_group_id(0)*get_local_size(0) + get_local_id(0)*2; \ | |
| \ | |
| if (i >= get_global_size(0)) { \ | |
| return; \ | |
| } \ | |
| \ | |
| const uint qk = QUANT_K; \ | |
| const uint qr = QUANT_R; \ | |
| \ | |
| const size_t ib = i/qk + get_global_offset(0); /* block index */ \ | |
| const size_t iqs = (i%qk)/qr; /* quant index */ \ | |
| const int iybs = i - i%qk; /* y block start index */ \ | |
| const int y_offset = qr == 1 ? 1 : qk/2; \ | |
| \ | |
| /* dequantize */ \ | |
| float v0, v1; \ | |
| DEQUANT_FUNC(x, ib, iqs, &v0, &v1); \ | |
| y[iybs + iqs + 0] = v0; \ | |
| y[iybs + iqs + y_offset] = v1; \ | |
| } | |
// One row-dequantization kernel per quantization format. When CLSPV_WORKAROUNDS is defined the f16
// converter is instantiated over uint32_t instead of half.
| DEQUANT(dequantize_row_q4_0, struct block_q4_0, QK4_0, QR4_0, dequantize_q4_0) | |
| DEQUANT(dequantize_row_q4_1, struct block_q4_1, QK4_1, QR4_1, dequantize_q4_1) | |
| DEQUANT(dequantize_row_q5_0, struct block_q5_0, QK5_0, QR5_0, dequantize_q5_0) | |
| DEQUANT(dequantize_row_q5_1, struct block_q5_1, QK5_1, QR5_1, dequantize_q5_1) | |
| DEQUANT(dequantize_row_q8_0, struct block_q8_0, QK8_0, QR8_0, dequantize_q8_0) | |
| #ifndef CLSPV_WORKAROUNDS | |
| DEQUANT(convert_row_f16, half, 1, 1, convert_f16) | |
| #else | |
| DEQUANT(convert_row_f16, uint32_t, 1, 1, convert_f16) | |
| #endif | |
// DEQUANT_MUL_MAT_VEC(KERNEL_NAME, X_TYPE, QUANT_K, QUANT_R, DEQUANT_FUNC) defines a kernel
// KERNEL_NAME(x, tmp, y, dst, ncols) that dequantizes one matrix row on the fly and dots it with y.
//   - row = get_group_id(0): each work-group writes one output, dst[row].
//   - x is advanced by get_global_offset(0) before use.
//   - Each work-item starts at column tid*2 and advances by 2*get_local_size(0), accumulating
//     v0*y[iybs + iqs] + v1*y[iybs + iqs + y_offset] into tmp[tid].
//   - tmp is __local scratch indexed by local id; it is tree-reduced starting at local_size/2, halving
//     each step, and the work-item with tid == 0 stores tmp[0].
//     NOTE(review): the halving reduction presumably expects a power-of-two local size - confirm.
| #define DEQUANT_MUL_MAT_VEC(KERNEL_NAME, X_TYPE, QUANT_K, QUANT_R, DEQUANT_FUNC) \ | |
| __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float* y, __global float* dst, const int ncols) { \ | |
| const int local_size = get_local_size(0); \ | |
| const int row = get_group_id(0); \ | |
| const int tid = get_local_id(0); \ | |
| \ | |
| const uint qk = QUANT_K; \ | |
| const uint qr = QUANT_R; \ | |
| \ | |
| const int col_step = local_size * 2; \ | |
| const int y_offset = qr == 1 ? 1 : qk/2; \ | |
| \ | |
| x += get_global_offset(0); \ | |
| \ | |
| tmp[tid] = 0; \ | |
| \ | |
| for (int col = tid*2; col < ncols; col += col_step) { \ | |
| const int ib = (row*ncols + col)/qk; /* block index */ \ | |
| const int iqs = (col%qk)/qr; /* quant index */ \ | |
| const int iybs = col - col%qk; /* y block start index */ \ | |
| \ | |
| /* dequantize */ \ | |
| float v0, v1; \ | |
| DEQUANT_FUNC(x, ib, iqs, &v0, &v1); \ | |
| \ | |
| /* matrix multiplication */ \ | |
| tmp[tid] += v0 * y[iybs + iqs + 0]; \ | |
| tmp[tid] += v1 * y[iybs + iqs + y_offset]; \ | |
| } \ | |
| \ | |
| /* sum up partial sums and write back result */ \ | |
| barrier(CLK_LOCAL_MEM_FENCE); \ | |
| for (int s=local_size/2; s>0; s>>=1) { \ | |
| if (tid < s) { \ | |
| tmp[tid] += tmp[tid + s]; \ | |
| } \ | |
| barrier(CLK_LOCAL_MEM_FENCE); \ | |
| } \ | |
| if (tid == 0) { \ | |
| dst[row] = tmp[0]; \ | |
| } \ | |
| } | |
// One fused dequantize + mat-vec kernel per quantization format.
// NOTE(review): the f16 instantiation below uses `half` unconditionally, whereas convert_row_f16 switches
// to uint32_t under CLSPV_WORKAROUNDS; it also ends with a stray ';' at file scope - confirm both are intended.
| DEQUANT_MUL_MAT_VEC(dequantize_mul_mat_vec_q4_0, struct block_q4_0, QK4_0, QR4_0, dequantize_q4_0) | |
| DEQUANT_MUL_MAT_VEC(dequantize_mul_mat_vec_q4_1, struct block_q4_1, QK4_1, QR4_1, dequantize_q4_1) | |
| DEQUANT_MUL_MAT_VEC(dequantize_mul_mat_vec_q5_0, struct block_q5_0, QK5_0, QR5_0, dequantize_q5_0) | |
| DEQUANT_MUL_MAT_VEC(dequantize_mul_mat_vec_q5_1, struct block_q5_1, QK5_1, QR5_1, dequantize_q5_1) | |
| DEQUANT_MUL_MAT_VEC(dequantize_mul_mat_vec_q8_0, struct block_q8_0, QK8_0, QR8_0, dequantize_q8_0) | |
| DEQUANT_MUL_MAT_VEC(convert_mul_mat_vec_f16, half, 1, 1, convert_f16); | |
// MUL(KERNEL_NAME, TYPE) defines an element-wise multiply kernel with a repeating second operand:
//   dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky]
// for the work-item index i = get_group_id(0)*get_local_size(0) + get_local_id(0); work-items with
// i >= get_global_size(0) do nothing. ky is the period after which y repeats.
| #define MUL(KERNEL_NAME, TYPE) \ | |
| __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y, const int y_offset, __global TYPE* dst, const int dst_offset, const int ky) { \ | |
| const int i = get_group_id(0)*get_local_size(0) + get_local_id(0); \ | |
| \ | |
| if (i >= get_global_size(0)) { \ | |
| return; \ | |
| } \ | |
| \ | |
| dst[dst_offset + i] = x[x_offset + i] * y[y_offset + i%ky]; \ | |
| } | |
// float instantiation, exposed as the "mul_f32" kernel
| MUL(mul_f32, float) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include "ggml-opencl.h" | |
| #include <array> | |
| #include <atomic> | |
| #include <sstream> | |
| #include <vector> | |
| #include <limits> | |
| #include <iostream> | |
| #include <fstream> | |
| #define CL_TARGET_OPENCL_VERSION 110 | |
| #include <clblast.h> | |
| #include <stdlib.h> | |
| #include <stdio.h> | |
| #include <string.h> | |
| #include "ggml.h" | |
| #if defined(_MSC_VER) | |
| #pragma warning(disable: 4244 4267) // possible loss of data | |
| #endif | |
| #define CL_DMMV_LOCAL_SIZE 32 | |
| #ifndef K_QUANTS_PER_ITERATION | |
| #define K_QUANTS_PER_ITERATION 1 | |
| #else | |
| static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); | |
| #endif | |
// CL_CHECK(err): evaluates `err` once as a cl_int; if it is not CL_SUCCESS, prints the stringified
// expression, the error code and the file/line to stderr and terminates the process with exit(1).
// Because the expression text is printed, changing the argument's spelling changes the message.
| #define CL_CHECK(err) \ | |
| do { \ | |
| cl_int err_ = (err); \ | |
| if (err_ != CL_SUCCESS) { \ | |
| fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ | |
| #err, err_, __FILE__, __LINE__); \ | |
| exit(1); \ | |
| } \ | |
| } while (0) | |
// CLBLAST_CHECK(err): same contract as CL_CHECK but for CLBlast calls - evaluates `err` once as a
// CLBlastStatusCode and, unless it equals CLBlastSuccess, reports it on stderr and calls exit(1).
| #define CLBLAST_CHECK(err) \ | |
| do { \ | |
| CLBlastStatusCode err_ = (err); \ | |
| if (err_ != CLBlastSuccess) { \ | |
| fprintf(stderr, "ggml_opencl: %s error %d at %s:%d\n", \ | |
| #err, err_, __FILE__, __LINE__); \ | |
| exit(1); \ | |
| } \ | |
| } while (0) | |
| // Loads the OpenCL kernel source text from the file "ggml-opencl.cl". | |
| // Returns the file contents with every line terminated by '\n'. | |
| // If the file cannot be opened the process exits with status 1: the previous behaviour of returning | |
| // a placeholder sentence made the OpenCL compiler fail later with an unrelated-looking syntax error. | |
| static std::string generate_kernels() { | |
| const std::string file = "ggml-opencl.cl"; | |
| std::ifstream kernels_file(file); | |
| if (!kernels_file.is_open()) { | |
| fprintf(stderr, "ggml_opencl: file %s not found!\n", file.c_str()); | |
| exit(1); | |
| } | |
| std::ostringstream src; | |
| std::string line; | |
| while (std::getline(kernels_file, line)) { | |
| src << line << '\n'; // '\n' rather than std::endl: no need to flush a string stream per line | |
| } | |
| return src.str(); | |
| } | |
// File-scope OpenCL state. platform, device, context, queue, program and the kernel handles
// are all assigned in ggml_cl_init().
| static cl_platform_id platform; | |
| static cl_device_id device; | |
| static cl_context context; | |
| static cl_command_queue queue; | |
| static cl_program program; | |
// Kernel handles, named after the kernel they are created from with a "_cl" suffix.
| static cl_kernel convert_row_f16_cl; | |
| static cl_kernel dequantize_row_q4_0_cl, dequantize_row_q4_1_cl, dequantize_row_q5_0_cl, dequantize_row_q5_1_cl, dequantize_row_q8_0_cl; | |
| static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl, dequantize_mul_mat_vec_q5_0_cl, dequantize_mul_mat_vec_q5_1_cl, dequantize_mul_mat_vec_q8_0_cl, convert_mul_mat_vec_f16_cl; | |
| static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl; | |
| static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl; | |
| static cl_kernel mul_f32_cl; | |
// Device capabilities, detected in ggml_cl_init() by searching the device's extension string for
// "cl_khr_fp16" and "cl_khr_byte_addressable_store".
| static bool fp16_support; | |
| static bool byte_addressable_store_support; | |
| // Creates an OpenCL program from source text and builds it. | |
| // ctx : context the program is created in | |
| // dev : device whose build log is printed if the build fails | |
| // program_buffer : NUL-terminated OpenCL C source | |
| // Returns the built program. Any failure is reported on stderr and ends the process with exit(1). | |
| // The build options enable relaxed math and define the quantization constants (QK*/QR*, QK_K) and | |
| // K_QUANTS_PER_ITERATION that the kernels expect. | |
| static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) { | |
| cl_int err; | |
| const size_t program_size = strlen(program_buffer); | |
| cl_program p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err); | |
| if(err < 0) { | |
| fprintf(stderr, "OpenCL error creating program (%d)\n", err); | |
| exit(1); | |
| } | |
| std::string compile_opts = "-cl-mad-enable -cl-unsafe-math-optimizations -cl-finite-math-only -cl-fast-relaxed-math " | |
| "-DQK4_0=32 -DQR4_0=2 -DQK4_1=32 -DQR4_1=2 -DQK5_0=32 -DQR5_0=2 -DQK5_1=32 -DQR5_1=2 -DQK8_0=32 -DQR8_0=1 " | |
| "-DQK_K=256 -DK_QUANTS_PER_ITERATION=" + std::to_string(K_QUANTS_PER_ITERATION); | |
| err = clBuildProgram(p, 0, NULL, compile_opts.c_str(), NULL, NULL); | |
| if(err < 0) { | |
| // log_size starts at 0 so a failed size query still leaves a valid (empty) log to print | |
| size_t log_size = 0; | |
| clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); | |
| // one extra zeroed byte guarantees NUL termination | |
| std::vector<char> program_log(log_size + 1, '\0'); | |
| clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size, program_log.data(), NULL); | |
| fprintf(stderr, "ggml_opencl: kernel compile error:\n\n%s\n", program_log.data()); | |
| exit(1); | |
| } | |
| return p; | |
| } | |
| // One-time OpenCL initialisation. Enumerates platforms and devices, selects one device, records its | |
| // fp16 / byte-addressable-store support, creates the context and command queue, builds the kernel | |
| // program and creates every kernel handle used by this file. Any failure ends the process. | |
| // Device selection: | |
| // - GGML_OPENCL_PLATFORM: a platform number, or a substring of a platform name or vendor. | |
| // - GGML_OPENCL_DEVICE: a device number, or a substring of a device name. | |
| // - Without overrides, the first GPU found is used; if there is no GPU, the first device. | |
| void ggml_cl_init(void) { | |
| cl_int err; | |
| struct cl_device; | |
| struct cl_platform { | |
| cl_platform_id id; | |
| unsigned number; | |
| char name[128]; | |
| char vendor[128]; | |
| struct cl_device * devices; | |
| unsigned n_devices; | |
| struct cl_device * default_device; | |
| }; | |
| struct cl_device { | |
| struct cl_platform * platform; | |
| cl_device_id id; | |
| unsigned number; | |
| cl_device_type type; | |
| char name[128]; | |
| }; | |
| enum { NPLAT = 16, NDEV = 16 }; | |
| struct cl_platform platforms[NPLAT]; | |
| unsigned n_platforms = 0; | |
| struct cl_device devices[NDEV]; | |
| unsigned n_devices = 0; | |
| struct cl_device * default_device = NULL; | |
| platform = NULL; | |
| device = NULL; | |
| cl_platform_id platform_ids[NPLAT]; | |
| CL_CHECK(clGetPlatformIDs(NPLAT, platform_ids, &n_platforms)); | |
| // clGetPlatformIDs reports how many platforms exist, which may exceed the NPLAT entries it filled in | |
| if (n_platforms > NPLAT) { | |
| n_platforms = NPLAT; | |
| } | |
| for (unsigned i = 0; i < n_platforms; i++) { | |
| struct cl_platform * p = &platforms[i]; | |
| p->number = i; | |
| p->id = platform_ids[i]; | |
| CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_NAME, sizeof(p->name), &p->name, NULL)); | |
| CL_CHECK(clGetPlatformInfo(p->id, CL_PLATFORM_VENDOR, sizeof(p->vendor), &p->vendor, NULL)); | |
| cl_device_id device_ids[NDEV]; | |
| cl_int clGetDeviceIDsError = clGetDeviceIDs(p->id, CL_DEVICE_TYPE_ALL, NDEV, device_ids, &p->n_devices); | |
| if (clGetDeviceIDsError == CL_DEVICE_NOT_FOUND) { | |
| p->n_devices = 0; | |
| } else { | |
| CL_CHECK(clGetDeviceIDsError); | |
| } | |
| // The reported count may exceed what was returned, and all platforms share devices[]: | |
| // never record more devices than there is room left for. | |
| if (p->n_devices > NDEV - n_devices) { | |
| p->n_devices = NDEV - n_devices; | |
| } | |
| p->devices = p->n_devices > 0 ? &devices[n_devices] : NULL; | |
| p->default_device = NULL; | |
| for (unsigned j = 0; j < p->n_devices; j++) { | |
| struct cl_device * d = &devices[n_devices]; | |
| d->number = n_devices++; | |
| d->id = device_ids[j]; | |
| d->platform = p; | |
| CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_NAME, sizeof(d->name), &d->name, NULL)); | |
| CL_CHECK(clGetDeviceInfo(d->id, CL_DEVICE_TYPE, sizeof(d->type), &d->type, NULL)); | |
| // the first GPU of a platform becomes that platform's default device | |
| if (p->default_device == NULL && d->type == CL_DEVICE_TYPE_GPU) { | |
| p->default_device = d; | |
| } | |
| } | |
| if (default_device == NULL && p->default_device != NULL) { | |
| default_device = p->default_device; | |
| } | |
| } | |
| if (n_devices == 0) { | |
| fprintf(stderr, "ggml_opencl: could not find any OpenCL devices.\n"); | |
| exit(1); | |
| } | |
| char * user_platform_string = getenv("GGML_OPENCL_PLATFORM"); | |
| char * user_device_string = getenv("GGML_OPENCL_DEVICE"); | |
| int user_platform_number = -1; | |
| int user_device_number = -1; | |
| unsigned n; | |
| if (user_platform_string != NULL && sscanf(user_platform_string, " %u", &n) == 1 && n < n_platforms) { | |
| user_platform_number = (int)n; | |
| } | |
| if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < n_devices) { | |
| user_device_number = (int)n; | |
| } | |
| if (user_platform_number != -1 && user_device_number != -1) { | |
| // both given as numbers: the device number is an index within the chosen platform | |
| struct cl_platform * p = &platforms[user_platform_number]; | |
| if ((unsigned)user_device_number >= p->n_devices) { | |
| fprintf(stderr, "ggml_opencl: invalid device number %d\n", user_device_number); | |
| exit(1); | |
| } | |
| default_device = &p->devices[user_device_number]; | |
| } else { | |
| struct cl_device * selected_devices = devices; | |
| unsigned n_selected_devices = n_devices; | |
| if (user_platform_number == -1 && user_platform_string != NULL && user_platform_string[0] != 0) { | |
| for (unsigned i = 0; i < n_platforms; i++) { | |
| struct cl_platform * p = &platforms[i]; | |
| if (strstr(p->name, user_platform_string) != NULL || | |
| strstr(p->vendor, user_platform_string) != NULL) { | |
| user_platform_number = (int)i; | |
| break; | |
| } | |
| } | |
| if (user_platform_number == -1) { | |
| fprintf(stderr, "ggml_opencl: no platform matching '%s' was found.\n", user_platform_string); | |
| exit(1); | |
| } | |
| } | |
| if (user_platform_number != -1) { | |
| struct cl_platform * p = &platforms[user_platform_number]; | |
| selected_devices = p->devices; | |
| n_selected_devices = p->n_devices; | |
| default_device = p->default_device; | |
| if (n_selected_devices == 0) { | |
| fprintf(stderr, "ggml_opencl: selected platform '%s' does not have any devices.\n", p->name); | |
| exit(1); | |
| } | |
| } | |
| if (user_device_number == -1 && user_device_string != NULL && user_device_string[0] != 0) { | |
| for (unsigned i = 0; i < n_selected_devices; i++) { | |
| struct cl_device * d = &selected_devices[i]; | |
| if (strstr(d->name, user_device_string) != NULL) { | |
| user_device_number = d->number; | |
| break; | |
| } | |
| } | |
| if (user_device_number == -1) { | |
| fprintf(stderr, "ggml_opencl: no device matching '%s' was found.\n", user_device_string); | |
| exit(1); | |
| } | |
| } | |
| if (user_device_number != -1) { | |
| // here the device number is an index into the global devices[] array | |
| selected_devices = &devices[user_device_number]; | |
| n_selected_devices = 1; | |
| default_device = &selected_devices[0]; | |
| } | |
| GGML_ASSERT(n_selected_devices > 0); | |
| if (default_device == NULL) { | |
| default_device = &selected_devices[0]; | |
| } | |
| } | |
| fprintf(stderr, "ggml_opencl: selecting platform: '%s'\n", default_device->platform->name); | |
| fprintf(stderr, "ggml_opencl: selecting device: '%s'\n", default_device->name); | |
| if (default_device->type != CL_DEVICE_TYPE_GPU) { | |
| fprintf(stderr, "ggml_opencl: warning, not a GPU: '%s'.\n", default_device->name); | |
| } | |
| platform = default_device->platform->id; | |
| device = default_device->id; | |
| // Query the extension string: first its size, then its contents. | |
| size_t ext_str_size = 0; | |
| CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &ext_str_size)); | |
| std::vector<char> ext_buffer(ext_str_size + 1, '\0'); // extra zeroed byte keeps it NUL-terminated | |
| CL_CHECK(clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer.data(), NULL)); | |
| // Check if ext_buffer contains cl_khr_fp16 | |
| fp16_support = strstr(ext_buffer.data(), "cl_khr_fp16") != NULL; | |
| byte_addressable_store_support = strstr(ext_buffer.data(), "cl_khr_byte_addressable_store") != NULL; | |
| fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false"); | |
| fprintf(stderr, "ggml_opencl: device Byte-addressable store support: %s\n", byte_addressable_store_support ? "true" : "false"); | |
| cl_context_properties properties[] = { | |
| (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0 | |
| }; | |
| CL_CHECK((context = clCreateContext(properties, 1, &device, NULL, NULL, &err), err)); | |
| // Try an out-of-order queue first; fall back to an in-order queue if the device rejects that property. | |
| CL_CHECK((queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err), | |
| (err != CL_INVALID_QUEUE_PROPERTIES && err != CL_INVALID_VALUE ? err : | |
| (queue = clCreateCommandQueue(context, device, 0, &err), err) | |
| ))); | |
| const std::string kernel_src = generate_kernels(); | |
| program = build_program_from_source(context, device, kernel_src.c_str()); | |
| // FP16 to FP32 kernel | |
| // NOTE(review): the result of this call is not checked (the CL_CHECK variant was disabled) - confirm intended. | |
| convert_row_f16_cl = clCreateKernel(program, "convert_row_f16", &err); | |
| // Dequantize kernels | |
| CL_CHECK((dequantize_row_q4_0_cl = clCreateKernel(program, "dequantize_row_q4_0", &err), err)); | |
| CL_CHECK((dequantize_row_q4_1_cl = clCreateKernel(program, "dequantize_row_q4_1", &err), err)); | |
| CL_CHECK((dequantize_row_q5_0_cl = clCreateKernel(program, "dequantize_row_q5_0", &err), err)); | |
| CL_CHECK((dequantize_row_q5_1_cl = clCreateKernel(program, "dequantize_row_q5_1", &err), err)); | |
| CL_CHECK((dequantize_row_q8_0_cl = clCreateKernel(program, "dequantize_row_q8_0", &err), err)); | |
| CL_CHECK((dequantize_block_q2_k_cl = clCreateKernel(program, "dequantize_block_q2_K", &err), err)); | |
| CL_CHECK((dequantize_block_q3_k_cl = clCreateKernel(program, "dequantize_block_q3_K", &err), err)); | |
| CL_CHECK((dequantize_block_q4_k_cl = clCreateKernel(program, "dequantize_block_q4_K", &err), err)); | |
| CL_CHECK((dequantize_block_q5_k_cl = clCreateKernel(program, "dequantize_block_q5_K", &err), err)); | |
| CL_CHECK((dequantize_block_q6_k_cl = clCreateKernel(program, "dequantize_block_q6_K", &err), err)); | |
| // dequant mul mat kernel | |
| CL_CHECK((dequantize_mul_mat_vec_q4_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_0", &err), err)); | |
| CL_CHECK((dequantize_mul_mat_vec_q4_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_1", &err), err)); | |
| CL_CHECK((dequantize_mul_mat_vec_q5_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_0", &err), err)); | |
| CL_CHECK((dequantize_mul_mat_vec_q5_1_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_1", &err), err)); | |
| CL_CHECK((dequantize_mul_mat_vec_q8_0_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q8_0", &err), err)); | |
| CL_CHECK((convert_mul_mat_vec_f16_cl = clCreateKernel(program, "convert_mul_mat_vec_f16", &err), err)); | |
| CL_CHECK((dequantize_mul_mat_vec_q2_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q2_K", &err), err)); | |
| CL_CHECK((dequantize_mul_mat_vec_q3_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q3_K", &err), err)); | |
| CL_CHECK((dequantize_mul_mat_vec_q4_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q4_K", &err), err)); | |
| CL_CHECK((dequantize_mul_mat_vec_q5_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q5_K", &err), err)); | |
| CL_CHECK((dequantize_mul_mat_vec_q6_K_cl = clCreateKernel(program, "dequantize_mul_mat_vec_q6_K", &err), err)); | |
| // mul kernel | |
| CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err)); | |
| } | |
| // Looks up the kernel handle that converts tensors of `type` to fp32. | |
| // Returns a pointer to the file-scope handle, or nullptr when the type has no conversion kernel. | |
| static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) { | |
| switch (type) { | |
| case GGML_TYPE_Q4_0: return &dequantize_row_q4_0_cl; | |
| case GGML_TYPE_Q4_1: return &dequantize_row_q4_1_cl; | |
| case GGML_TYPE_Q5_0: return &dequantize_row_q5_0_cl; | |
| case GGML_TYPE_Q5_1: return &dequantize_row_q5_1_cl; | |
| case GGML_TYPE_Q8_0: return &dequantize_row_q8_0_cl; | |
| case GGML_TYPE_Q2_K: return &dequantize_block_q2_k_cl; | |
| case GGML_TYPE_Q3_K: return &dequantize_block_q3_k_cl; | |
| case GGML_TYPE_Q4_K: return &dequantize_block_q4_k_cl; | |
| case GGML_TYPE_Q5_K: return &dequantize_block_q5_k_cl; | |
| case GGML_TYPE_Q6_K: return &dequantize_block_q6_k_cl; | |
| case GGML_TYPE_F16: return &convert_row_f16_cl; | |
| default: break; | |
| } | |
| return nullptr; | |
| } | |
| // Per-type divisor for the global work size: 8 for Q4_K, 4 for the other K-quant types, | |
| // and 1 for everything else (including F16 and unknown types). | |
| static size_t ggml_cl_global_denom(ggml_type type) { | |
| switch (type) { | |
| case GGML_TYPE_Q4_K: | |
| return 8; | |
| case GGML_TYPE_Q2_K: | |
| case GGML_TYPE_Q3_K: | |
| case GGML_TYPE_Q5_K: | |
| case GGML_TYPE_Q6_K: | |
| return 4; | |
| default: | |
| return 1; | |
| } | |
| } | |
| // Per-type local work size: 32 for Q4_K, 64 for the other K-quant types, | |
| // and 0 for everything else (including F16 and unknown types). | |
| static size_t ggml_cl_local_size(ggml_type type) { | |
| switch (type) { | |
| case GGML_TYPE_Q4_K: | |
| return 32; | |
| case GGML_TYPE_Q2_K: | |
| case GGML_TYPE_Q3_K: | |
| case GGML_TYPE_Q5_K: | |
| case GGML_TYPE_Q6_K: | |
| return 64; | |
| default: | |
| return 0; | |
| } | |
| } | |
| // Looks up the fused dequantize + mat-vec kernel handle for tensors of `type`. | |
| // Returns a pointer to the file-scope handle, or nullptr when the type has no such kernel. | |
| static cl_kernel* ggml_get_dequantize_mul_mat_vec_cl(ggml_type type) { | |
| switch (type) { | |
| case GGML_TYPE_Q4_0: return &dequantize_mul_mat_vec_q4_0_cl; | |
| case GGML_TYPE_Q4_1: return &dequantize_mul_mat_vec_q4_1_cl; | |
| case GGML_TYPE_Q5_0: return &dequantize_mul_mat_vec_q5_0_cl; | |
| case GGML_TYPE_Q5_1: return &dequantize_mul_mat_vec_q5_1_cl; | |
| case GGML_TYPE_Q8_0: return &dequantize_mul_mat_vec_q8_0_cl; | |
| case GGML_TYPE_F16: return &convert_mul_mat_vec_f16_cl; | |
| case GGML_TYPE_Q2_K: return &dequantize_mul_mat_vec_q2_K_cl; | |
| case GGML_TYPE_Q3_K: return &dequantize_mul_mat_vec_q3_K_cl; | |
| case GGML_TYPE_Q4_K: return &dequantize_mul_mat_vec_q4_K_cl; | |
| case GGML_TYPE_Q5_K: return &dequantize_mul_mat_vec_q5_K_cl; | |
| case GGML_TYPE_Q6_K: return &dequantize_mul_mat_vec_q6_K_cl; | |
| default: break; | |
| } | |
| return nullptr; | |
| } | |
| // buffer pool for cl | |
| #define MAX_CL_BUFFERS 256 | |
| // Scope guard around a std::atomic_flag used as a spin lock: the constructor busy-waits until it | |
| // wins the flag, the destructor clears it. Copying is disabled. | |
| struct scoped_spin_lock { | |
| std::atomic_flag& lock; | |
| scoped_spin_lock(std::atomic_flag& flag) : lock(flag) { | |
| // test_and_set returns the previous value: true means the flag was already held | |
| while (lock.test_and_set(std::memory_order_acquire)) { | |
| } | |
| } | |
| scoped_spin_lock(const scoped_spin_lock&) = delete; | |
| scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; | |
| ~scoped_spin_lock() { | |
| lock.clear(std::memory_order_release); | |
| } | |
| }; | |
// One slot of the device-buffer pool. size == 0 marks the slot as empty; otherwise mem is a cached,
// currently unused buffer of `size` bytes (see ggml_cl_pool_malloc / ggml_cl_pool_free).
| struct cl_buffer { | |
| cl_mem mem; | |
| size_t size = 0; | |
| }; | |
// The pool itself and the spin-lock flag that both pool functions take before touching it.
| static cl_buffer g_cl_buffer_pool[MAX_CL_BUFFERS]; | |
| static std::atomic_flag g_cl_pool_lock = ATOMIC_FLAG_INIT; | |
| // Hands out a device buffer of at least `size` bytes, preferring a cached one from the pool. | |
| // size : minimum number of bytes required | |
| // actual_size : receives the real size of the returned buffer (pass it back to ggml_cl_pool_free) | |
| // Strategy: reuse the smallest cached buffer that is large enough; if none fits, release the largest | |
| // cached buffer (if any) and create a new buffer of exactly `size` bytes. Holds the pool lock throughout. | |
| static cl_mem ggml_cl_pool_malloc(size_t size, size_t * actual_size) { | |
| scoped_spin_lock lock(g_cl_pool_lock); | |
| int fit_idx = -1; // smallest cached buffer that is big enough | |
| size_t fit_size = std::numeric_limits<size_t>::max(); | |
| int largest_idx = -1; // largest cached buffer overall | |
| size_t largest_size = 0; | |
| for (int i = 0; i < MAX_CL_BUFFERS; ++i) { | |
| const size_t cached = g_cl_buffer_pool[i].size; | |
| if (cached == 0) { | |
| continue; // empty slot | |
| } | |
| if (cached >= size && cached < fit_size) { | |
| fit_idx = i; | |
| fit_size = cached; | |
| } | |
| if (cached > largest_size) { | |
| largest_idx = i; | |
| largest_size = cached; | |
| } | |
| } | |
| if (fit_idx != -1) { | |
| // take the buffer out of the pool by marking its slot empty | |
| cl_buffer& slot = g_cl_buffer_pool[fit_idx]; | |
| *actual_size = slot.size; | |
| slot.size = 0; | |
| return slot.mem; | |
| } | |
| if (largest_idx != -1) { | |
| // nothing fits: drop the largest cached buffer before allocating, to save memory | |
| cl_buffer& slot = g_cl_buffer_pool[largest_idx]; | |
| slot.size = 0; | |
| clReleaseMemObject(slot.mem); | |
| } | |
| cl_int err; | |
| cl_mem mem; | |
| CL_CHECK((mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err), err)); | |
| *actual_size = size; | |
| return mem; | |
| } | |
| // Returns a buffer to the pool: stores it in the first empty slot so it can be reused later. | |
| // If every slot is occupied, a warning is printed and the buffer is released instead. | |
| static void ggml_cl_pool_free(cl_mem mem, size_t size) { | |
| scoped_spin_lock lock(g_cl_pool_lock); | |
| for (cl_buffer& slot : g_cl_buffer_pool) { | |
| if (slot.size != 0) { | |
| continue; // occupied | |
| } | |
| slot.mem = mem; | |
| slot.size = size; | |
| return; | |
| } | |
| fprintf(stderr, "WARNING: cl buffer pool full, increase MAX_CL_BUFFERS\n"); | |
| clReleaseMemObject(mem); | |
| } | |
| // Releases the device buffer attached to a tensor. Only tensors whose backend is GGML_BACKEND_GPU | |
| // are touched; for those, tensor->extra is treated as the cl_mem to release. | |
| void ggml_cl_free_data(const struct ggml_tensor* tensor) { | |
| if (tensor->backend == GGML_BACKEND_GPU) { | |
| clReleaseMemObject((cl_mem)tensor->extra); | |
| } | |
| } | |
// Enqueues a non-blocking host-to-device copy of one 2D slice of `src` (the slice at indices i2, i3)
// into `dst`, starting at byte `offset`.
//   queue  : command queue the writes are enqueued on
//   ev     : optional; when non-null it receives the event of the write (of the last write when
//            several are enqueued)
// Returns the first OpenCL error encountered, or CL_SUCCESS.
// Three layouts are handled, from cheapest to most general:
//   1. rows and elements contiguous (nb0 == ts, nb1 == row_size): a single clEnqueueWriteBuffer
//   2. elements contiguous only (nb0 == ts): one rectangular write using nb1 as the host row pitch
//   3. otherwise: one rectangular write per row, each treating the row as ne0/bs pieces of ts bytes
//      spaced nb0 bytes apart on the host
| static cl_int ggml_cl_h2d_tensor_2d(cl_command_queue queue, cl_mem dst, size_t offset, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cl_event* ev) { | |
| cl_int err; | |
| const uint64_t ne0 = src->ne[0]; | |
| const uint64_t ne1 = src->ne[1]; | |
| const uint64_t nb0 = src->nb[0]; | |
| const uint64_t nb1 = src->nb[1]; | |
| const uint64_t nb2 = src->nb[2]; | |
| const uint64_t nb3 = src->nb[3]; | |
| const enum ggml_type type = src->type; | |
| const size_t ts = ggml_type_size(type); | |
| const size_t bs = ggml_blck_size(type); | |
| const uint64_t row_size = ts*ne0/bs; | |
| const char * x = (const char *) src->data + i2*nb2 + i3*nb3; | |
| if (nb0 == ts && nb1 == row_size) { | |
| return clEnqueueWriteBuffer(queue, dst, CL_FALSE, offset, ne1*row_size, x, 0, NULL, ev); | |
| } | |
| if (nb0 == ts) { | |
| const size_t buffer_origin[3] = { offset, 0, 0 }; | |
| const size_t host_origin[3] = { 0, 0, 0 }; | |
| const size_t region[3] = { row_size, ne1, 1 }; | |
| return clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, row_size, 0, nb1, 0, x, 0, NULL, ev); | |
| } | |
// Per-row path. `events` collects the events of the earlier rows (taken from *ev after each enqueue);
// only the final row's write is given them as a wait list. They are released before returning.
| std::vector<cl_event> events; | |
| if (ev && ne1>1) events.reserve(ne1-1); | |
| for (uint64_t i1 = 0; i1 < ne1; i1++) { | |
| // pretend the row is a matrix with cols=1 | |
| const size_t buffer_origin[3] = { offset + i1*row_size, 0, 0 }; | |
| const size_t host_origin[3] = { 0, 0, 0 }; | |
| const size_t region[3] = { ts, ne0/bs, 1 }; | |
| // if an event is requested, make the last write wait for all previous writes to complete | |
| if (ev && i1) { | |
| events.push_back(*ev); | |
| } | |
| cl_uint nevents = i1 == ne1-1 ? events.size() : 0U; | |
| err = clEnqueueWriteBufferRect(queue, dst, CL_FALSE, buffer_origin, host_origin, region, ts, 0, nb0, 0, x + i1*nb1, nevents, nevents ? events.data() : nullptr, ev); | |
| if (err != CL_SUCCESS) { | |
| for (auto event : events) { | |
| clReleaseEvent(event); | |
| } | |
| return err; | |
| } | |
| } | |
| for (auto event : events) { | |
| CL_CHECK(clReleaseEvent(event)); | |
| } | |
| return CL_SUCCESS; | |
| } | |
| // Element-wise dst = src0 * src1 for fp32 tensors, computed on the device with the mul_f32 kernel. | |
| // src0 : host tensor, uploaded one 2D slice (ne00 x ne01) at a time | |
| // src1 : must already live on the device (asserted); its buffer is src1->extra. Its slices are | |
| // selected with i03 % ne13 and i02 % ne12, and the kernel repeats it every ne10*ne11 elements | |
| // dst : host tensor receiving each result slice via a blocking read | |
| // The two scratch device buffers come from the buffer pool and are returned to it at the end. | |
| static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | |
| GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); | |
| const int64_t ne00 = src0->ne[0]; | |
| const int64_t ne01 = src0->ne[1]; | |
| const int64_t ne02 = src0->ne[2]; | |
| const int64_t ne03 = src0->ne[3]; | |
| const int64_t ne10 = src1->ne[0]; | |
| const int64_t ne11 = src1->ne[1]; | |
| const int64_t ne12 = src1->ne[2]; | |
| const int64_t ne13 = src1->ne[3]; | |
| // byte strides stay size_t: narrowing them to int would corrupt the offsets of very large tensors | |
| const size_t nb2 = dst->nb[2]; | |
| const size_t nb3 = dst->nb[3]; | |
| size_t x_size; | |
| size_t d_size; | |
| cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0 | |
| cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted. | |
| cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst | |
| for (int64_t i03 = 0; i03 < ne03; i03++) { | |
| for (int64_t i02 = 0; i02 < ne02; i02++) { | |
| cl_event ev; | |
| // copy src0 to device | |
| CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev)); | |
| const int64_t i13 = i03%ne13; | |
| const int64_t i12 = i02%ne12; | |
| const int i1 = i13*ne12*ne11 + i12*ne11; | |
| cl_int x_offset = 0; | |
| cl_int y_offset = i1*ne10; | |
| cl_int d_offset = 0; | |
| size_t global = ne00 * ne01; | |
| cl_int ky = ne10 * ne11; | |
| CL_CHECK(clSetKernelArg(mul_f32_cl, 0, sizeof(cl_mem), &d_X)); | |
| CL_CHECK(clSetKernelArg(mul_f32_cl, 1, sizeof(cl_int), &x_offset)); | |
| CL_CHECK(clSetKernelArg(mul_f32_cl, 2, sizeof(cl_mem), &d_Y)); | |
| CL_CHECK(clSetKernelArg(mul_f32_cl, 3, sizeof(cl_int), &y_offset)); | |
| CL_CHECK(clSetKernelArg(mul_f32_cl, 4, sizeof(cl_mem), &d_D)); | |
| CL_CHECK(clSetKernelArg(mul_f32_cl, 5, sizeof(cl_int), &d_offset)); | |
| CL_CHECK(clSetKernelArg(mul_f32_cl, 6, sizeof(cl_int), &ky)); | |
| // the kernel waits on the upload event before running | |
| CL_CHECK(clEnqueueNDRangeKernel(queue, mul_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL)); | |
| CL_CHECK(clReleaseEvent(ev)); | |
| CL_CHECK(clFinish(queue)); | |
| // copy dst to host | |
| float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); | |
| CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL)); | |
| } | |
| } | |
| ggml_cl_pool_free(d_X, x_size); | |
| ggml_cl_pool_free(d_D, d_size); | |
| } | |
| void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { | |
| GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); | |
| ggml_cl_mul_f32(src0, src1, dst); | |
| } | |
| static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | |
| const int64_t ne00 = src0->ne[0]; | |
| const int64_t ne01 = src0->ne[1]; | |
| const int64_t ne02 = src0->ne[2]; | |
| const int64_t ne03 = src0->ne[3]; | |
| const int64_t ne10 = src1->ne[0]; | |
| const int64_t ne11 = src1->ne[1]; | |
| const int64_t ne12 = src1->ne[2]; | |
| const int64_t ne13 = src1->ne[3]; | |
| const int nb2 = dst->nb[2]; | |
| const int nb3 = dst->nb[3]; | |
| const int64_t r2 = ne12 / ne02; | |
| const int64_t r3 = ne13 / ne03; | |
| const float alpha = 1.0f; | |
| const float beta = 0.0f; | |
| const int x_ne = ne01 * ne00; | |
| const int y_ne = ne11 * ne10; | |
| const int d_ne = ne11 * ne01; | |
| size_t x_size; | |
| size_t y_size; | |
| size_t d_size; | |
| cl_mem d_X; | |
| if (src0->backend == GGML_BACKEND_GPU) { // NOLINT | |
| d_X = (cl_mem) src0->extra; | |
| } else { | |
| d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); | |
| } | |
| cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); | |
| cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); | |
| size_t x_offset = 0; | |
| for (int64_t i03 = 0; i03 < ne03; i03++) { | |
| // TODO: copy src0 here when r3>1 | |
| for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { | |
| for (int64_t i02 = 0; i02 < ne02; i02++) { | |
| if (src0->backend == GGML_BACKEND_GPU) { | |
| x_offset = (i03 * ne02 + i02) * x_ne; | |
| } else { | |
| // copy src0 to device | |
| CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); | |
| } | |
| for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { | |
| // copy src1 to device | |
| CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); | |
| CL_CHECK(clFinish(queue)); | |
| // compute | |
| cl_event ev_sgemm; | |
| clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor, | |
| clblast::Transpose::kYes, clblast::Transpose::kNo, | |
| ne01, ne11, ne10, | |
| alpha, | |
| d_X, x_offset, ne00, | |
| d_Y, 0, ne10, | |
| beta, | |
| d_D, 0, ne01, | |
| &queue, &ev_sgemm); | |
| if (status != clblast::StatusCode::kSuccess) { | |
| GGML_ASSERT(false); | |
| } | |
| // copy dst to host | |
| float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); | |
| CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); | |
| } | |
| } | |
| } | |
| } | |
| if (src0->backend != GGML_BACKEND_GPU) { | |
| ggml_cl_pool_free(d_X, x_size); | |
| } | |
| ggml_cl_pool_free(d_Y, y_size); | |
| ggml_cl_pool_free(d_D, d_size); | |
| } | |
| static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { | |
| GGML_ASSERT(fp16_support); | |
| const int64_t ne00 = src0->ne[0]; | |
| const int64_t ne01 = src0->ne[1]; | |
| const int64_t ne02 = src0->ne[2]; | |
| const int64_t ne03 = src0->ne[3]; | |
| const int64_t ne10 = src1->ne[0]; | |
| const int64_t ne11 = src1->ne[1]; | |
| const int64_t ne12 = src1->ne[2]; | |
| const int64_t ne13 = src1->ne[3]; | |
| const int nb10 = src1->nb[0]; | |
| const int nb11 = src1->nb[1]; | |
| const int nb12 = src1->nb[2]; | |
| const int nb13 = src1->nb[3]; | |
| const int nb2 = dst->nb[2]; | |
| const int nb3 = dst->nb[3]; | |
| const int64_t r2 = ne12 / ne02; | |
| const int64_t r3 = ne13 / ne03; | |
| const ggml_fp16_t alpha = ggml_fp32_to_fp16(1.0f); | |
| const ggml_fp16_t beta = ggml_fp32_to_fp16(0.0f); | |
| const int x_ne = ne01 * ne00; | |
| const int y_ne = ne11 * ne10; | |
| const int d_ne = ne11 * ne01; | |
| GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * y_ne); | |
| GGML_ASSERT(wsize >= sizeof(ggml_fp16_t) * d_ne); | |
| ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata; | |
| size_t x_size; | |
| size_t y_size; | |
| size_t d_size; | |
| cl_mem d_X; | |
| if (src0->backend == GGML_BACKEND_GPU) { // NOLINT | |
| d_X = (cl_mem) src0->extra; | |
| } else { | |
| d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size); | |
| } | |
| cl_mem d_Y = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * y_ne, &y_size); | |
| cl_mem d_D = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * d_ne, &d_size); | |
| bool src1_cont_rows = nb10 == sizeof(float); | |
| bool src1_cont_cols = (size_t)nb11 == ne11*sizeof(float); | |
| size_t x_offset = 0; | |
| for (int64_t i03 = 0; i03 < ne03; i03++) { | |
| // TODO: copy src0 here when r3>1 | |
| for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { | |
| for (int64_t i02 = 0; i02 < ne02; i02++) { | |
| if (src0->backend == GGML_BACKEND_GPU) { | |
| x_offset = (i03 * ne02 + i02) * x_ne; | |
| } else { | |
| // copy src0 to device | |
| CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); | |
| } | |
| for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { | |
| // convert src1 to fp16 | |
| // TODO: use multiple threads | |
| char * src1i = (char *) src1->data + i13*nb13 + i12*nb12; | |
| if (src1_cont_rows) { | |
| if (src1_cont_cols) { | |
| ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11); | |
| } | |
| else { | |
| for (int64_t i11 = 0; i11 < ne11; i11++) { | |
| ggml_fp32_to_fp16_row((float *) (src1i + i11*nb11), tmp + i11*ne10, ne10); | |
| } | |
| } | |
| } | |
| else { | |
| for (int64_t i11 = 0; i11 < ne11; i11++) { | |
| for (int64_t i10 = 0; i10 < ne10; i10++) { | |
| // very slow due to no inlining | |
| tmp[i11*ne10 + i10] = ggml_fp32_to_fp16(*(float *) (src1i + i11*nb11 + i10*nb10)); | |
| } | |
| } | |
| } | |
| // copy src1 to device | |
| CL_CHECK(clEnqueueWriteBuffer(queue, d_Y, false, 0, sizeof(ggml_fp16_t) * y_ne, tmp, 0, NULL, NULL)); | |
| CL_CHECK(clFinish(queue)); | |
| // compute | |
| cl_event ev_sgemm; | |
| clblast::StatusCode status = clblast::Gemm<cl_half>(clblast::Layout::kColMajor, | |
| clblast::Transpose::kYes, clblast::Transpose::kNo, | |
| ne01, ne11, ne10, | |
| alpha, | |
| d_X, x_offset, ne00, | |
| d_Y, 0, ne10, | |
| beta, | |
| d_D, 0, ne01, | |
| &queue, &ev_sgemm); | |
| if (status != clblast::StatusCode::kSuccess) { | |
| GGML_ASSERT(false); | |
| } | |
| // copy dst to host, then convert to float | |
| CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); | |
| float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); | |
| ggml_fp16_to_fp32_row(tmp, d, d_ne); | |
| } | |
| } | |
| } | |
| } | |
| if (src0->backend != GGML_BACKEND_GPU) { | |
| ggml_cl_pool_free(d_X, x_size); | |
| } | |
| ggml_cl_pool_free(d_Y, y_size); | |
| ggml_cl_pool_free(d_D, d_size); | |
| } | |
| static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { | |
| const int64_t ne00 = src0->ne[0]; | |
| const int64_t ne01 = src0->ne[1]; | |
| const int64_t ne02 = src0->ne[2]; | |
| const int64_t ne03 = src0->ne[3]; | |
| const int64_t ne10 = src1->ne[0]; | |
| const int64_t ne11 = src1->ne[1]; | |
| const int64_t ne12 = src1->ne[2]; | |
| const int64_t ne13 = src1->ne[3]; | |
| const int nb2 = dst->nb[2]; | |
| const int nb3 = dst->nb[3]; | |
| const ggml_type type = src0->type; | |
| const bool mul_mat_vec = ne11 == 1 && ne00%2 == 0; | |
| const int64_t r2 = ne12 / ne02; | |
| const int64_t r3 = ne13 / ne03; | |
| const float alpha = 1.0f; | |
| const float beta = 0.0f; | |
| const int x_ne = ne01 * ne00; | |
| const int y_ne = ne11 * ne10; | |
| const int d_ne = ne11 * ne01; | |
| const int x_bps = x_ne / ggml_blck_size(type); // blocks per 2D slice | |
| const size_t q_sz = ggml_type_size(type) * x_bps; | |
| size_t x_size; | |
| size_t y_size; | |
| size_t d_size; | |
| size_t q_size; | |
| cl_mem d_X; | |
| if (!mul_mat_vec) { | |
| d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); | |
| } | |
| cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); | |
| cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); | |
| cl_mem d_Q; | |
| if (src0->backend == GGML_BACKEND_CPU) { | |
| d_Q = ggml_cl_pool_malloc(q_sz, &q_size); | |
| } | |
| cl_kernel* to_fp32_cl = ggml_get_to_fp32_cl(type); | |
| cl_kernel* dmmv = ggml_get_dequantize_mul_mat_vec_cl(type); | |
| GGML_ASSERT(to_fp32_cl != nullptr); | |
| const size_t global_denom = ggml_cl_global_denom(type); | |
| const size_t local = mul_mat_vec ? CL_DMMV_LOCAL_SIZE : ggml_cl_local_size(type); | |
| size_t ev_idx = 0; | |
| std::vector<cl_event> events; | |
| for (int64_t i03 = 0; i03 < ne03; i03++) { | |
| // TODO: copy and dequantize src0 here when r3>1 | |
| for (int64_t i13 = i03 * r3, e13 = i13 + r3; i13 < e13; i13++) { | |
| for (int64_t i02 = 0; i02 < ne02; i02++) { | |
| // copy src0 to device if necessary | |
| if (src0->backend == GGML_BACKEND_CPU) { | |
| events.emplace_back(); | |
| CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++)); | |
| } else if (src0->backend == GGML_BACKEND_GPU) { | |
| d_Q = (cl_mem) src0->extra; | |
| } else { | |
| GGML_ASSERT(false); | |
| } | |
| if (!mul_mat_vec) { | |
| // convert src0 to fp32 on device | |
| const size_t global = x_ne / global_denom; | |
| const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; | |
| CL_CHECK(clSetKernelArg(*to_fp32_cl, 0, sizeof(cl_mem), &d_Q)); | |
| CL_CHECK(clSetKernelArg(*to_fp32_cl, 1, sizeof(cl_mem), &d_X)); | |
| CL_CHECK(clEnqueueNDRangeKernel(queue, *to_fp32_cl, 1, &offset, &global, local > 0 ? &local : NULL, events.size(), !events.empty() ? events.data() : NULL, NULL)); | |
| } | |
| for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { | |
| if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel | |
| // copy src1 to device | |
| events.emplace_back(); | |
| CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, events.data() + ev_idx++)); | |
| // compute | |
| const size_t global = ne01 * local; | |
| const size_t offset = src0->backend == GGML_BACKEND_GPU ? (i03 * ne02 + i02) * x_bps : 0; | |
| const cl_int ncols = ne00; | |
| events.emplace_back(); | |
| CL_CHECK(clSetKernelArg(*dmmv, 0, sizeof(cl_mem), &d_Q)); | |
| CL_CHECK(clSetKernelArg(*dmmv, 1, sizeof(float) * local, NULL)); | |
| CL_CHECK(clSetKernelArg(*dmmv, 2, sizeof(cl_mem), &d_Y)); | |
| CL_CHECK(clSetKernelArg(*dmmv, 3, sizeof(cl_mem), &d_D)); | |
| CL_CHECK(clSetKernelArg(*dmmv, 4, sizeof(cl_int), &ncols)); | |
| CL_CHECK(clEnqueueNDRangeKernel(queue, *dmmv, 1, &offset, &global, &local, events.size() - 1, events.data(), events.data() + ev_idx++)); | |
| } else { // CLBlast matrix matrix multiplication | |
| // copy src1 to device | |
| CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); | |
| // wait for conversion | |
| CL_CHECK(clFinish(queue)); | |
| // compute | |
| events.emplace_back(); | |
| clblast::StatusCode status = clblast::Gemm<cl_float>(clblast::Layout::kColMajor, | |
| clblast::Transpose::kYes, clblast::Transpose::kNo, | |
| ne01, ne11, ne10, | |
| alpha, | |
| d_X, 0, ne00, | |
| d_Y, 0, ne10, | |
| beta, | |
| d_D, 0, ne01, | |
| &queue, events.data() + ev_idx++); | |
| if (status != clblast::StatusCode::kSuccess) { | |
| GGML_ASSERT(false); | |
| } | |
| } | |
| // copy dst to host | |
| float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); | |
| CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &events[events.size() - 1], NULL)); | |
| for (auto *event : events) { | |
| clReleaseEvent(event); | |
| } | |
| ev_idx = 0; | |
| events.clear(); | |
| } | |
| } | |
| } | |
| } | |
| if (!mul_mat_vec) { | |
| ggml_cl_pool_free(d_X, x_size); | |
| } | |
| ggml_cl_pool_free(d_Y, y_size); | |
| ggml_cl_pool_free(d_D, d_size); | |
| if (src0->backend == GGML_BACKEND_CPU) { | |
| ggml_cl_pool_free(d_Q, q_size); | |
| } | |
| } | |
| bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { | |
| const int64_t ne10 = src1->ne[0]; | |
| const int64_t ne0 = dst->ne[0]; | |
| const int64_t ne1 = dst->ne[1]; | |
| // TODO: find the optimal values for these | |
| if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && | |
| src1->type == GGML_TYPE_F32 && | |
| dst->type == GGML_TYPE_F32 && | |
| ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_GPU)) { | |
| return true; | |
| } | |
| return false; | |
| } | |
| static bool ggml_cl_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) { | |
| // If device doesn't support FP16 | |
| if (!fp16_support) { | |
| return false; | |
| } | |
| size_t src0_sz = ggml_nbytes(src0); | |
| size_t src1_sz = ggml_nbytes(src1); | |
| // mul_mat_q: src0 is converted to fp32 on device | |
| size_t mul_mat_q_transfer = src0_sz + src1_sz; | |
| // mul_mat_f16: src1 is converted to fp16 on cpu | |
| size_t mul_mat_f16_transfer = src0_sz + sizeof(ggml_fp16_t) * ggml_nelements(src1); | |
| // choose the smaller one to transfer to the device | |
| // TODO: this is not always the best choice due to the overhead of converting to fp16 | |
| return mul_mat_f16_transfer < mul_mat_q_transfer; | |
| } | |
| void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) { | |
| GGML_ASSERT(ggml_cl_can_mul_mat(src0, src1, dst)); | |
| if (src0->type == GGML_TYPE_F32) { | |
| ggml_cl_mul_mat_f32(src0, src1, dst); | |
| } | |
| else if (src0->type == GGML_TYPE_F16) { | |
| if (ggml_cl_mul_mat_use_f16(src0, src1, dst)) { | |
| ggml_cl_mul_mat_f16(src0, src1, dst, wdata, wsize); | |
| } | |
| else { | |
| ggml_cl_mul_mat_q_f32(src0, src1, dst); | |
| } | |
| } | |
| else if (ggml_is_quantized(src0->type)) { | |
| ggml_cl_mul_mat_q_f32(src0, src1, dst); | |
| } | |
| else { | |
| GGML_ASSERT(false); | |
| } | |
| } | |
| size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { | |
| if (src0->type == GGML_TYPE_F16 && ggml_cl_mul_mat_use_f16(src0, src1, dst)) { | |
| return sizeof(ggml_fp16_t) * std::max(src1->ne[0] * src1->ne[1], dst->ne[0] * dst->ne[1]); | |
| } | |
| return 0; | |
| } | |
| void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { | |
| const int64_t ne0 = tensor->ne[0]; | |
| const int64_t ne1 = tensor->ne[1]; | |
| const int64_t ne2 = tensor->ne[2]; | |
| const int64_t ne3 = tensor->ne[3]; | |
| const ggml_type type = tensor->type; | |
| const size_t s_sz = ggml_type_size(type) * (size_t) (ne0 * ne1 / ggml_blck_size(type)); | |
| const size_t q_sz = s_sz * (size_t) (ne2 * ne3); | |
| size_t q_size; | |
| cl_mem dst = ggml_cl_pool_malloc(q_sz, &q_size); | |
| tensor->data = data; | |
| // copy tensor to device | |
| size_t offset = 0; | |
| for (int64_t i3 = 0; i3 < ne3; i3++) { | |
| for (int64_t i2 = 0; i2 < ne2; i2++) { | |
| CL_CHECK(ggml_cl_h2d_tensor_2d(queue, dst, offset, tensor, i3, i2, NULL)); | |
| offset += s_sz; | |
| } | |
| } | |
| CL_CHECK(clFinish(queue)); | |
| tensor->extra = dst; | |
| GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment