Last active
August 7, 2023 23:49
-
-
Save funnbot/e807f0d9680dd5fa38648caa030f718e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
// From cuda docs: | |
__device__ int __dp4a ( int srcA, int srcB, int c ) | |
Four-way signedint8 dot product with int32 accumulate. | |
Description | |
Extracts four pairs of packed byte-sized integers from scrA and srcB, then creates four pairwise products and adds them together to a signed 32-bit integer c. | |
*/ | |
static __device__ __forceinline__ int32_t __dp4a(int32_t srcA, int32_t srcB, int32_t c) { | |
return amd_mixed_dot(srcA, srcB, c, /*saturate=*/ true); | |
} | |
// https://github.com/intel/llvm/blob/8e0cc4b7a845df9389a1313a3e680babc4d87782/sycl/source/detail/builtins_integer.cpp#L218 | |
static __device__ __forceinline__ int8_t s_sub_sat(int8_t x, int8_t y) { | |
int8_t result = uint8_t(x) - uint8_t(y); | |
// Saturate result if (+) - (-) = (-) or (-) - (+) = (+). | |
if (((x < 0) ^ (y < 0)) && ((x < 0) ^ (result < 0))) { | |
result = result < 0 ? std::numeric_limits<int8_t>::max() : std::numeric_limits<int8_t>::min(); | |
} | |
return result; | |
} | |
/* | |
// From cuda docs: | |
__device__ unsigned int __vsubss4 ( unsigned int a, unsigned int b ) | |
Performs per-byte subtraction with signed saturation. | |
Returns | |
Returns computed value. | |
Description | |
Splits 4 bytes of each argument into 4 parts, each consisting of 1 byte. For corresponding parts function performs subtraction with signed saturation. Partial results are recombined and returned as unsigned int. | |
*/ | |
static __device__ __forceinline__ uint32_t __vsubss4(uint32_t a, uint32_t b) { | |
// Is this more correct? (Didn't fix mmq) | |
uint32_t r; | |
__asm__("v_sub_nc_i32 %0,%1,%2,clamp;" | |
: "=v"(r) | |
: "r"(a), "r"(b)); | |
return r; | |
// Suggested instruction (Didn't fix mmq) | |
// uint32_t r; | |
// __asm__("v_sub_nc_u32 %0,%1,%2;" | |
// : "=v"(r) | |
// : "r"(a), "r"(b)); | |
// return r; | |
/// Proper hip types (Didn't fix mmq) | |
// char4 a4 = mapFrom<char4, uint32_t>(a); | |
// char4 b4 = mapFrom<char4, uint32_t>(b); | |
// char4 c = char4{ | |
// s_sub_sat(a4.x, b4.x), | |
// s_sub_sat(a4.y, b4.y), | |
// s_sub_sat(a4.z, b4.z), | |
// s_sub_sat(a4.w, b4.w), | |
// }; | |
// return mapFrom<uint32_t, char4>(c); | |
/// Bitwise (Didn't fix mmq) | |
// return ( | |
// (s_sub_sat(a, b) & 0xFF) | | |
// ((s_sub_sat(a >> 8, b >> 8) << 8) & 0xFF00) | | |
// ((s_sub_sat(a >> 16, b >> 16) << 16) & 0xFF0000) | | |
// ((s_sub_sat(a >> 24, b >> 24) << 24) & 0xFF000000) | |
// ); | |
/// From github comment (Didn't fix mmq) | |
// typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); | |
// int8x4_t ap = int8x4_t(a); | |
// int8x4_t bp = int8x4_t(b); | |
// int8x4_t c = { | |
// s_sub_sat(ap.x, bp.x), | |
// s_sub_sat(ap.y, bp.y), | |
// s_sub_sat(ap.z, bp.z), | |
// s_sub_sat(ap.w, bp.w) }; | |
// return *(unsigned int*)(&c); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment