-
-
Save oscarbg/0d3fa07385bef69a8455 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no -cubin int_mul.cu" ; -*- | |
#include <stdint.h> | |
// | |
// | |
// | |
#define KERNEL_QUALIFIERS __global__ | |
#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS | |
#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS | |
// | |
// | |
// | |
#ifndef _DEBUG | |
#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__ | |
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__ | |
#else | |
#define DEVICE_FUNCTION_QUALIFIERS __device__ | |
#define DEVICE_INTRINSIC_QUALIFIERS __device__ | |
#endif | |
// | |
// | |
// | |
#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS | |
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS | |
// | |
// | |
// | |
#define RESTRICT __restrict__ | |
// | |
// | |
// | |
#define WARP_SIZE 32 | |
// | |
// | |
// | |
typedef int16_t s16; | |
typedef short2 s16v2; | |
typedef int32_t s32; | |
typedef uint32_t u32; | |
typedef int64_t s64; | |
// | |
// | |
// | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
s32 | |
mul_wide_s16(s16 a, s16 b) | |
{ | |
s32 d; | |
asm("mul.wide.s16 %0, %1, %2;" : "=r"(d) : "h"(a), "h"(b)); | |
return d; | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
s32 | |
mad_wide_s16(s16 a, s16 b, s32 c) | |
{ | |
s32 d; | |
asm("mad.wide.s16 %0, %1, %2, %3;" : "=r"(d) : "h"(a), "h"(b), "r"(c)); | |
return d; | |
} | |
// | |
// | |
// | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
s32 | |
mul_s32_s16(s32 a, s16 b) | |
{ | |
return a * b; | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
s32 | |
mad_s32_s16(s32 a, s16 b, s32 c) | |
{ | |
return a * b + c; | |
} | |
// | |
// | |
// | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
s32 | |
mul_lo_s32(s32 a, s32 b) | |
{ | |
s32 d; | |
asm("mul.lo.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b)); | |
return d; | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
s32 | |
mad_lo_s32(s32 a, s32 b, s32 c) | |
{ | |
s32 d; | |
asm("mad.lo.s32 %0, %1, %2, %3;" : "=r"(d) : "r"(a), "r"(b), "r"(c)); | |
return d; | |
} | |
// | |
// | |
// | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
s64 | |
mul_wide_s32(s32 a, s32 b) | |
{ | |
s64 d; | |
asm("mul.wide.s32 %0, %1, %2;" : "=l"(d) : "r"(a), "r"(b)); | |
return d; | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
s64 | |
mad_wide_s32(s32 a, s32 b, s64 c) | |
{ | |
s64 d; | |
asm("mad.wide.s32 %0, %1, %2, %3;" : "=l"(d) : "r"(a), "r"(b), "l"(c)); | |
return d; | |
} | |
// | |
// | |
// | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
s64 | |
mul_s64(s64 a, s64 b) | |
{ | |
s64 d; | |
asm("mul.lo.s64 %0, %1, %2;" : "=l"(d) : "l"(a), "l"(b)); | |
return d; | |
} | |
DEVICE_STATIC_INTRINSIC_QUALIFIERS | |
s64 | |
mad_s64(s64 a, s64 b, s64 c) | |
{ | |
s64 d; | |
asm("mad.lo.s64 %0, %1, %2, %3;" : "=l"(d) : "l"(a), "l"(b), "l"(c)); | |
return d; | |
} | |
// | |
// | |
// | |
/* | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mul_s16v2_kernel(const s16v2* const RESTRICT a, | |
const s16v2* const RESTRICT b, | |
s16v2* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mul_s16v2(a[threadIdx.x], | |
b[threadIdx.x]); | |
} | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mad_s16v2_kernel(const s16v2* const RESTRICT a, | |
const s16v2* const RESTRICT b, | |
const s16v2* const RESTRICT c, | |
s16v2* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mad_s16v2(a[threadIdx.x], | |
b[threadIdx.x], | |
c[threadIdx.x]); | |
} | |
*/ | |
// | |
// | |
// | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mul_wide_s16_kernel(const s16* const RESTRICT a, | |
const s16* const RESTRICT b, | |
s32* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mul_wide_s16(a[threadIdx.x], | |
b[threadIdx.x]); | |
} | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mad_wide_s16_kernel(const s16* const RESTRICT a, | |
const s16* const RESTRICT b, | |
const s32* const RESTRICT c, | |
s32* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mad_wide_s16(a[threadIdx.x], | |
b[threadIdx.x], | |
c[threadIdx.x]); | |
} | |
// | |
// | |
// | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mul_s32_s16_kernel(const s32* const RESTRICT a, | |
const s16* const RESTRICT b, | |
s32* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mul_s32_s16(a[threadIdx.x], | |
b[threadIdx.x]); | |
} | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mad_s32_s16_kernel(const s32* const RESTRICT a, | |
const s16* const RESTRICT b, | |
const s32* const RESTRICT c, | |
s32* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mad_s32_s16(a[threadIdx.x], | |
b[threadIdx.x], | |
c[threadIdx.x]); | |
} | |
// | |
// | |
// | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mul_lo_s32_kernel(const s32* const RESTRICT a, | |
const s32* const RESTRICT b, | |
s32* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mul_lo_s32(a[threadIdx.x], | |
b[threadIdx.x]); | |
} | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mad_lo_s32_kernel(const s32* const RESTRICT a, | |
const s32* const RESTRICT b, | |
const s32* const RESTRICT c, | |
s32* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mad_lo_s32(a[threadIdx.x], | |
b[threadIdx.x], | |
c[threadIdx.x]); | |
} | |
// | |
// | |
// | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mul_wide_s32_kernel(const s32* const RESTRICT a, | |
const s32* const RESTRICT b, | |
s64* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mul_wide_s32(a[threadIdx.x], | |
b[threadIdx.x]); | |
} | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mad_wide_s32_kernel(const s32* const RESTRICT a, | |
const s32* const RESTRICT b, | |
const s64* const RESTRICT c, | |
s64* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mad_wide_s32(a[threadIdx.x], | |
b[threadIdx.x], | |
c[threadIdx.x]); | |
} | |
// | |
// | |
// | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mul_s64_kernel(const s64* const RESTRICT a, | |
const s64* const RESTRICT b, | |
s64* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mul_s64(a[threadIdx.x], | |
b[threadIdx.x]); | |
} | |
KERNEL_QUALIFIERS_EXTERN_C | |
void | |
mad_s64_kernel(const s64* const RESTRICT a, | |
const s64* const RESTRICT b, | |
const s64* const RESTRICT c, | |
s64* const RESTRICT d) | |
{ | |
d[threadIdx.x] = mad_s64(a[threadIdx.x], | |
b[threadIdx.x], | |
c[threadIdx.x]); | |
} | |
// | |
// | |
// |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment