Last active
November 1, 2018 04:20
-
-
Save allanmac/8973f01a2e5b2aa5a994 to your computer and use it in GitHub Desktop.
Inspecting Kepler vs. Maxwell integer multiplies. Some performance insights here: https://devtalk.nvidia.com/default/topic/804281/cuda-programming-and-performance/maxwell-integer-mul-mad-instruction-counts/post/4423835/#4423835
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// -*- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no -cubin int_mul.cu" ; -*-

#include <stdint.h>
#include <cuda_fp16.h>

//
// Kernel qualifier shorthands.  The EXTERN_C variant gives kernels
// unmangled names so they're easy to locate in cuobjdump/nvdisasm output.
//
#define KERNEL_QUALIFIERS          __global__
#define KERNEL_QUALIFIERS_EXTERN   extern KERNEL_QUALIFIERS
#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS

//
// Release builds force-inline device helpers; debug builds leave them
// out-of-line so they remain steppable.
//
#ifndef _DEBUG
#define DEVICE_FUNCTION_QUALIFIERS  __device__ __forceinline__
#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
#else
#define DEVICE_FUNCTION_QUALIFIERS  __device__
#define DEVICE_INTRINSIC_QUALIFIERS __device__
#endif

#define DEVICE_STATIC_FUNCTION_QUALIFIERS  static DEVICE_FUNCTION_QUALIFIERS
#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS

#define RESTRICT __restrict__

#define WARP_SIZE 32

//
// Short fixed-width scalar and CUDA vector-type aliases.
//
typedef int16_t  s16;
typedef short2   s16v2;
typedef uint16_t u16;
typedef ushort2  u16v2;
typedef int32_t  s32;
typedef uint32_t u32;
typedef int64_t  s64;

//
// q16: one 16-bit fixed-point lane (used with ".shr15"-scaled vmad ops
// below, i.e. Q-format with a 15-bit fraction -- see mad_q16v2).
// q16v2 packs two lanes into a single 32-bit word so one register can
// feed vmad's .h0/.h1 half-word selects.
//
typedef u16 q16;
union q16v2
{
  u32 lohi;          // both lanes as one 32-bit word
  struct {
    q16 lo;          // low  16 bits (lane 0)
    q16 hi;          // high 16 bits (lane 1)
  };                 // NOTE(review): anonymous struct member is a common
                     // compiler extension accepted by nvcc
};
//
//
//
// mad_q16v2: per-lane fixed-point multiply-add using the scalar video
// instruction vmad.u32 with half-word (.h0/.h1) sub-register selects:
// one vmad per lane, result scaled by ">>15" (the .shr15 mode shifts
// the final sum).
//
// NOTE(review): the 16-bit addend is summed *before* the >>15 scale,
// so only its high bits survive into the result -- confirm this is the
// intended Q15 rounding/addend semantics.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
q16v2
mad_q16v2(union q16v2 a, union q16v2 b, union q16v2 c)
{
  u32 d,e;
  asm("vmad.u32.u32.u32.shr15 %0, %1.h0, %2.h0, %3;" : "=r"(d) : "r"(a.lohi), "r"(b.lohi), "r"((u32)c.lo));
  asm("vmad.u32.u32.u32.shr15 %0, %1.h1, %2.h1, %3;" : "=r"(e) : "r"(a.lohi), "r"(b.lohi), "r"((u32)c.hi));
  q16v2 r;
  r.lo = d;      // truncated to 16 bits on assignment
  r.hi = e;
  return r;
}
//
//
//
// fma_half2: packed half2 fused multiply-add, per lane r = a*b + c.
// Native HFMA2 (__hfma2) requires sm_53+; on older targets each lane
// is promoted to float, fma'd, and repacked round-to-nearest.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
half2
fma_half2(half2 a, half2 b, half2 c)
{
#if __CUDA_ARCH__ >= 530
  return __hfma2(a,b,c);
#else
  // __floats2half2_rn(lo, hi): first argument lands in the low lane.
  return __floats2half2_rn(fmaf( __low2float(a), __low2float(b), __low2float(c)),
                           fmaf(__high2float(a),__high2float(b),__high2float(c)));
#endif
}
//
//
//
// mad_u16v2: per-lane d = (a*b + c) >> 15 built from two 16x16->32
// unsigned wide mads (PTX mad.wide.u16).  Note the single 32-bit
// addend c feeds BOTH lanes, and each shifted result is truncated to
// 16 bits on assignment to the ushort2 lane.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
u16v2
mad_u16v2(u16v2 a, u16v2 b, u32 c)
{
  u16v2 r;
  {
    u32 d;
    asm("mad.wide.u16 %0, %1, %2, %3;" : "=r"(d) : "h"(a.x), "h"(b.x), "r"(c));
    r.x = d >> 15;
  }
  {
    u32 d;
    asm("mad.wide.u16 %0, %1, %2, %3;" : "=r"(d) : "h"(a.y), "h"(b.y), "r"(c));
    r.y = d >> 15;
  }
  return r;
}
//
//
//
// 16x16 -> 32-bit signed wide multiply: d = (s32)a * (s32)b
// (PTX mul.wide.s16; "h" constraints bind 16-bit registers).
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mul_wide_s16(s16 a, s16 b)
{
  s32 d;
  asm("mul.wide.s16 %0, %1, %2;" : "=r"(d) : "h"(a), "h"(b));
  return d;
}
// 16x16+32 -> 32-bit signed wide multiply-add: d = (s32)a * (s32)b + c.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mad_wide_s16(s16 a, s16 b, s32 c)
{
  s32 d;
  asm("mad.wide.s16 %0, %1, %2, %3;" : "=r"(d) : "h"(a), "h"(b), "r"(c));
  return d;
}
//
//
//
// 32 x 16-bit multiply expressed in plain C so the compiler picks the
// instruction sequence (the point of this file is to inspect what it
// emits).  The s16 operand is promoted to int before the multiply.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mul_s32_s16(s32 a, s16 b)
{
  const s32 widened = b;   // integer promotion, made explicit
  return a * widened;
}
// 32 x 16-bit multiply-add in plain C: returns a*b + c.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mad_s32_s16(s32 a, s16 b, s32 c)
{
  const s32 product = a * (s32)b;
  return product + c;
}
//
//
//
// 32x32 -> low-32-bit signed multiply (PTX mul.lo.s32), pinned via
// inline asm so the exact PTX op -- not the compiler's choice -- is
// what gets lowered to SASS.
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mul_lo_s32(s32 a, s32 b)
{
  s32 d;
  asm("mul.lo.s32 %0, %1, %2;" : "=r"(d) : "r"(a), "r"(b));
  return d;
}
// 32x32+32 -> low-32-bit signed multiply-add (PTX mad.lo.s32).
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s32
mad_lo_s32(s32 a, s32 b, s32 c)
{
  s32 d;
  asm("mad.lo.s32 %0, %1, %2, %3;" : "=r"(d) : "r"(a), "r"(b), "r"(c));
  return d;
}
//
//
//
// 32x32 -> 64-bit signed wide multiply (PTX mul.wide.s32; "l"
// constraint binds a 64-bit register for the result).
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mul_wide_s32(s32 a, s32 b)
{
  s64 d;
  asm("mul.wide.s32 %0, %1, %2;" : "=l"(d) : "r"(a), "r"(b));
  return d;
}
// 32x32+64 -> 64-bit signed wide multiply-add (PTX mad.wide.s32).
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mad_wide_s32(s32 a, s32 b, s64 c)
{
  s64 d;
  asm("mad.wide.s32 %0, %1, %2, %3;" : "=l"(d) : "r"(a), "r"(b), "l"(c));
  return d;
}
//
//
//
// 64x64 -> low-64-bit signed multiply (PTX mul.lo.s64) -- expanded by
// ptxas into a sequence of 32-bit ops (see the XMAD dump below).
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mul_s64(s64 a, s64 b)
{
  s64 d;
  asm("mul.lo.s64 %0, %1, %2;" : "=l"(d) : "l"(a), "l"(b));
  return d;
}
// 64x64+64 -> low-64-bit signed multiply-add (PTX mad.lo.s64).
DEVICE_STATIC_INTRINSIC_QUALIFIERS
s64
mad_s64(s64 a, s64 b, s64 c)
{
  s64 d;
  asm("mad.lo.s64 %0, %1, %2, %3;" : "=l"(d) : "l"(a), "l"(b), "l"(c));
  return d;
}
//
//
//
// One thread per element: d[i] = mad_q16v2(a[i], b[i], c[i]).
// Inspection kernel -- 1-D block indexing only, no bounds check.
KERNEL_QUALIFIERS_EXTERN_C
void
mad_q16v2_kernel(const union q16v2* const RESTRICT a,
                 const union q16v2* const RESTRICT b,
                 const union q16v2* const RESTRICT c,
                 union q16v2* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mad_q16v2(a[i], b[i], c[i]);
}
//
//
//
// One thread per element: d[i] = fma_half2(a[i], b[i], c[i]).
// Inspection kernel -- 1-D block indexing only, no bounds check.
KERNEL_QUALIFIERS_EXTERN_C
void
fma_half2_kernel(const half2* const RESTRICT a,
                 const half2* const RESTRICT b,
                 const half2* const RESTRICT c,
                 half2* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = fma_half2(a[i], b[i], c[i]);
}
//
//
//
// One thread per element: d[i] = mad_u16v2(a[i], b[i], c[i]).
// Inspection kernel -- 1-D block indexing only, no bounds check.
KERNEL_QUALIFIERS_EXTERN_C
void
mad_u16v2_kernel(const u16v2* const RESTRICT a,
                 const u16v2* const RESTRICT b,
                 const u32* const RESTRICT c,
                 u16v2* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mad_u16v2(a[i], b[i], c[i]);
}
//
//
//
// One thread per element: d[i] = mul_wide_s16(a[i], b[i]).
// Inspection kernel -- 1-D block indexing only, no bounds check.
KERNEL_QUALIFIERS_EXTERN_C
void
mul_wide_s16_kernel(const s16* const RESTRICT a,
                    const s16* const RESTRICT b,
                    s32* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mul_wide_s16(a[i], b[i]);
}
// One thread per element: d[i] = mad_wide_s16(a[i], b[i], c[i]).
KERNEL_QUALIFIERS_EXTERN_C
void
mad_wide_s16_kernel(const s16* const RESTRICT a,
                    const s16* const RESTRICT b,
                    const s32* const RESTRICT c,
                    s32* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mad_wide_s16(a[i], b[i], c[i]);
}
//
//
//
// One thread per element: d[i] = mul_s32_s16(a[i], b[i]).
// Inspection kernel -- 1-D block indexing only, no bounds check.
KERNEL_QUALIFIERS_EXTERN_C
void
mul_s32_s16_kernel(const s32* const RESTRICT a,
                   const s16* const RESTRICT b,
                   s32* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mul_s32_s16(a[i], b[i]);
}
// One thread per element: d[i] = mad_s32_s16(a[i], b[i], c[i]).
KERNEL_QUALIFIERS_EXTERN_C
void
mad_s32_s16_kernel(const s32* const RESTRICT a,
                   const s16* const RESTRICT b,
                   const s32* const RESTRICT c,
                   s32* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mad_s32_s16(a[i], b[i], c[i]);
}
//
//
//
// One thread per element: d[i] = mul_lo_s32(a[i], b[i]).
// Inspection kernel -- 1-D block indexing only, no bounds check.
KERNEL_QUALIFIERS_EXTERN_C
void
mul_lo_s32_kernel(const s32* const RESTRICT a,
                  const s32* const RESTRICT b,
                  s32* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mul_lo_s32(a[i], b[i]);
}
// One thread per element: d[i] = mad_lo_s32(a[i], b[i], c[i]).
KERNEL_QUALIFIERS_EXTERN_C
void
mad_lo_s32_kernel(const s32* const RESTRICT a,
                  const s32* const RESTRICT b,
                  const s32* const RESTRICT c,
                  s32* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mad_lo_s32(a[i], b[i], c[i]);
}
//
//
//
// One thread per element: d[i] = mul_wide_s32(a[i], b[i]).
// Inspection kernel -- 1-D block indexing only, no bounds check.
KERNEL_QUALIFIERS_EXTERN_C
void
mul_wide_s32_kernel(const s32* const RESTRICT a,
                    const s32* const RESTRICT b,
                    s64* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mul_wide_s32(a[i], b[i]);
}
// One thread per element: d[i] = mad_wide_s32(a[i], b[i], c[i]).
KERNEL_QUALIFIERS_EXTERN_C
void
mad_wide_s32_kernel(const s32* const RESTRICT a,
                    const s32* const RESTRICT b,
                    const s64* const RESTRICT c,
                    s64* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mad_wide_s32(a[i], b[i], c[i]);
}
//
//
//
// One thread per element: d[i] = mul_s64(a[i], b[i]).
// Inspection kernel -- 1-D block indexing only, no bounds check.
KERNEL_QUALIFIERS_EXTERN_C
void
mul_s64_kernel(const s64* const RESTRICT a,
               const s64* const RESTRICT b,
               s64* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mul_s64(a[i], b[i]);
}
// One thread per element: d[i] = mad_s64(a[i], b[i], c[i]).
KERNEL_QUALIFIERS_EXTERN_C
void
mad_s64_kernel(const s64* const RESTRICT a,
               const s64* const RESTRICT b,
               const s64* const RESTRICT c,
               s64* const RESTRICT d)
{
  const unsigned int i = threadIdx.x;

  d[i] = mad_s64(a[i], b[i], c[i]);
}
//
//
//
Maxwell v2 (sm_52)
code for sm_52
Function : mul_wide_s32_kernel
/*0038*/ XMAD R3, R1.reuse, R0.reuse, RZ;
/*0048*/ XMAD.MRG R4, R1.reuse, R0.H1.reuse, RZ;
/*0050*/ XMAD.U16.S16 R5, R1.reuse, R0.H1.reuse, RZ;
/*0058*/ XMAD.S16.S16.CSFU R6, R1.H1.reuse, R0.H1.reuse, RZ;
/*0068*/ XMAD.S16.U16.CHI R7, R1.H1.reuse, R0, R3.reuse;
/*0070*/ XMAD.PSL.CBCC R0, R1.H1, R4.H1, R3;
/*0088*/ IADD3.RS R1, R7, R5, R6;
....................................
Function : mul_lo_s32_kernel
/*0048*/ XMAD R2, R0.reuse, R1.reuse, RZ;
/*0050*/ XMAD.MRG R4, R0.reuse, R1.H1, RZ;
/*0058*/ XMAD.PSL.CBCC R2, R0.H1, R4.H1, R2;
..................................
Function : mul_s64_kernel
/*0038*/ XMAD R2, R10.reuse, R0.reuse, RZ;
/*0048*/ XMAD R6, R11.reuse, R0.reuse, RZ;
/*0050*/ XMAD.MRG R7, R11.reuse, R0.H1.reuse, RZ;
/*0058*/ XMAD R3, R10.reuse, R0.H1.reuse, RZ;
/*0068*/ XMAD R4, R10.H1.reuse, R0.H1.reuse, RZ;
/*0070*/ XMAD R8, R10.reuse, R1.reuse, RZ;
/*0078*/ XMAD.MRG R9, R10.reuse, R1.H1, RZ;
/*0088*/ XMAD.CHI R5, R10.H1.reuse, R0.reuse, R2.reuse;
/*0090*/ XMAD.MRG R1, R10.reuse, R0.H1, RZ;
/*0098*/ XMAD.PSL.CBCC R6, R11.H1, R7.H1, R6;
/*00a8*/ XMAD.PSL.CBCC R7, R10.H1.reuse, R9.H1, R8;
/*00b0*/ IADD3.RS R3, R5, R3, R4;
/*00b8*/ XMAD.PSL.CBCC R0, R10.H1, R1.H1, R2;
/*00d0*/ IADD3 R1, R7, R3, R6;
...............................
Function : mad_wide_s32_kernel
/*0050*/ XMAD R4, R7.reuse, R2.reuse, RZ;
/*0058*/ XMAD.MRG R3, R7.reuse, R2.H1.reuse, RZ;
/*0068*/ XMAD.PSL.CBCC R3, R7.H1.reuse, R3.H1, R4.reuse;
/*0070*/ XMAD.U16.S16 R5, R7.reuse, R2.H1.reuse, RZ;
/*0078*/ XMAD.S16.S16.CSFU R6, R7.H1.reuse, R2.H1.reuse, RZ;
/*0088*/ IADD R0.CC, R0, R3;
/*0090*/ XMAD.S16.U16.CHI R3, R7.H1, R2, R4;
/*0098*/ IADD3.RS R3, R3, R5, R6;
/*00b0*/ IADD.X R1, R1, R3;
....................................
Function : mad_lo_s32_kernel
/*0058*/ XMAD.MRG R5, R0.reuse, R1.H1.reuse, RZ;
/*0068*/ XMAD R3, R0.reuse, R1, R2;
/*0070*/ XMAD.PSL.CBCC R3, R0.H1, R5.H1, R3;
..................................
Function : mad_s64_kernel
/*0050*/ XMAD R4, R12.reuse, R0.reuse, RZ;
/*0058*/ XMAD.MRG R5, R12.reuse, R0.H1.reuse, RZ;
/*0068*/ XMAD R6, R12.reuse, R0.reuse, RZ;
/*0070*/ XMAD R7, R13.reuse, R0.reuse, RZ;
/*0078*/ XMAD.MRG R8, R13.reuse, R0.H1.reuse, RZ;
/*0088*/ XMAD R9, R12.reuse, R1.reuse, RZ;
/*0090*/ XMAD.MRG R10, R12.reuse, R1.H1, RZ;
/*0098*/ XMAD.PSL.CBCC R4, R12.H1.reuse, R5.H1, R4;
/*00a8*/ XMAD R1, R12.reuse, R0.H1.reuse, RZ;
/*00b0*/ XMAD R5, R12.H1.reuse, R0.H1.reuse, RZ;
/*00b8*/ XMAD.CHI R14, R12.H1.reuse, R0, R6;
/*00c8*/ XMAD.PSL.CBCC R6, R13.H1, R8.H1, R7;
/*00d0*/ XMAD.PSL.CBCC R7, R12.H1, R10.H1, R9;
/*00d8*/ IADD R0.CC, R2, R4;
/*00e8*/ IADD3.RS R1, R14, R1, R5;
/*00f0*/ IADD3 R1, R7, R1, R6;
/*0108*/ IADD.X R1, R3, R1;
...............................
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Kepler (sm_35):