allanmac · November 1, 2018 04:20 · allanmac · Jan 14, 2015
diff --git a/int_mul.cu b/int_mul.cu
 // -*- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no -cubin int_mul.cu" ; -*-

 #include <stdint.h>
 #include <cuda_fp16.h>

 //
 //
 //

 // Qualifiers for kernel entry points. The _EXTERN variant prepends `extern`,
 // the _EXTERN_C variant prepends `extern "C"`.
 #define KERNEL_QUALIFIERS                   __global__
 #define KERNEL_QUALIFIERS_EXTERN            extern     KERNEL_QUALIFIERS
 #define KERNEL_QUALIFIERS_EXTERN_C          extern "C" KERNEL_QUALIFIERS

 //
 // Qualifiers for device functions: __forceinline__ is added unless _DEBUG
 // is defined.
 //

 #ifndef _DEBUG
 #define DEVICE_FUNCTION_QUALIFIERS          __device__ __forceinline__
 #define DEVICE_INTRINSIC_QUALIFIERS         __device__ __forceinline__
 #else
 #define DEVICE_FUNCTION_QUALIFIERS          __device__
 #define DEVICE_INTRINSIC_QUALIFIERS         __device__
 #endif

 //
 // `static` variants of the device qualifiers above.
 //

 #define DEVICE_STATIC_FUNCTION_QUALIFIERS   static DEVICE_FUNCTION_QUALIFIERS
 #define DEVICE_STATIC_INTRINSIC_QUALIFIERS  static DEVICE_INTRINSIC_QUALIFIERS

 //
 // Shorthand for the __restrict__ pointer qualifier.
 //

 #define RESTRICT                            __restrict__

 //
 // Warp width constant (not referenced in the visible part of this file).
 //

 #define WARP_SIZE                           32

 //
 // Short aliases for fixed-width integers and the 2-element 16-bit vectors.
 //

 typedef int16_t  s16;
 typedef short2   s16v2;

 typedef uint16_t u16;
 typedef ushort2  u16v2;

 typedef int32_t  s32;
 typedef uint32_t u32;

 typedef int64_t  s64;

 //
 //
 //

 // 16-bit storage type, an alias of u16.
 // NOTE(review): the name suggests a Q-format fixed-point value — confirm.
 typedef u16 q16;

 // Two q16 values, addressable as one 32-bit word (lohi) or as the
 // separate members lo and hi of the anonymous struct.
 union q16v2
 {
  u32     lohi; // both 16-bit members viewed as a single 32-bit word

  struct {
    q16   lo;   // first 16-bit member; presumably the low half of lohi — TODO confirm layout
    q16   hi;   // second 16-bit member
  };
 };

 //
 //
 //

 // Packed multiply-add on two q16 pairs using one
 // vmad.u32.u32.u32.shr15 instruction per 16-bit half.
 //
 //   a, b : packed operands; the .h0 / .h1 selectors pick the half used
 //   c    : packed addend; each half is widened to u32 before the asm
 //
 // Returns a q16v2 whose lo/hi hold the two 32-bit asm results narrowed
 // to 16 bits.
 // NOTE(review): presumably each half is (a*b + c) >> 15 — confirm
 // against the PTX ISA description of vmad with .shr15.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 q16v2
 mad_q16v2(union q16v2 a, union q16v2 b, union q16v2 c)
 {
  const u32 addend_lo = (u32)c.lo;
  const u32 addend_hi = (u32)c.hi;

  u32 half0;
  u32 half1;

  // .h0 halves of a and b, plus c.lo
  asm("vmad.u32.u32.u32.shr15 %0, %1.h0, %2.h0, %3;"
      : "=r"(half0)
      : "r"(a.lohi), "r"(b.lohi), "r"(addend_lo));

  // .h1 halves of a and b, plus c.hi
  asm("vmad.u32.u32.u32.shr15 %0, %1.h1, %2.h1, %3;"
      : "=r"(half1)
      : "r"(a.lohi), "r"(b.lohi), "r"(addend_hi));

  q16v2 packed;

  packed.lo = half0; // u32 -> q16 narrowing
  packed.hi = half1;

  return packed;
 }

 //
 //
 //

 // Fused multiply-add on half2 operands.
 //
 // When __CUDA_ARCH__ >= 530 this is a single __hfma2(a,b,c). Otherwise
 // each half is converted to float, combined with fmaf(), and the two
 // float results are packed back with __floats2half2_rn().
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 half2
 fma_half2(half2 a, half2 b, half2 c)
 {
 #if __CUDA_ARCH__ >= 530
  return __hfma2(a,b,c);
 #else
  // float fallback, one fmaf per half
  const float fma_lo = fmaf( __low2float(a), __low2float(b), __low2float(c));
  const float fma_hi = fmaf(__high2float(a),__high2float(b),__high2float(c));

  return __floats2half2_rn(fma_lo, fma_hi);
 #endif
 }

 //
 //
 //

 // Component-wise mad.wide.u16 on two u16v2 operands.
 //
 // For each of .x and .y: issue mad.wide.u16 with the 16-bit components
 // of a and b and the 32-bit addend c, shift the 32-bit result right by
 // 15, and store it into the matching 16-bit component of the result.
 // The same addend c is used for both components.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 u16v2
 mad_u16v2(u16v2 a, u16v2 b, u32 c)
 {
  u32 wide_x;
  u32 wide_y;

  asm("mad.wide.u16 %0, %1, %2, %3;"
      : "=r"(wide_x)
      : "h"(a.x), "h"(b.x), "r"(c));

  asm("mad.wide.u16 %0, %1, %2, %3;"
      : "=r"(wide_y)
      : "h"(a.y), "h"(b.y), "r"(c));

  u16v2 out;

  out.x = wide_x >> 15; // narrowed to the 16-bit component on assignment
  out.y = wide_y >> 15;

  return out;
 }

 //
 //
 //

 // Thin wrapper over the PTX instruction mul.wide.s16: two 16-bit
 // register operands ("h") in, one 32-bit register result ("r") out.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s32
 mul_wide_s16(s16 a, s16 b)
 {
  s32 product;

  asm("mul.wide.s16 %0, %1, %2;"
      : "=r"(product)
      : "h"(a), "h"(b));

  return product;
 }

 // Thin wrapper over the PTX instruction mad.wide.s16: 16-bit operands
 // a and b ("h"), 32-bit addend c ("r"), 32-bit result.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s32
 mad_wide_s16(s16 a, s16 b, s32 c)
 {
  s32 sum;

  asm("mad.wide.s16 %0, %1, %2, %3;"
      : "=r"(sum)
      : "h"(a), "h"(b), "r"(c));

  return sum;
 }

 //
 //
 //

 // Plain C++ product of a 32-bit and a 16-bit signed value (no inline
 // PTX); instruction selection is left to the compiler.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s32
 mul_s32_s16(s32 a, s16 b)
 {
  const s32 b_wide = b; // the same s16 -> s32 promotion `a * b` performs implicitly

  return a * b_wide;
 }

 // Plain C++ multiply-add a * b + c with a 16-bit multiplier (no inline
 // PTX); instruction selection is left to the compiler.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s32
 mad_s32_s16(s32 a, s16 b, s32 c)
 {
  const s32 product = a * b;

  return product + c;
 }

 //
 //
 //

 // Thin wrapper over the PTX instruction mul.lo.s32: 32-bit operands in,
 // 32-bit result out.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s32
 mul_lo_s32(s32 a, s32 b)
 {
  s32 product;

  asm("mul.lo.s32 %0, %1, %2;"
      : "=r"(product)
      : "r"(a), "r"(b));

  return product;
 }

 // Thin wrapper over the PTX instruction mad.lo.s32: 32-bit operands
 // a, b and addend c in, 32-bit result out.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s32
 mad_lo_s32(s32 a, s32 b, s32 c)
 {
  s32 sum;

  asm("mad.lo.s32 %0, %1, %2, %3;"
      : "=r"(sum)
      : "r"(a), "r"(b), "r"(c));

  return sum;
 }

 //
 //
 //

 // Thin wrapper over the PTX instruction mul.wide.s32: 32-bit operands
 // ("r") in, 64-bit result ("l") out.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s64
 mul_wide_s32(s32 a, s32 b)
 {
  s64 product;

  asm("mul.wide.s32 %0, %1, %2;"
      : "=l"(product)
      : "r"(a), "r"(b));

  return product;
 }

 // Thin wrapper over the PTX instruction mad.wide.s32: 32-bit operands
 // a and b ("r"), 64-bit addend c ("l"), 64-bit result.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s64
 mad_wide_s32(s32 a, s32 b, s64 c)
 {
  s64 sum;

  asm("mad.wide.s32 %0, %1, %2, %3;"
      : "=l"(sum)
      : "r"(a), "r"(b), "l"(c));

  return sum;
 }

 //
 //
 //

 // Thin wrapper over the PTX instruction mul.lo.s64: 64-bit operands in,
 // 64-bit result out.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s64
 mul_s64(s64 a, s64 b)
 {
  s64 product;

  asm("mul.lo.s64 %0, %1, %2;"
      : "=l"(product)
      : "l"(a), "l"(b));

  return product;
 }

 // Thin wrapper over the PTX instruction mad.lo.s64: 64-bit operands
 // a, b and addend c in, 64-bit result out.
 DEVICE_STATIC_INTRINSIC_QUALIFIERS
 s64
 mad_s64(s64 a, s64 b, s64 c)
 {
  s64 sum;

  asm("mad.lo.s64 %0, %1, %2, %3;"
      : "=l"(sum)
      : "l"(a), "l"(b), "l"(c));

  return sum;
 }

 //
 //
 //

 // Kernel wrapper for mad_q16v2(): each thread computes
 // d[i] = mad_q16v2(a[i], b[i], c[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check, so every array must
 // hold at least as many elements as threads launched per block.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mad_q16v2_kernel(const union q16v2* const RESTRICT a,
                  const union q16v2* const RESTRICT b,
                  const union q16v2* const RESTRICT c,
                  union       q16v2* const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const union q16v2 lhs = a[tid];
  const union q16v2 rhs = b[tid];
  const union q16v2 acc = c[tid];

  d[tid] = mad_q16v2(lhs, rhs, acc);
 }

 //
 //
 //

 // Kernel wrapper for fma_half2(): each thread computes
 // d[i] = fma_half2(a[i], b[i], c[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 fma_half2_kernel(const half2* const RESTRICT a,
                  const half2* const RESTRICT b,
                  const half2* const RESTRICT c,
                        half2* const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const half2 lhs = a[tid];
  const half2 rhs = b[tid];
  const half2 acc = c[tid];

  d[tid] = fma_half2(lhs, rhs, acc);
 }

 //
 //
 //

 // Kernel wrapper for mad_u16v2(): each thread computes
 // d[i] = mad_u16v2(a[i], b[i], c[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mad_u16v2_kernel(const u16v2* const RESTRICT a,
                  const u16v2* const RESTRICT b,
                  const u32*   const RESTRICT c,
                  u16v2*       const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const u16v2 lhs = a[tid];
  const u16v2 rhs = b[tid];
  const u32   acc = c[tid];

  d[tid] = mad_u16v2(lhs, rhs, acc);
 }

 //
 //
 //

 // Kernel wrapper for mul_wide_s16(): each thread computes
 // d[i] = mul_wide_s16(a[i], b[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mul_wide_s16_kernel(const s16* const RESTRICT a,
                     const s16* const RESTRICT b,
                     s32*       const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const s16 lhs = a[tid];
  const s16 rhs = b[tid];

  d[tid] = mul_wide_s16(lhs, rhs);
 }

 // Kernel wrapper for mad_wide_s16(): each thread computes
 // d[i] = mad_wide_s16(a[i], b[i], c[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mad_wide_s16_kernel(const s16* const RESTRICT a,
                     const s16* const RESTRICT b,
                     const s32* const RESTRICT c,
                     s32*       const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const s16 lhs = a[tid];
  const s16 rhs = b[tid];
  const s32 acc = c[tid];

  d[tid] = mad_wide_s16(lhs, rhs, acc);
 }

 //
 //
 //

 // Kernel wrapper for mul_s32_s16(): each thread computes
 // d[i] = mul_s32_s16(a[i], b[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mul_s32_s16_kernel(const s32* const RESTRICT a,
                    const s16* const RESTRICT b,
                    s32*       const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const s32 lhs = a[tid];
  const s16 rhs = b[tid];

  d[tid] = mul_s32_s16(lhs, rhs);
 }

 // Kernel wrapper for mad_s32_s16(): each thread computes
 // d[i] = mad_s32_s16(a[i], b[i], c[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mad_s32_s16_kernel(const s32* const RESTRICT a,
                    const s16* const RESTRICT b,
                    const s32* const RESTRICT c,
                    s32*       const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const s32 lhs = a[tid];
  const s16 rhs = b[tid];
  const s32 acc = c[tid];

  d[tid] = mad_s32_s16(lhs, rhs, acc);
 }

 //
 //
 //

 // Kernel wrapper for mul_lo_s32(): each thread computes
 // d[i] = mul_lo_s32(a[i], b[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mul_lo_s32_kernel(const s32* const RESTRICT a,
                   const s32* const RESTRICT b,
                   s32*       const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const s32 lhs = a[tid];
  const s32 rhs = b[tid];

  d[tid] = mul_lo_s32(lhs, rhs);
 }

 // Kernel wrapper for mad_lo_s32(): each thread computes
 // d[i] = mad_lo_s32(a[i], b[i], c[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mad_lo_s32_kernel(const s32* const RESTRICT a,
                   const s32* const RESTRICT b,
                   const s32* const RESTRICT c,
                   s32*       const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const s32 lhs = a[tid];
  const s32 rhs = b[tid];
  const s32 acc = c[tid];

  d[tid] = mad_lo_s32(lhs, rhs, acc);
 }

 //
 //
 //

 // Kernel wrapper for mul_wide_s32(): each thread computes
 // d[i] = mul_wide_s32(a[i], b[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mul_wide_s32_kernel(const s32* const RESTRICT a,
                     const s32* const RESTRICT b,
                     s64*       const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const s32 lhs = a[tid];
  const s32 rhs = b[tid];

  d[tid] = mul_wide_s32(lhs, rhs);
 }

 // Kernel wrapper for mad_wide_s32(): each thread computes
 // d[i] = mad_wide_s32(a[i], b[i], c[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mad_wide_s32_kernel(const s32* const RESTRICT a,
                     const s32* const RESTRICT b,
                     const s64* const RESTRICT c,
                     s64*       const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const s32 lhs = a[tid];
  const s32 rhs = b[tid];
  const s64 acc = c[tid];

  d[tid] = mad_wide_s32(lhs, rhs, acc);
 }

 //
 //
 //

 // Kernel wrapper for mul_s64(): each thread computes
 // d[i] = mul_s64(a[i], b[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mul_s64_kernel(const s64* const RESTRICT a,
                const s64* const RESTRICT b,
                s64*       const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const s64 lhs = a[tid];
  const s64 rhs = b[tid];

  d[tid] = mul_s64(lhs, rhs);
 }

 // Kernel wrapper for mad_s64(): each thread computes
 // d[i] = mad_s64(a[i], b[i], c[i]) with i = threadIdx.x.
 // blockIdx is not used and there is no bounds check.
 KERNEL_QUALIFIERS_EXTERN_C
 void
 mad_s64_kernel(const s64* const RESTRICT a,
                const s64* const RESTRICT b,
                const s64* const RESTRICT c,
                s64*       const RESTRICT d)
 {
  const unsigned int tid = threadIdx.x;

  const s64 lhs = a[tid];
  const s64 rhs = b[tid];
  const s64 acc = c[tid];

  d[tid] = mad_s64(lhs, rhs, acc);
 }

 //
 //
 //
	// -*- compile-command: "nvcc -m 32 -arch sm_50 -Xptxas=-v,-abi=no -cubin int_mul.cu" ; -*-

	#include <stdint.h>
	#include <cuda_fp16.h>

	//
	//
	//

	// Kernel entry-point qualifiers; the _EXTERN / _EXTERN_C variants add
	// `extern` / `extern "C"` in front of __global__.
	#define KERNEL_QUALIFIERS __global__
	#define KERNEL_QUALIFIERS_EXTERN extern KERNEL_QUALIFIERS
	#define KERNEL_QUALIFIERS_EXTERN_C extern "C" KERNEL_QUALIFIERS

	//
	// Device-function qualifiers; __forceinline__ is dropped when _DEBUG is
	// defined.
	//

	#ifndef _DEBUG
	#define DEVICE_FUNCTION_QUALIFIERS __device__ __forceinline__
	#define DEVICE_INTRINSIC_QUALIFIERS __device__ __forceinline__
	#else
	#define DEVICE_FUNCTION_QUALIFIERS __device__
	#define DEVICE_INTRINSIC_QUALIFIERS __device__
	#endif

	//
	// The same qualifiers with `static` prepended.
	//

	#define DEVICE_STATIC_FUNCTION_QUALIFIERS static DEVICE_FUNCTION_QUALIFIERS
	#define DEVICE_STATIC_INTRINSIC_QUALIFIERS static DEVICE_INTRINSIC_QUALIFIERS

	//
	// Alias for the __restrict__ pointer qualifier.
	//

	#define RESTRICT __restrict__

	//
	// Warp width constant (no use of it is visible in this file).
	//

	#define WARP_SIZE 32

	//
	// Short names for fixed-width integers and 2-element 16-bit vectors.
	//

	typedef int16_t s16;
	typedef short2 s16v2;

	typedef uint16_t u16;
	typedef ushort2 u16v2;

	typedef int32_t s32;
	typedef uint32_t u32;

	typedef int64_t s64;

	//
	//
	//

	// 16-bit storage type, an alias of u16.
	// NOTE(review): the name hints at a Q-format fixed-point value — confirm.
	typedef u16 q16;

	// A pair of q16 values that can also be read or written as one 32-bit
	// word through lohi.
	union q16v2
	{
	u32 lohi; // 32-bit view covering both 16-bit members

	struct {
	q16 lo; // first 16-bit member; presumably the low half of lohi — TODO confirm layout
	q16 hi; // second 16-bit member
	};
	};

	//
	//
	//

	// Multiply-add on packed q16 pairs, issuing vmad.u32.u32.u32.shr15 once
	// with the .h0 selectors and once with the .h1 selectors.
	//
	// a, b are passed as their packed 32-bit words; the matching half of c
	// is widened to u32 and used as the addend. The two 32-bit results are
	// narrowed into the lo/hi members of the returned q16v2.
	// NOTE(review): presumably computes (a*b + c) >> 15 per half — confirm
	// against the PTX ISA entry for vmad.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	q16v2
	mad_q16v2(union q16v2 a, union q16v2 b, union q16v2 c)
	{
	  const u32 c_lo = (u32)c.lo;
	  const u32 c_hi = (u32)c.hi;

	  u32 res_lo;
	  u32 res_hi;

	  // .h0 selectors, addend c.lo
	  asm("vmad.u32.u32.u32.shr15 %0, %1.h0, %2.h0, %3;"
	      : "=r"(res_lo)
	      : "r"(a.lohi), "r"(b.lohi), "r"(c_lo));

	  // .h1 selectors, addend c.hi
	  asm("vmad.u32.u32.u32.shr15 %0, %1.h1, %2.h1, %3;"
	      : "=r"(res_hi)
	      : "r"(a.lohi), "r"(b.lohi), "r"(c_hi));

	  q16v2 result;

	  result.lo = res_lo; // u32 -> q16 narrowing
	  result.hi = res_hi;

	  return result;
	}

	//
	//
	//

	// half2 fused multiply-add.
	//
	// Uses __hfma2(a,b,c) when __CUDA_ARCH__ >= 530. On other targets the
	// low and high halves are converted to float, combined with fmaf(), and
	// packed back into a half2 by __floats2half2_rn().
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	half2
	fma_half2(half2 a, half2 b, half2 c)
	{
	#if __CUDA_ARCH__ >= 530
	  return __hfma2(a,b,c);
	#else
	  // float fallback path
	  const float low_result  = fmaf( __low2float(a), __low2float(b), __low2float(c));
	  const float high_result = fmaf(__high2float(a),__high2float(b),__high2float(c));

	  return __floats2half2_rn(low_result, high_result);
	#endif
	}

	//
	//
	//

	// Per-component mad.wide.u16 on two u16v2 values.
	//
	// Each component pair (a.x,b.x) and (a.y,b.y) goes through mad.wide.u16
	// with the shared 32-bit addend c; the 32-bit result is shifted right by
	// 15 and stored into the corresponding 16-bit component of the result.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	u16v2
	mad_u16v2(u16v2 a, u16v2 b, u32 c)
	{
	  u32 acc_x;
	  u32 acc_y;

	  asm("mad.wide.u16 %0, %1, %2, %3;"
	      : "=r"(acc_x)
	      : "h"(a.x), "h"(b.x), "r"(c));

	  asm("mad.wide.u16 %0, %1, %2, %3;"
	      : "=r"(acc_y)
	      : "h"(a.y), "h"(b.y), "r"(c));

	  u16v2 result;

	  result.x = acc_x >> 15; // narrowed to 16 bits on assignment
	  result.y = acc_y >> 15;

	  return result;
	}

	//
	//
	//

	// Emits the PTX instruction mul.wide.s16 on two 16-bit register operands
	// and returns its 32-bit register result.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s32
	mul_wide_s16(s16 a, s16 b)
	{
	  s32 result;

	  asm("mul.wide.s16 %0, %1, %2;"
	      : "=r"(result)
	      : "h"(a), "h"(b));

	  return result;
	}

	// Emits the PTX instruction mad.wide.s16 with 16-bit operands a, b and
	// the 32-bit addend c, returning the 32-bit register result.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s32
	mad_wide_s16(s16 a, s16 b, s32 c)
	{
	  s32 result;

	  asm("mad.wide.s16 %0, %1, %2, %3;"
	      : "=r"(result)
	      : "h"(a), "h"(b), "r"(c));

	  return result;
	}

	//
	//
	//

	// Product of a 32-bit and a 16-bit signed value written as ordinary C++
	// (no inline PTX), so the compiler picks the instructions.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s32
	mul_s32_s16(s32 a, s16 b)
	{
	  const s32 multiplier = b; // explicit form of the implicit s16 -> s32 promotion

	  return a * multiplier;
	}

	// a * b + c with a 16-bit multiplier, written as ordinary C++ (no inline
	// PTX), so the compiler picks the instructions.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s32
	mad_s32_s16(s32 a, s16 b, s32 c)
	{
	  const s32 scaled = a * b;

	  return scaled + c;
	}

	//
	//
	//

	// Emits the PTX instruction mul.lo.s32 on two 32-bit register operands
	// and returns its 32-bit result.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s32
	mul_lo_s32(s32 a, s32 b)
	{
	  s32 result;

	  asm("mul.lo.s32 %0, %1, %2;"
	      : "=r"(result)
	      : "r"(a), "r"(b));

	  return result;
	}

	// Emits the PTX instruction mad.lo.s32 on 32-bit operands a, b and
	// addend c, returning its 32-bit result.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s32
	mad_lo_s32(s32 a, s32 b, s32 c)
	{
	  s32 result;

	  asm("mad.lo.s32 %0, %1, %2, %3;"
	      : "=r"(result)
	      : "r"(a), "r"(b), "r"(c));

	  return result;
	}

	//
	//
	//

	// Emits the PTX instruction mul.wide.s32 on two 32-bit register operands
	// and returns its 64-bit register result.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s64
	mul_wide_s32(s32 a, s32 b)
	{
	  s64 result;

	  asm("mul.wide.s32 %0, %1, %2;"
	      : "=l"(result)
	      : "r"(a), "r"(b));

	  return result;
	}

	// Emits the PTX instruction mad.wide.s32 with 32-bit operands a, b and
	// the 64-bit addend c, returning the 64-bit register result.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s64
	mad_wide_s32(s32 a, s32 b, s64 c)
	{
	  s64 result;

	  asm("mad.wide.s32 %0, %1, %2, %3;"
	      : "=l"(result)
	      : "r"(a), "r"(b), "l"(c));

	  return result;
	}

	//
	//
	//

	// Emits the PTX instruction mul.lo.s64 on two 64-bit register operands
	// and returns its 64-bit result.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s64
	mul_s64(s64 a, s64 b)
	{
	  s64 result;

	  asm("mul.lo.s64 %0, %1, %2;"
	      : "=l"(result)
	      : "l"(a), "l"(b));

	  return result;
	}

	// Emits the PTX instruction mad.lo.s64 on 64-bit operands a, b and
	// addend c, returning its 64-bit result.
	DEVICE_STATIC_INTRINSIC_QUALIFIERS
	s64
	mad_s64(s64 a, s64 b, s64 c)
	{
	  s64 result;

	  asm("mad.lo.s64 %0, %1, %2, %3;"
	      : "=l"(result)
	      : "l"(a), "l"(b), "l"(c));

	  return result;
	}

	//
	//
	//

	// Kernel driver for mad_q16v2(): thread t reads a[t], b[t], c[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mad_q16v2_kernel(const union q16v2* const RESTRICT a,
	const union q16v2* const RESTRICT b,
	const union q16v2* const RESTRICT c,
	union q16v2* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const union q16v2 va = a[t];
	  const union q16v2 vb = b[t];
	  const union q16v2 vc = c[t];

	  d[t] = mad_q16v2(va, vb, vc);
	}

	//
	//
	//

	// Kernel driver for fma_half2(): thread t reads a[t], b[t], c[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	fma_half2_kernel(const half2* const RESTRICT a,
	const half2* const RESTRICT b,
	const half2* const RESTRICT c,
	half2* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const half2 va = a[t];
	  const half2 vb = b[t];
	  const half2 vc = c[t];

	  d[t] = fma_half2(va, vb, vc);
	}

	//
	//
	//

	// Kernel driver for mad_u16v2(): thread t reads a[t], b[t], c[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mad_u16v2_kernel(const u16v2* const RESTRICT a,
	const u16v2* const RESTRICT b,
	const u32* const RESTRICT c,
	u16v2* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const u16v2 va = a[t];
	  const u16v2 vb = b[t];
	  const u32   vc = c[t];

	  d[t] = mad_u16v2(va, vb, vc);
	}

	//
	//
	//

	// Kernel driver for mul_wide_s16(): thread t reads a[t] and b[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mul_wide_s16_kernel(const s16* const RESTRICT a,
	const s16* const RESTRICT b,
	s32* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const s16 va = a[t];
	  const s16 vb = b[t];

	  d[t] = mul_wide_s16(va, vb);
	}

	// Kernel driver for mad_wide_s16(): thread t reads a[t], b[t], c[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mad_wide_s16_kernel(const s16* const RESTRICT a,
	const s16* const RESTRICT b,
	const s32* const RESTRICT c,
	s32* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const s16 va = a[t];
	  const s16 vb = b[t];
	  const s32 vc = c[t];

	  d[t] = mad_wide_s16(va, vb, vc);
	}

	//
	//
	//

	// Kernel driver for mul_s32_s16(): thread t reads a[t] and b[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mul_s32_s16_kernel(const s32* const RESTRICT a,
	const s16* const RESTRICT b,
	s32* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const s32 va = a[t];
	  const s16 vb = b[t];

	  d[t] = mul_s32_s16(va, vb);
	}

	// Kernel driver for mad_s32_s16(): thread t reads a[t], b[t], c[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mad_s32_s16_kernel(const s32* const RESTRICT a,
	const s16* const RESTRICT b,
	const s32* const RESTRICT c,
	s32* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const s32 va = a[t];
	  const s16 vb = b[t];
	  const s32 vc = c[t];

	  d[t] = mad_s32_s16(va, vb, vc);
	}

	//
	//
	//

	// Kernel driver for mul_lo_s32(): thread t reads a[t] and b[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mul_lo_s32_kernel(const s32* const RESTRICT a,
	const s32* const RESTRICT b,
	s32* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const s32 va = a[t];
	  const s32 vb = b[t];

	  d[t] = mul_lo_s32(va, vb);
	}

	// Kernel driver for mad_lo_s32(): thread t reads a[t], b[t], c[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mad_lo_s32_kernel(const s32* const RESTRICT a,
	const s32* const RESTRICT b,
	const s32* const RESTRICT c,
	s32* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const s32 va = a[t];
	  const s32 vb = b[t];
	  const s32 vc = c[t];

	  d[t] = mad_lo_s32(va, vb, vc);
	}

	//
	//
	//

	// Kernel driver for mul_wide_s32(): thread t reads a[t] and b[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mul_wide_s32_kernel(const s32* const RESTRICT a,
	const s32* const RESTRICT b,
	s64* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const s32 va = a[t];
	  const s32 vb = b[t];

	  d[t] = mul_wide_s32(va, vb);
	}

	// Kernel driver for mad_wide_s32(): thread t reads a[t], b[t], c[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mad_wide_s32_kernel(const s32* const RESTRICT a,
	const s32* const RESTRICT b,
	const s64* const RESTRICT c,
	s64* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const s32 va = a[t];
	  const s32 vb = b[t];
	  const s64 vc = c[t];

	  d[t] = mad_wide_s32(va, vb, vc);
	}

	//
	//
	//

	// Kernel driver for mul_s64(): thread t reads a[t] and b[t] and writes
	// the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mul_s64_kernel(const s64* const RESTRICT a,
	const s64* const RESTRICT b,
	s64* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const s64 va = a[t];
	  const s64 vb = b[t];

	  d[t] = mul_s64(va, vb);
	}

	// Kernel driver for mad_s64(): thread t reads a[t], b[t], c[t] and
	// writes the result to d[t], where t = threadIdx.x.
	// blockIdx is not part of the index and nothing guards the array bounds.
	KERNEL_QUALIFIERS_EXTERN_C
	void
	mad_s64_kernel(const s64* const RESTRICT a,
	const s64* const RESTRICT b,
	const s64* const RESTRICT c,
	s64* const RESTRICT d)
	{
	  const unsigned int t = threadIdx.x;

	  const s64 va = a[t];
	  const s64 vb = b[t];
	  const s64 vc = c[t];

	  d[t] = mad_s64(va, vb, vc);
	}

	//
	//
	//