telegraphic · November 10, 2021 20:31
diff --git a/dp4a_cross_correlator.py b/dp4a_cross_correlator.py
 import pycuda.driver as cuda
 import pycuda.autoinit
 from pycuda.compiler import SourceModule
 import numpy as np

 def compute_xcorr_cpu(d):
    dc = d.astype('float32').view('complex64')
    dc = dc.transpose((0,2,3,1)).copy()
    xcorr_cpu = np.einsum('...i,...j', dc,  np.conj(dc)).view('float32').astype('int32').sum(axis=-4)
    return xcorr_cpu

 mod = SourceModule("""
 #include <stdio.h>

 __forceinline__ __device__
 void dp4a(int &c, const int &a, const int &b) {
    #if __CUDA_ARCH__ >= 610
      asm("dp4a.s32.s32 %0, %1, %2, %3;" : "+r"(c) : "r"(a), "r"(b), "r"(c)); 
    #else
      char4 &a4 = *((char4*)&a);
      char4 &b4 = *((char4*)&b);
      c += a4.x*b4.x;
      c += a4.y*b4.y;
      c += a4.z*b4.z;
      c += a4.w*b4.w;
    #endif
    }

 /*
  cmult_dp4a -- Do complex conjugate multiply accumulate <A*Conj(B)>
  Using two dp4a instructions. Takes 8-bit complex data 
  packed as a single 32-bit int [8R8I 8R8I]. 
  
  For two complex numbers:
      ab* = (ar + i*ai)(br + i*bi)
      re(ab*) = ar*br + ai*bi
      im(ab*) = ai*br - ar*bi
  So use two dp4a to compute:
      [a0r a0i a1r a1i].[b0r b0i b1r b1i]   = Re(<ab*>)
      [a0r a0i a1r a1i].[-b0i b0r -b1i b1r] = Im(<ab*>)
  Where angled brackets denote time averaging (over 2x samples)
 */
 __forceinline__ __device__
 void cmult_dp4a(int &res_re, int &res_im, int &A, int &B) {
    // Unpack 32-bit int into 8-bit
    int8_t Bmod[4];
    int8_t *b8 = (int8_t *)&B;      
    
    // Transpose for bmod 
    Bmod[0] = -b8[1];
    Bmod[1] = b8[0];
    Bmod[2] = -b8[3];
    Bmod[3] = b8[2]; 
    
    //int8_t *a8 = (int8_t *)&A;
    //printf("A %d %d %d %d | B %d %d %d %d\\n", a8[0], a8[1], a8[2], a8[3], b8[0], b8[1], b8[2], b8[3]);
    
    // Pack 8-bit to 32-bit
    int &Bmodp = *((int *)&Bmod); 
    
    // Run complex multiply
    dp4a(res_re, A, B);
    dp4a(res_im, A, Bmodp);
    }

 __global__ void cplxConjOuterProduct
    (int *data, int *xcorr, int N, int F, int T)
    {
    int x, y; // x not used
    int idx, ia, ib;
     
    // Setup thread indexes
    x = threadIdx.x;
    y = threadIdx.y;
        
    int chan_offset_in  = blockIdx.x * N * T/2;
    int chan_offset_out = blockIdx.x * N * N * 2;
    int ant_offset      = T / 2;  //x2 for complex, but /4 for packed
    
    idx = 2*y + N*2*x + chan_offset_out; // Compute index for output array
    
    for (int t = 0; t < T/2; t++) {
        ia  = ant_offset*x + chan_offset_in + t;
        ib  = ant_offset*y + chan_offset_in + t;
        
        //printf("idx %d | x%d.y%d | A %dx%d\\n", idx, x, y, ia, ib);
        cmult_dp4a(xcorr[idx], xcorr[idx+1], data[ia], data[ib]);
    }

    }
   """)
   
 # Create complex data
 N = 12        # Number of antennas (2-32)
 F = 320       # Number of frequency channels
 T = 1024      # Number of time samples

 # Create complex data
 d = np.random.randint(64, size=(F, N, T, 2), dtype='int8')
 xcorr = np.zeros((F, N, N*2), dtype='int32')

 d_gpu = cuda.mem_alloc(d.nbytes)
 xcorr_gpu = cuda.mem_alloc(xcorr.nbytes)

 cuda.memcpy_htod(d_gpu, d)
 cuda.memcpy_htod(xcorr_gpu, xcorr)

 compute_xcorr_gpu = mod.get_function("cplxConjOuterProduct")
 compute_xcorr_gpu(d_gpu, xcorr_gpu, np.int32(N), np.int32(F), np.int32(T), block=(N, N, 1), grid=(F,1,1))

 cuda.memcpy_dtoh(xcorr, xcorr_gpu)

 xcorr_cpu = compute_xcorr_cpu(d)

 assert np.allclose(xcorr.squeeze(), xcorr_cpu.squeeze())
diff --git a/dp4a_example.py b/dp4a_example.py
 """
 # dp4a_example.py

 Simple test of DP4A instruction using PyCUDA.  
 """
 import pycuda.driver as cuda
 import pycuda.autoinit
 from pycuda.compiler import SourceModule
 import numpy as np

 def dp4a_cpu(a, b):
    """ Apply DP4A instruction, python version
    
    Take do a dot product, A*B on two arrays, then add
    sets of four elements together. E.g, for two arrays:
        a = [A B C D] 
        b = [E F G H]
        returns AE + BF + CG + DH
    
    Args:
        a (np.array): Numpy array of 8-bit integers of length 4N
        b (np.array): Numpy array of 8-bit integers of length 4N
    
    Returns:
        c (np.array): Numpy array of 32-bit integers of length N
    """
    try:
        assert str(a.dtype) ==  str(b.dtype) == 'int8'
        assert len(a) % 4 == len(b) % 4 ==0
    except AssertionError:
        print("Error: This must be run on a pair of int8 numpy arrays,\
               with 4N elements ineach.")
        return 0
    z = a.astype('int32') * b.astype('int32')
    z = z.reshape((-1, 4)).sum(axis=-1)
    return z

 mod = SourceModule("""
 __forceinline__ __device__
 void dp4a(int &c, const int &a, const int &b) {
    #if __CUDA_ARCH__ >= 610
      asm("dp4a.s32.s32 %0, %1, %2, %3;" : "+r"(c) : "r"(a), "r"(b), "r"(c)); 
    #else
      char4 &a4 = *((char4*)&a);
      char4 &b4 = *((char4*)&b);
      c += a4.x*b4.x;
      c += a4.y*b4.y;
      c += a4.z*b4.z;
      c += a4.w*b4.w;
    #endif
    }
    
 __global__ void multDp4a(int *A, int *B, int *C, int nx) 
    {
    unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
    dp4a(C[ix], A[ix], B[ix]);
    }
    """)

 # Setup some test vectors
 # c = dp4a(a, b), input is 8 bit, output is 32 bit
 nx = 1024 * 4 
 a = np.random.randint(low=-127, high=128, size=nx, dtype='int8')
 b = np.random.randint(low=-127, high=128, size=nx, dtype='int8')
 c = np.zeros(nx/4, dtype='int32')
 
 # Compute CPU reference
 c_cpu = dp4a_cpu(a, b)

 # Allocate memory on GPU
 a_gpu = cuda.mem_alloc(a.nbytes)
 b_gpu = cuda.mem_alloc(b.nbytes)
 c_gpu = cuda.mem_alloc(c.nbytes)

 # Copy to GPU
 cuda.memcpy_htod(a_gpu, a)
 cuda.memcpy_htod(b_gpu, b)
 cuda.memcpy_htod(c_gpu, c)

 # Run kernel on GPU
 func = mod.get_function("multDp4a")
 func(a_gpu, b_gpu, c_gpu, np.int32(nx), block=(nx/4, 1, 1))

 # Copy back
 cuda.memcpy_dtoh(c, c_gpu)

 # Confirm against CPU version
 assert np.allclose(c, c_cpu)
	import pycuda.driver as cuda
	import pycuda.autoinit
	from pycuda.compiler import SourceModule
	import numpy as np

	def compute_xcorr_cpu(d):
	dc = d.astype('float32').view('complex64')
	dc = dc.transpose((0,2,3,1)).copy()
	xcorr_cpu = np.einsum('...i,...j', dc, np.conj(dc)).view('float32').astype('int32').sum(axis=-4)
	return xcorr_cpu

	mod = SourceModule("""
	#include <stdio.h>

	__forceinline__ __device__
	void dp4a(int &c, const int &a, const int &b) {
	#if __CUDA_ARCH__ >= 610
	asm("dp4a.s32.s32 %0, %1, %2, %3;" : "+r"(c) : "r"(a), "r"(b), "r"(c));
	#else
	char4 &a4 = ((char4)&a);
	char4 &b4 = ((char4)&b);
	c += a4.x*b4.x;
	c += a4.y*b4.y;
	c += a4.z*b4.z;
	c += a4.w*b4.w;
	#endif
	}

	/*
	cmult_dp4a -- Do complex conjugate multiply accumulate <A*Conj(B)>
	Using two dp4a instructions. Takes 8-bit complex data
	packed as a single 32-bit int [8R8I 8R8I].

	For two complex numbers:
	ab* = (ar + iai)(br + ibi)
	re(ab) = arbr + ai*bi
	im(ab) = aibr - ar*bi
	So use two dp4a to compute:
	[a0r a0i a1r a1i].[b0r b0i b1r b1i] = Re(<ab*>)
	[a0r a0i a1r a1i].[-b0i b0r -b1i b1r] = Im(<ab*>)
	Where angled brackets denote time averaging (over 2x samples)
	*/
	__forceinline__ __device__
	void cmult_dp4a(int &res_re, int &res_im, int &A, int &B) {
	// Unpack 32-bit int into 8-bit
	int8_t Bmod[4];
	int8_t b8 = (int8_t )&B;

	// Transpose for bmod
	Bmod[0] = -b8[1];
	Bmod[1] = b8[0];
	Bmod[2] = -b8[3];
	Bmod[3] = b8[2];

	//int8_t a8 = (int8_t )&A;
	//printf("A %d %d %d %d \| B %d %d %d %d\\n", a8[0], a8[1], a8[2], a8[3], b8[0], b8[1], b8[2], b8[3]);

	// Pack 8-bit to 32-bit
	int &Bmodp = ((int )&Bmod);

	// Run complex multiply
	dp4a(res_re, A, B);
	dp4a(res_im, A, Bmodp);
	}

	__global__ void cplxConjOuterProduct
	(int data, int xcorr, int N, int F, int T)
	{
	int x, y; // x not used
	int idx, ia, ib;

	// Setup thread indexes
	x = threadIdx.x;
	y = threadIdx.y;

	int chan_offset_in = blockIdx.x * N * T/2;
	int chan_offset_out = blockIdx.x * N * N * 2;
	int ant_offset = T / 2; //x2 for complex, but /4 for packed

	idx = 2y + N2*x + chan_offset_out; // Compute index for output array

	for (int t = 0; t < T/2; t++) {
	ia = ant_offset*x + chan_offset_in + t;
	ib = ant_offset*y + chan_offset_in + t;

	//printf("idx %d \| x%d.y%d \| A %dx%d\\n", idx, x, y, ia, ib);
	cmult_dp4a(xcorr[idx], xcorr[idx+1], data[ia], data[ib]);
	}

	}
	""")

	# Create complex data
	N = 12 # Number of antennas (2-32)
	F = 320 # Number of frequency channels
	T = 1024 # Number of time samples

	# Create complex data
	d = np.random.randint(64, size=(F, N, T, 2), dtype='int8')
	xcorr = np.zeros((F, N, N*2), dtype='int32')

	d_gpu = cuda.mem_alloc(d.nbytes)
	xcorr_gpu = cuda.mem_alloc(xcorr.nbytes)

	cuda.memcpy_htod(d_gpu, d)
	cuda.memcpy_htod(xcorr_gpu, xcorr)

	compute_xcorr_gpu = mod.get_function("cplxConjOuterProduct")
	compute_xcorr_gpu(d_gpu, xcorr_gpu, np.int32(N), np.int32(F), np.int32(T), block=(N, N, 1), grid=(F,1,1))

	cuda.memcpy_dtoh(xcorr, xcorr_gpu)

	xcorr_cpu = compute_xcorr_cpu(d)

	assert np.allclose(xcorr.squeeze(), xcorr_cpu.squeeze())
	"""
	# dp4a_example.py

	Simple test of DP4A instruction using PyCUDA.
	"""
	import pycuda.driver as cuda
	import pycuda.autoinit
	from pycuda.compiler import SourceModule
	import numpy as np

	def dp4a_cpu(a, b):
	""" Apply DP4A instruction, python version

	Take do a dot product, A*B on two arrays, then add
	sets of four elements together. E.g, for two arrays:
	a = [A B C D]
	b = [E F G H]
	returns AE + BF + CG + DH

	Args:
	a (np.array): Numpy array of 8-bit integers of length 4N
	b (np.array): Numpy array of 8-bit integers of length 4N

	Returns:
	c (np.array): Numpy array of 32-bit integers of length N
	"""
	try:
	assert str(a.dtype) == str(b.dtype) == 'int8'
	assert len(a) % 4 == len(b) % 4 ==0
	except AssertionError:
	print("Error: This must be run on a pair of int8 numpy arrays,\
	with 4N elements ineach.")
	return 0
	z = a.astype('int32') * b.astype('int32')
	z = z.reshape((-1, 4)).sum(axis=-1)
	return z

	mod = SourceModule("""
	__forceinline__ __device__
	void dp4a(int &c, const int &a, const int &b) {
	#if __CUDA_ARCH__ >= 610
	asm("dp4a.s32.s32 %0, %1, %2, %3;" : "+r"(c) : "r"(a), "r"(b), "r"(c));
	#else
	char4 &a4 = ((char4)&a);
	char4 &b4 = ((char4)&b);
	c += a4.x*b4.x;
	c += a4.y*b4.y;
	c += a4.z*b4.z;
	c += a4.w*b4.w;
	#endif
	}

	__global__ void multDp4a(int A, int B, int *C, int nx)
	{
	unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x;
	dp4a(C[ix], A[ix], B[ix]);
	}
	""")

	# Setup some test vectors
	# c = dp4a(a, b), input is 8 bit, output is 32 bit
	nx = 1024 * 4
	a = np.random.randint(low=-127, high=128, size=nx, dtype='int8')
	b = np.random.randint(low=-127, high=128, size=nx, dtype='int8')
	c = np.zeros(nx/4, dtype='int32')

	# Compute CPU reference
	c_cpu = dp4a_cpu(a, b)

	# Allocate memory on GPU
	a_gpu = cuda.mem_alloc(a.nbytes)
	b_gpu = cuda.mem_alloc(b.nbytes)
	c_gpu = cuda.mem_alloc(c.nbytes)

	# Copy to GPU
	cuda.memcpy_htod(a_gpu, a)
	cuda.memcpy_htod(b_gpu, b)
	cuda.memcpy_htod(c_gpu, c)

	# Run kernel on GPU
	func = mod.get_function("multDp4a")
	func(a_gpu, b_gpu, c_gpu, np.int32(nx), block=(nx/4, 1, 1))

	# Copy back
	cuda.memcpy_dtoh(c, c_gpu)

	# Confirm against CPU version
	assert np.allclose(c, c_cpu)