Last active
November 10, 2021 20:31
-
-
Save telegraphic/d61fc8c049f4dae6a8fc2a4c9e91f2e5 to your computer and use it in GitHub Desktop.
Use of CUDA DP4A instruction using PyCUDA
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pycuda.driver as cuda | |
import pycuda.autoinit | |
from pycuda.compiler import SourceModule | |
import numpy as np | |
def compute_xcorr_cpu(d): | |
dc = d.astype('float32').view('complex64') | |
dc = dc.transpose((0,2,3,1)).copy() | |
xcorr_cpu = np.einsum('...i,...j', dc, np.conj(dc)).view('float32').astype('int32').sum(axis=-4) | |
return xcorr_cpu | |
mod = SourceModule(""" | |
#include <stdio.h> | |
__forceinline__ __device__ | |
void dp4a(int &c, const int &a, const int &b) { | |
#if __CUDA_ARCH__ >= 610 | |
asm("dp4a.s32.s32 %0, %1, %2, %3;" : "+r"(c) : "r"(a), "r"(b), "r"(c)); | |
#else | |
char4 &a4 = *((char4*)&a); | |
char4 &b4 = *((char4*)&b); | |
c += a4.x*b4.x; | |
c += a4.y*b4.y; | |
c += a4.z*b4.z; | |
c += a4.w*b4.w; | |
#endif | |
} | |
/* | |
cmult_dp4a -- Do complex conjugate multiply accumulate <A*Conj(B)> | |
Using two dp4a instructions. Takes 8-bit complex data | |
packed as a single 32-bit int [8R8I 8R8I]. | |
For two complex numbers: | |
ab* = (ar + i*ai)(br + i*bi) | |
re(ab*) = ar*br + ai*bi | |
im(ab*) = ai*br - ar*bi | |
So use two dp4a to compute: | |
[a0r a0i a1r a1i].[b0r b0i b1r b1i] = Re(<ab*>) | |
[a0r a0i a1r a1i].[-b0i b0r -b1i b1r] = Im(<ab*>) | |
Where angled brackets denote time averaging (over 2x samples) | |
*/ | |
__forceinline__ __device__ | |
void cmult_dp4a(int &res_re, int &res_im, int &A, int &B) { | |
// Unpack 32-bit int into 8-bit | |
int8_t Bmod[4]; | |
int8_t *b8 = (int8_t *)&B; | |
// Transpose for bmod | |
Bmod[0] = -b8[1]; | |
Bmod[1] = b8[0]; | |
Bmod[2] = -b8[3]; | |
Bmod[3] = b8[2]; | |
//int8_t *a8 = (int8_t *)&A; | |
//printf("A %d %d %d %d | B %d %d %d %d\\n", a8[0], a8[1], a8[2], a8[3], b8[0], b8[1], b8[2], b8[3]); | |
// Pack 8-bit to 32-bit | |
int &Bmodp = *((int *)&Bmod); | |
// Run complex multiply | |
dp4a(res_re, A, B); | |
dp4a(res_im, A, Bmodp); | |
} | |
__global__ void cplxConjOuterProduct | |
(int *data, int *xcorr, int N, int F, int T) | |
{ | |
int x, y; // x not used | |
int idx, ia, ib; | |
// Setup thread indexes | |
x = threadIdx.x; | |
y = threadIdx.y; | |
int chan_offset_in = blockIdx.x * N * T/2; | |
int chan_offset_out = blockIdx.x * N * N * 2; | |
int ant_offset = T / 2; //x2 for complex, but /4 for packed | |
idx = 2*y + N*2*x + chan_offset_out; // Compute index for output array | |
for (int t = 0; t < T/2; t++) { | |
ia = ant_offset*x + chan_offset_in + t; | |
ib = ant_offset*y + chan_offset_in + t; | |
//printf("idx %d | x%d.y%d | A %dx%d\\n", idx, x, y, ia, ib); | |
cmult_dp4a(xcorr[idx], xcorr[idx+1], data[ia], data[ib]); | |
} | |
} | |
""") | |
# Create complex data | |
N = 12 # Number of antennas (2-32) | |
F = 320 # Number of frequency channels | |
T = 1024 # Number of time samples | |
# Create complex data | |
d = np.random.randint(64, size=(F, N, T, 2), dtype='int8') | |
xcorr = np.zeros((F, N, N*2), dtype='int32') | |
d_gpu = cuda.mem_alloc(d.nbytes) | |
xcorr_gpu = cuda.mem_alloc(xcorr.nbytes) | |
cuda.memcpy_htod(d_gpu, d) | |
cuda.memcpy_htod(xcorr_gpu, xcorr) | |
compute_xcorr_gpu = mod.get_function("cplxConjOuterProduct") | |
compute_xcorr_gpu(d_gpu, xcorr_gpu, np.int32(N), np.int32(F), np.int32(T), block=(N, N, 1), grid=(F,1,1)) | |
cuda.memcpy_dtoh(xcorr, xcorr_gpu) | |
xcorr_cpu = compute_xcorr_cpu(d) | |
assert np.allclose(xcorr.squeeze(), xcorr_cpu.squeeze()) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
# dp4a_example.py | |
Simple test of DP4A instruction using PyCUDA. | |
""" | |
import pycuda.driver as cuda | |
import pycuda.autoinit | |
from pycuda.compiler import SourceModule | |
import numpy as np | |
def dp4a_cpu(a, b): | |
""" Apply DP4A instruction, python version | |
Take do a dot product, A*B on two arrays, then add | |
sets of four elements together. E.g, for two arrays: | |
a = [A B C D] | |
b = [E F G H] | |
returns AE + BF + CG + DH | |
Args: | |
a (np.array): Numpy array of 8-bit integers of length 4N | |
b (np.array): Numpy array of 8-bit integers of length 4N | |
Returns: | |
c (np.array): Numpy array of 32-bit integers of length N | |
""" | |
try: | |
assert str(a.dtype) == str(b.dtype) == 'int8' | |
assert len(a) % 4 == len(b) % 4 ==0 | |
except AssertionError: | |
print("Error: This must be run on a pair of int8 numpy arrays,\ | |
with 4N elements ineach.") | |
return 0 | |
z = a.astype('int32') * b.astype('int32') | |
z = z.reshape((-1, 4)).sum(axis=-1) | |
return z | |
mod = SourceModule(""" | |
__forceinline__ __device__ | |
void dp4a(int &c, const int &a, const int &b) { | |
#if __CUDA_ARCH__ >= 610 | |
asm("dp4a.s32.s32 %0, %1, %2, %3;" : "+r"(c) : "r"(a), "r"(b), "r"(c)); | |
#else | |
char4 &a4 = *((char4*)&a); | |
char4 &b4 = *((char4*)&b); | |
c += a4.x*b4.x; | |
c += a4.y*b4.y; | |
c += a4.z*b4.z; | |
c += a4.w*b4.w; | |
#endif | |
} | |
__global__ void multDp4a(int *A, int *B, int *C, int nx) | |
{ | |
unsigned int ix = threadIdx.x + blockIdx.x * blockDim.x; | |
dp4a(C[ix], A[ix], B[ix]); | |
} | |
""") | |
# Setup some test vectors | |
# c = dp4a(a, b), input is 8 bit, output is 32 bit | |
nx = 1024 * 4 | |
a = np.random.randint(low=-127, high=128, size=nx, dtype='int8') | |
b = np.random.randint(low=-127, high=128, size=nx, dtype='int8') | |
c = np.zeros(nx/4, dtype='int32') | |
# Compute CPU reference | |
c_cpu = dp4a_cpu(a, b) | |
# Allocate memory on GPU | |
a_gpu = cuda.mem_alloc(a.nbytes) | |
b_gpu = cuda.mem_alloc(b.nbytes) | |
c_gpu = cuda.mem_alloc(c.nbytes) | |
# Copy to GPU | |
cuda.memcpy_htod(a_gpu, a) | |
cuda.memcpy_htod(b_gpu, b) | |
cuda.memcpy_htod(c_gpu, c) | |
# Run kernel on GPU | |
func = mod.get_function("multDp4a") | |
func(a_gpu, b_gpu, c_gpu, np.int32(nx), block=(nx/4, 1, 1)) | |
# Copy back | |
cuda.memcpy_dtoh(c, c_gpu) | |
# Confirm against CPU version | |
assert np.allclose(c, c_cpu) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment