import numpy as np

### NOTE: the input matrices A (3 x K) and B (K x 4) are assumed to be defined
### earlier (they are not part of this excerpt); C accumulates the product A @ B.
C = np.zeros((3, 4), dtype=int)
block_size = (3, 2)
### Ceiling-division helper, e.g. for computing the CUDA grid dimensions (unused below).
div_up = lambda a, b: (a + b - 1) // b

### CUDA Grid
for m in range(0, C.shape[0], block_size[0]):
    for n in range(0, C.shape[1], block_size[1]):
        ### Main loop in the CUDA kernel.
        ### A smaller K is favorable for satisfying the shared memory bandwidth.
        for k in range(A.shape[1]):
            ### The 3x2 block is what hypothetically fits in the shared memory.
            ### The thread block will contain multiple warps.
            row_range = slice(m, m + block_size[0])
            col_range = slice(n, n + block_size[1])
            ### Each warp loads a "fragment" into the register file
            ### from the shared memory. Warp-level decomposition is not shown
            ### here, but it is essentially an unrolled loop that further
            ### divides the {row,col}_range.
            frag_A = A[row_range, k]
            frag_B = B[k, col_range]
            ### Warp threads cooperatively compute the outer product.
            ### This can be done with the warp-level primitive "WMMA" since CUDA 9;
            ### nvcuda::wmma can be used to target the CUDA Tensor Cores.
            C[row_range, col_range] += np.outer(frag_A, frag_B)
        print(f"C(t={n//block_size[1]}) =\n", C)
"""
OUTPUT
=============
C(t=0) =
 [[42 29  0  0]
 [22 14  0  0]
 [11  9  0  0]]
C(t=1) =
 [[42 29 26 21]
 [22 14 16 10]
 [11  9  4  5]]
"""