Last active
January 9, 2025 15:15
-
-
Save Snektron/1fb62a39ee0d7b572c3441f0a53d310c to your computer and use it in GitHub Desktop.
Measure maximum number of waves and blocks per CU
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <hip/hip_runtime.h> | |
#include <iostream> | |
#include <vector> | |
/// Occupancy probe: every wavefront and every block registers itself in a
/// global "currently active" counter on entry, folds the value it observed into
/// a running maximum, idles for a while, and deregisters on exit.
///
/// @param active_warps  number of wavefronts currently inside the kernel
/// @param max_warps     largest value of *active_warps seen so far
/// @param active_blocks number of blocks currently inside the kernel
/// @param max_blocks    largest value of *active_blocks seen so far
__global__ void kernel(unsigned* active_warps, unsigned* max_warps, unsigned* active_blocks, unsigned* max_blocks) {
    // One representative per wavefront (lane 0) and one per block (thread 0)
    // do the bookkeeping; all other threads only take part in the delay.
    const bool wave_leader = (__lane_id() == 0);
    const bool block_leader = (threadIdx.x == 0);

    if (wave_leader) {
        // atomicAdd yields the previous value, so +1 is the count including us.
        const auto warps_now = atomicAdd(active_warps, 1) + 1;
        atomicMax(max_warps, warps_now);
    }
    if (block_leader) {
        const auto blocks_now = atomicAdd(active_blocks, 1) + 1;
        atomicMax(max_blocks, blocks_now);
    }

    // Simulate some work: stay resident long enough for the scheduler to fill
    // the device, so the maxima reflect steady-state occupancy.
    unsigned int remaining = 10000;
    while (remaining != 0) {
        __builtin_amdgcn_s_sleep(0x7F);
        --remaining;
    }

    if (wave_leader) {
        atomicSub(active_warps, 1);
    }
    if (block_leader) {
        atomicSub(active_blocks, 1);
    }
}
void test(int threads, int n_streams) { | |
unsigned* d_gpu_data; | |
hipMalloc(&d_gpu_data, sizeof(unsigned) * 4); | |
hipMemset(d_gpu_data, 0, sizeof(unsigned) * 4); | |
constexpr int blocks = 1024 * 4; | |
std::vector<hipStream_t> streams(n_streams); | |
for (int i = 0; i < n_streams; ++i) { | |
hipStreamCreateWithFlags(&streams[i], hipStreamNonBlocking); | |
} | |
for (int i = 0; i < n_streams; ++i) { | |
kernel<<<blocks, threads, 0, streams[i]>>>(d_gpu_data, d_gpu_data + 1, d_gpu_data + 2, d_gpu_data + 3); | |
} | |
hipDeviceSynchronize(); | |
unsigned max_warps, max_blocks; | |
hipMemcpy(&max_warps, d_gpu_data + 1, sizeof(unsigned), hipMemcpyDeviceToHost); | |
hipMemcpy(&max_blocks, d_gpu_data + 3, sizeof(unsigned), hipMemcpyDeviceToHost); | |
hipDeviceProp_t props; | |
hipGetDeviceProperties(&props, 0); | |
std::cout << "streams: " << n_streams << std::endl; | |
std::cout << "threads per block: " << threads << std::endl; | |
std::cout << "blocks: " << blocks << std::endl; | |
std::cout << "warp size: " << props.warpSize << std::endl; | |
std::cout << "number of multiprocessors (CUs or WGPs): " << props.multiProcessorCount << std::endl; | |
std::cout << "total threads: " << (threads * blocks) << std::endl; | |
std::cout << "max warps active: " << max_warps << std::endl; | |
std::cout << "max blocks active: " << max_blocks << std::endl; | |
std::cout << "max threads per multiprocessor: " << props.maxThreadsPerMultiProcessor << std::endl; | |
std::cout << "est. max warps per CU/WGP: " << max_warps / static_cast<float>(props.multiProcessorCount) << std::endl; | |
std::cout << "est. max blocks per CU/WGP: " << max_blocks / static_cast<float>(props.multiProcessorCount) << std::endl; | |
std::cout << std::endl; | |
for (int i = 0; i < n_streams; ++i) { | |
hipStreamDestroy(streams[i]); | |
} | |
} | |
int main() { | |
for (int threads : {32, 64, 128, 256, 384, 512, 768, 1024}) { | |
for (int streams : {1, 4}) { | |
test(threads, streams); | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
streams: 1 | |
threads per block: 32 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 131072 | |
max warps active: 4096 | |
max blocks active: 4096 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 34.1333 | |
est. max blocks per CU/WGP: 34.1333 | |
streams: 4 | |
threads per block: 32 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 131072 | |
max warps active: 4800 | |
max blocks active: 4800 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 40 | |
est. max blocks per CU/WGP: 40 | |
streams: 1 | |
threads per block: 64 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 262144 | |
max warps active: 4096 | |
max blocks active: 4096 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 34.1333 | |
est. max blocks per CU/WGP: 34.1333 | |
streams: 4 | |
threads per block: 64 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 262144 | |
max warps active: 4800 | |
max blocks active: 4800 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 40 | |
est. max blocks per CU/WGP: 40 | |
streams: 1 | |
threads per block: 128 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 524288 | |
max warps active: 3840 | |
max blocks active: 1920 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 16 | |
streams: 4 | |
threads per block: 128 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 524288 | |
max warps active: 3840 | |
max blocks active: 1920 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 16 | |
streams: 1 | |
threads per block: 256 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 1048576 | |
max warps active: 4096 | |
max blocks active: 1208 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 34.1333 | |
est. max blocks per CU/WGP: 10.0667 | |
streams: 4 | |
threads per block: 256 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 1048576 | |
max warps active: 4800 | |
max blocks active: 1415 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 40 | |
est. max blocks per CU/WGP: 11.7917 | |
streams: 1 | |
threads per block: 384 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 1572864 | |
max warps active: 4080 | |
max blocks active: 794 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 34 | |
est. max blocks per CU/WGP: 6.61667 | |
streams: 4 | |
threads per block: 384 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 1572864 | |
max warps active: 4080 | |
max blocks active: 807 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 34 | |
est. max blocks per CU/WGP: 6.725 | |
streams: 1 | |
threads per block: 512 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 2097152 | |
max warps active: 4096 | |
max blocks active: 612 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 34.1333 | |
est. max blocks per CU/WGP: 5.1 | |
streams: 4 | |
threads per block: 512 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 2097152 | |
max warps active: 4800 | |
max blocks active: 706 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 40 | |
est. max blocks per CU/WGP: 5.88333 | |
streams: 1 | |
threads per block: 768 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 3145728 | |
max warps active: 4032 | |
max blocks active: 406 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 33.6 | |
est. max blocks per CU/WGP: 3.38333 | |
streams: 4 | |
threads per block: 768 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 3145728 | |
max warps active: 4320 | |
max blocks active: 432 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 36 | |
est. max blocks per CU/WGP: 3.6 | |
streams: 1 | |
threads per block: 1024 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 4194304 | |
max warps active: 3841 | |
max blocks active: 299 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 32.0083 | |
est. max blocks per CU/WGP: 2.49167 | |
streams: 4 | |
threads per block: 1024 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 120 | |
total threads: 4194304 | |
max warps active: 3840 | |
max blocks active: 299 | |
max threads per multiprocessor: 2560 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 2.49167 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
streams: 1 | |
threads per block: 32 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 131072 | |
max warps active: 3328 | |
max blocks active: 3328 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 32 | |
streams: 4 | |
threads per block: 32 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 131072 | |
max warps active: 3328 | |
max blocks active: 3328 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 32 | |
streams: 1 | |
threads per block: 64 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 262144 | |
max warps active: 3328 | |
max blocks active: 3328 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 32 | |
streams: 4 | |
threads per block: 64 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 262144 | |
max warps active: 3328 | |
max blocks active: 3328 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 32 | |
streams: 1 | |
threads per block: 128 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 524288 | |
max warps active: 3328 | |
max blocks active: 1664 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 16 | |
streams: 4 | |
threads per block: 128 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 524288 | |
max warps active: 3320 | |
max blocks active: 1664 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 31.9231 | |
est. max blocks per CU/WGP: 16 | |
streams: 1 | |
threads per block: 256 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 1048576 | |
max warps active: 3328 | |
max blocks active: 975 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 9.375 | |
streams: 4 | |
threads per block: 256 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 1048576 | |
max warps active: 3328 | |
max blocks active: 942 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 9.05769 | |
streams: 1 | |
threads per block: 384 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 1572864 | |
max warps active: 2868 | |
max blocks active: 556 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 27.5769 | |
est. max blocks per CU/WGP: 5.34615 | |
streams: 4 | |
threads per block: 384 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 1572864 | |
max warps active: 2682 | |
max blocks active: 480 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 25.7885 | |
est. max blocks per CU/WGP: 4.61538 | |
streams: 1 | |
threads per block: 512 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 2097152 | |
max warps active: 3328 | |
max blocks active: 510 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 4.90385 | |
streams: 4 | |
threads per block: 512 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 2097152 | |
max warps active: 3328 | |
max blocks active: 466 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 4.48077 | |
streams: 1 | |
threads per block: 768 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 3145728 | |
max warps active: 2505 | |
max blocks active: 283 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 24.0865 | |
est. max blocks per CU/WGP: 2.72115 | |
streams: 4 | |
threads per block: 768 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 3145728 | |
max warps active: 2515 | |
max blocks active: 241 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 24.1827 | |
est. max blocks per CU/WGP: 2.31731 | |
streams: 1 | |
threads per block: 1024 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 4194304 | |
max warps active: 3328 | |
max blocks active: 234 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 2.25 | |
streams: 4 | |
threads per block: 1024 | |
blocks: 4096 | |
warp size: 64 | |
number of multiprocessors (CUs or WGPs): 104 | |
total threads: 4194304 | |
max warps active: 3328 | |
max blocks active: 209 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 32 | |
est. max blocks per CU/WGP: 2.00962 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
streams: 1 | |
threads per block: 32 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 131072 | |
max warps active: 3067 | |
max blocks active: 3067 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 63.8958 | |
est. max blocks per CU/WGP: 63.8958 | |
streams: 4 | |
threads per block: 32 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 131072 | |
max warps active: 3068 | |
max blocks active: 3068 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 63.9167 | |
est. max blocks per CU/WGP: 63.9167 | |
streams: 1 | |
threads per block: 64 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 262144 | |
max warps active: 2976 | |
max blocks active: 1488 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 62 | |
est. max blocks per CU/WGP: 31 | |
streams: 4 | |
threads per block: 64 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 262144 | |
max warps active: 3072 | |
max blocks active: 1536 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 64 | |
est. max blocks per CU/WGP: 32 | |
streams: 1 | |
threads per block: 128 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 524288 | |
max warps active: 2600 | |
max blocks active: 650 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 54.1667 | |
est. max blocks per CU/WGP: 13.5417 | |
streams: 4 | |
threads per block: 128 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 524288 | |
max warps active: 3072 | |
max blocks active: 768 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 64 | |
est. max blocks per CU/WGP: 16 | |
streams: 1 | |
threads per block: 256 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 1048576 | |
max warps active: 3048 | |
max blocks active: 381 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 63.5 | |
est. max blocks per CU/WGP: 7.9375 | |
streams: 4 | |
threads per block: 256 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 1048576 | |
max warps active: 3072 | |
max blocks active: 384 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 64 | |
est. max blocks per CU/WGP: 8 | |
streams: 1 | |
threads per block: 384 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 1572864 | |
max warps active: 2881 | |
max blocks active: 241 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 60.0208 | |
est. max blocks per CU/WGP: 5.02083 | |
streams: 4 | |
threads per block: 384 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 1572864 | |
max warps active: 2880 | |
max blocks active: 240 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 60 | |
est. max blocks per CU/WGP: 5 | |
streams: 1 | |
threads per block: 512 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 2097152 | |
max warps active: 3072 | |
max blocks active: 192 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 64 | |
est. max blocks per CU/WGP: 4 | |
streams: 4 | |
threads per block: 512 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 2097152 | |
max warps active: 3072 | |
max blocks active: 192 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 64 | |
est. max blocks per CU/WGP: 4 | |
streams: 1 | |
threads per block: 768 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 3145728 | |
max warps active: 2305 | |
max blocks active: 96 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 48.0208 | |
est. max blocks per CU/WGP: 2 | |
streams: 4 | |
threads per block: 768 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 3145728 | |
max warps active: 2305 | |
max blocks active: 97 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 48.0208 | |
est. max blocks per CU/WGP: 2.02083 | |
streams: 1 | |
threads per block: 1024 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 4194304 | |
max warps active: 3072 | |
max blocks active: 96 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 64 | |
est. max blocks per CU/WGP: 2 | |
streams: 4 | |
threads per block: 1024 | |
blocks: 4096 | |
warp size: 32 | |
number of multiprocessors (CUs or WGPs): 48 | |
total threads: 4194304 | |
max warps active: 3072 | |
max blocks active: 96 | |
max threads per multiprocessor: 2048 | |
est. max warps per CU/WGP: 64 | |
est. max blocks per CU/WGP: 2 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment