Skip to content

Instantly share code, notes, and snippets.

@Snektron
Last active January 9, 2025 15:15
Show Gist options
  • Save Snektron/1fb62a39ee0d7b572c3441f0a53d310c to your computer and use it in GitHub Desktop.
Save Snektron/1fb62a39ee0d7b572c3441f0a53d310c to your computer and use it in GitHub Desktop.
Measure maximum number of waves and blocks per CU
#include <hip/hip_runtime.h>
#include <iostream>
#include <vector>
// Occupancy probe: every wavefront and every block registers itself in a global
// counter on entry, folds the resulting count into a running maximum, idles for
// a while, and deregisters on exit.
//
// active_warps / active_blocks: counters of currently registered waves / blocks.
// max_warps / max_blocks: highest counter value observed by any registrant.
// NOTE(review): block accounting keys on threadIdx.x only, so it presumably
// expects 1-D blocks -- confirm against the launch configuration.
__global__ void kernel(unsigned* active_warps, unsigned* max_warps, unsigned* active_blocks, unsigned* max_blocks) {
    // One representative lane per wavefront and one thread per block do the bookkeeping.
    const bool is_wave_leader = __lane_id() == 0;
    const bool is_block_leader = threadIdx.x == 0;

    if (is_wave_leader) {
        // atomicAdd returns the previous value; +1 gives the count including this wave.
        const unsigned waves_now = atomicAdd(active_warps, 1) + 1;
        atomicMax(max_warps, waves_now);
    }

    if (is_block_leader) {
        const unsigned blocks_now = atomicAdd(active_blocks, 1) + 1;
        atomicMax(max_blocks, blocks_now);
    }

    // Simulate some work
    for (unsigned int remaining = 10000; remaining != 0; --remaining) {
        __builtin_amdgcn_s_sleep(0x7F);
    }

    if (is_wave_leader) {
        atomicSub(active_warps, 1);
    }

    if (is_block_leader) {
        atomicSub(active_blocks, 1);
    }
}
void test(int threads, int n_streams) {
unsigned* d_gpu_data;
hipMalloc(&d_gpu_data, sizeof(unsigned) * 4);
hipMemset(d_gpu_data, 0, sizeof(unsigned) * 4);
constexpr int blocks = 1024 * 4;
std::vector<hipStream_t> streams(n_streams);
for (int i = 0; i < n_streams; ++i) {
hipStreamCreateWithFlags(&streams[i], hipStreamNonBlocking);
}
for (int i = 0; i < n_streams; ++i) {
kernel<<<blocks, threads, 0, streams[i]>>>(d_gpu_data, d_gpu_data + 1, d_gpu_data + 2, d_gpu_data + 3);
}
hipDeviceSynchronize();
unsigned max_warps, max_blocks;
hipMemcpy(&max_warps, d_gpu_data + 1, sizeof(unsigned), hipMemcpyDeviceToHost);
hipMemcpy(&max_blocks, d_gpu_data + 3, sizeof(unsigned), hipMemcpyDeviceToHost);
hipDeviceProp_t props;
hipGetDeviceProperties(&props, 0);
std::cout << "streams: " << n_streams << std::endl;
std::cout << "threads per block: " << threads << std::endl;
std::cout << "blocks: " << blocks << std::endl;
std::cout << "warp size: " << props.warpSize << std::endl;
std::cout << "number of multiprocessors (CUs or WGPs): " << props.multiProcessorCount << std::endl;
std::cout << "total threads: " << (threads * blocks) << std::endl;
std::cout << "max warps active: " << max_warps << std::endl;
std::cout << "max blocks active: " << max_blocks << std::endl;
std::cout << "max threads per multiprocessor: " << props.maxThreadsPerMultiProcessor << std::endl;
std::cout << "est. max warps per CU/WGP: " << max_warps / static_cast<float>(props.multiProcessorCount) << std::endl;
std::cout << "est. max blocks per CU/WGP: " << max_blocks / static_cast<float>(props.multiProcessorCount) << std::endl;
std::cout << std::endl;
for (int i = 0; i < n_streams; ++i) {
hipStreamDestroy(streams[i]);
}
}
// Runs the occupancy measurement for every combination of block size and
// stream count, block size varying slowest.
int main() {
    constexpr int block_sizes[] = {32, 64, 128, 256, 384, 512, 768, 1024};
    constexpr int stream_counts[] = {1, 4};

    for (const int block_size : block_sizes) {
        for (const int stream_count : stream_counts) {
            test(block_size, stream_count);
        }
    }
    return 0;
}
streams: 1
threads per block: 32
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 131072
max warps active: 4096
max blocks active: 4096
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 34.1333
est. max blocks per CU/WGP: 34.1333
streams: 4
threads per block: 32
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 131072
max warps active: 4800
max blocks active: 4800
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 40
est. max blocks per CU/WGP: 40
streams: 1
threads per block: 64
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 262144
max warps active: 4096
max blocks active: 4096
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 34.1333
est. max blocks per CU/WGP: 34.1333
streams: 4
threads per block: 64
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 262144
max warps active: 4800
max blocks active: 4800
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 40
est. max blocks per CU/WGP: 40
streams: 1
threads per block: 128
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 524288
max warps active: 3840
max blocks active: 1920
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 16
streams: 4
threads per block: 128
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 524288
max warps active: 3840
max blocks active: 1920
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 16
streams: 1
threads per block: 256
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 1048576
max warps active: 4096
max blocks active: 1208
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 34.1333
est. max blocks per CU/WGP: 10.0667
streams: 4
threads per block: 256
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 1048576
max warps active: 4800
max blocks active: 1415
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 40
est. max blocks per CU/WGP: 11.7917
streams: 1
threads per block: 384
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 1572864
max warps active: 4080
max blocks active: 794
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 34
est. max blocks per CU/WGP: 6.61667
streams: 4
threads per block: 384
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 1572864
max warps active: 4080
max blocks active: 807
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 34
est. max blocks per CU/WGP: 6.725
streams: 1
threads per block: 512
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 2097152
max warps active: 4096
max blocks active: 612
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 34.1333
est. max blocks per CU/WGP: 5.1
streams: 4
threads per block: 512
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 2097152
max warps active: 4800
max blocks active: 706
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 40
est. max blocks per CU/WGP: 5.88333
streams: 1
threads per block: 768
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 3145728
max warps active: 4032
max blocks active: 406
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 33.6
est. max blocks per CU/WGP: 3.38333
streams: 4
threads per block: 768
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 3145728
max warps active: 4320
max blocks active: 432
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 36
est. max blocks per CU/WGP: 3.6
streams: 1
threads per block: 1024
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 4194304
max warps active: 3841
max blocks active: 299
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 32.0083
est. max blocks per CU/WGP: 2.49167
streams: 4
threads per block: 1024
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 120
total threads: 4194304
max warps active: 3840
max blocks active: 299
max threads per multiprocessor: 2560
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 2.49167
streams: 1
threads per block: 32
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 131072
max warps active: 3328
max blocks active: 3328
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 32
streams: 4
threads per block: 32
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 131072
max warps active: 3328
max blocks active: 3328
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 32
streams: 1
threads per block: 64
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 262144
max warps active: 3328
max blocks active: 3328
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 32
streams: 4
threads per block: 64
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 262144
max warps active: 3328
max blocks active: 3328
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 32
streams: 1
threads per block: 128
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 524288
max warps active: 3328
max blocks active: 1664
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 16
streams: 4
threads per block: 128
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 524288
max warps active: 3320
max blocks active: 1664
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 31.9231
est. max blocks per CU/WGP: 16
streams: 1
threads per block: 256
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 1048576
max warps active: 3328
max blocks active: 975
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 9.375
streams: 4
threads per block: 256
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 1048576
max warps active: 3328
max blocks active: 942
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 9.05769
streams: 1
threads per block: 384
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 1572864
max warps active: 2868
max blocks active: 556
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 27.5769
est. max blocks per CU/WGP: 5.34615
streams: 4
threads per block: 384
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 1572864
max warps active: 2682
max blocks active: 480
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 25.7885
est. max blocks per CU/WGP: 4.61538
streams: 1
threads per block: 512
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 2097152
max warps active: 3328
max blocks active: 510
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 4.90385
streams: 4
threads per block: 512
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 2097152
max warps active: 3328
max blocks active: 466
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 4.48077
streams: 1
threads per block: 768
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 3145728
max warps active: 2505
max blocks active: 283
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 24.0865
est. max blocks per CU/WGP: 2.72115
streams: 4
threads per block: 768
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 3145728
max warps active: 2515
max blocks active: 241
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 24.1827
est. max blocks per CU/WGP: 2.31731
streams: 1
threads per block: 1024
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 4194304
max warps active: 3328
max blocks active: 234
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 2.25
streams: 4
threads per block: 1024
blocks: 4096
warp size: 64
number of multiprocessors (CUs or WGPs): 104
total threads: 4194304
max warps active: 3328
max blocks active: 209
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 32
est. max blocks per CU/WGP: 2.00962
streams: 1
threads per block: 32
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 131072
max warps active: 3067
max blocks active: 3067
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 63.8958
est. max blocks per CU/WGP: 63.8958
streams: 4
threads per block: 32
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 131072
max warps active: 3068
max blocks active: 3068
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 63.9167
est. max blocks per CU/WGP: 63.9167
streams: 1
threads per block: 64
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 262144
max warps active: 2976
max blocks active: 1488
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 62
est. max blocks per CU/WGP: 31
streams: 4
threads per block: 64
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 262144
max warps active: 3072
max blocks active: 1536
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 64
est. max blocks per CU/WGP: 32
streams: 1
threads per block: 128
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 524288
max warps active: 2600
max blocks active: 650
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 54.1667
est. max blocks per CU/WGP: 13.5417
streams: 4
threads per block: 128
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 524288
max warps active: 3072
max blocks active: 768
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 64
est. max blocks per CU/WGP: 16
streams: 1
threads per block: 256
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 1048576
max warps active: 3048
max blocks active: 381
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 63.5
est. max blocks per CU/WGP: 7.9375
streams: 4
threads per block: 256
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 1048576
max warps active: 3072
max blocks active: 384
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 64
est. max blocks per CU/WGP: 8
streams: 1
threads per block: 384
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 1572864
max warps active: 2881
max blocks active: 241
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 60.0208
est. max blocks per CU/WGP: 5.02083
streams: 4
threads per block: 384
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 1572864
max warps active: 2880
max blocks active: 240
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 60
est. max blocks per CU/WGP: 5
streams: 1
threads per block: 512
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 2097152
max warps active: 3072
max blocks active: 192
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 64
est. max blocks per CU/WGP: 4
streams: 4
threads per block: 512
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 2097152
max warps active: 3072
max blocks active: 192
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 64
est. max blocks per CU/WGP: 4
streams: 1
threads per block: 768
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 3145728
max warps active: 2305
max blocks active: 96
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 48.0208
est. max blocks per CU/WGP: 2
streams: 4
threads per block: 768
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 3145728
max warps active: 2305
max blocks active: 97
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 48.0208
est. max blocks per CU/WGP: 2.02083
streams: 1
threads per block: 1024
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 4194304
max warps active: 3072
max blocks active: 96
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 64
est. max blocks per CU/WGP: 2
streams: 4
threads per block: 1024
blocks: 4096
warp size: 32
number of multiprocessors (CUs or WGPs): 48
total threads: 4194304
max warps active: 3072
max blocks active: 96
max threads per multiprocessor: 2048
est. max warps per CU/WGP: 64
est. max blocks per CU/WGP: 2
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment