// Source: GitHub gist malfet/790a78754ad22fb5bcc67939e5760c69 by @malfet, created October 21, 2025.
#include <stdio.h>
#include <chrono>
// Empty kernel: performs no work, so timing its launches isolates launch overhead.
__global__ void noop()
{
}
/**
 * Micro-benchmark: average time per launch of the empty `noop` kernel on device 0.
 *
 * Queries device 0, prints its basic properties, issues `launch_count`
 * back-to-back launches, waits for the device to drain, and reports the
 * average wall-clock time per launch in microseconds.
 *
 * @param argc  If greater than 1 (any extra command-line argument is given),
 *              cudaDeviceScheduleYield is requested via cudaSetDeviceFlags
 *              before the timed loop.
 * @param argv  Unused beyond its count.
 * @return      cudaSuccess (0) on success, otherwise the cudaError_t of the
 *              failing call, used as the process exit code.
 */
int main(int argc, const char *argv[]) {
    cudaDeviceProp prop;
    auto rc = cudaGetDeviceProperties(&prop, 0);
    // Validate the query before reading `prop`: on failure the struct is uninitialized.
    if (rc != cudaSuccess) {
        printf("cudaGetDeviceProperties()=%d (%s)\n", static_cast<int>(rc), cudaGetErrorString(rc));
        return rc;
    }
    printf("Running on %s sm%d.%d multiProcessorCount = %d maxBlocksPerMultiProcessor = %d maxThreadsPerBlock = %d\n",
           prop.name, prop.major, prop.minor, prop.multiProcessorCount,
           prop.maxBlocksPerMultiProcessor, prop.maxThreadsPerBlock);

    if (argc > 1) {
        rc = cudaSetDeviceFlags(cudaDeviceScheduleYield);
        if (rc != cudaSuccess) {
            // Best effort: report and continue with the default scheduling policy.
            printf("cudaSetDeviceFlags()=%d (%s)\n", static_cast<int>(rc), cudaGetErrorString(rc));
        }
    }

    constexpr int launch_count = 16384;
    const int grid_size = prop.multiProcessorCount * 10;
    // The block size must not exceed the device limit; a larger value makes every
    // launch fail with an invalid-configuration error and nothing would be measured.
    const int block_size = prop.maxThreadsPerBlock;

    // Clear any stale error (e.g. from a failed cudaSetDeviceFlags above) so the
    // post-loop check reports only launch failures.
    (void)cudaGetLastError();

    // steady_clock is monotonic, unlike system_clock, so it is safe for intervals.
    const auto start = std::chrono::steady_clock::now();
    for (int cnt = 0; cnt < launch_count; ++cnt) {
        noop<<<grid_size, block_size>>>();
    }
    rc = cudaDeviceSynchronize();
    const auto end = std::chrono::steady_clock::now();

    if (rc != cudaSuccess) {
        printf("cudaDeviceSynchronize()=%d (%s)\n", static_cast<int>(rc), cudaGetErrorString(rc));
        return rc;
    }

    // Launch-configuration errors are not returned by the launch itself nor by the
    // synchronize call; they are recorded and must be fetched explicitly. Checking
    // once after the loop keeps the timed region free of extra API calls.
    rc = cudaGetLastError();
    if (rc != cudaSuccess) {
        printf("noop kernel launch failed: %d (%s)\n", static_cast<int>(rc), cudaGetErrorString(rc));
        return rc;
    }

    const auto duration_us =
        std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
    // Total microseconds divided by the number of launches gives microseconds per launch.
    printf("Launching CUDA kernel takes %.2f us\n", 1. * duration_us / launch_count);
    return rc;
}
// (GitHub page footer: sign-in prompt for commenting; not part of the program.)