Skip to content

Instantly share code, notes, and snippets.

@sateesh12
Last active July 21, 2024 17:46
Show Gist options
  • Save sateesh12/f5d600a385bbf719711f364ed3640a93 to your computer and use it in GitHub Desktop.
Save sateesh12/f5d600a385bbf719711f364ed3640a93 to your computer and use it in GitHub Desktop.
21072024_Cpp_CUDA.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyN7nQmzr9CppU7ABly48i0M",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/sateesh12/f5d600a385bbf719711f364ed3640a93/21072024_cpp_cuda.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"source": [
"#Author : Sateesh Kalidas\n",
"#Date : 21/July/2024\n",
"#Purpose: Trying out CUDA on NVidia GPU via co-lab\n",
"#Inspiration : https://colab.research.google.com/github/NVDLI/notebooks/blob/master/even-easier-cuda/An_Even_Easier_Introduction_to_CUDA.ipynb"
],
"metadata": {
"id": "cmqXm_RRE7SA"
}
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "tzwxlXBxE55v",
"outputId": "21e125f2-1c76-45a2-9fdd-164f653fb200"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Overwriting add.cu\n"
]
}
],
"source": [
"%%writefile add.cu\n",
"// The file name ends with .cu to make it compilable by nvcc, which is the NVIDIA counterpart of gcc\n",
"\n",
"#include <iostream>\n",
"#include <math.h>\n",
"/*\n",
" * Method: add\n",
" * y: pointer to a super large array, which is also where the summation result is stored\n",
" * x: pointer to millions of input elements\n",
" * Simply add two numbers\n",
" * return: void\n",
"*/\n",
"\n",
"#define GPU\n",
"\n",
"// The __global__ qualifier marks this function as a kernel: compiled for and executed on the GPU, launched from the host.\n",
"#ifdef GPU\n",
"__global__\n",
"void add(int n, float *y, float *x)\n",
"{\n",
" int index = blockIdx.x * blockDim.x + threadIdx.x;\n",
" int stride = blockDim.x * gridDim.x;\n",
"\n",
"\n",
" for(int i = index; i < n; i += stride)\n",
" {\n",
" y[i] = y[i] + x[i];\n",
" printf(\"index: %d\\n\", i);\n",
" printf(\"stride: %d\\n\", stride);\n",
"\n",
" }\n",
"}\n",
"// The below code is cool, there is no for loop, but addition is happening\n",
"// due to the parallelism of the CUDA kernel.\n",
"// The below code is from\n",
"// https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/\n",
"\n",
"__global__\n",
"void add_without_loop(int n, float *y, float *x)\n",
"{\n",
" int index = threadIdx.x + blockIdx.x * blockDim.x;\n",
" if(index < n)\n",
" {\n",
" y[index] = y[index] + x[index];\n",
" }\n",
"\n",
"}\n",
"#endif\n",
"\n",
"int main()\n",
"{\n",
" float *x, *y;\n",
" int N = 1 << 20;\n",
"#ifdef CPU\n",
" float *x = new float[N];\n",
" float *y = new float[N];\n",
"#endif\n",
"\n",
"#ifdef GPU\n",
" cudaMallocManaged(&x, N*sizeof(float));\n",
" cudaMallocManaged(&y, N*sizeof(float));\n",
"#endif\n",
"\n",
" // Initialize the super large array\n",
"\n",
" for(int i = 0; i < N; i++)\n",
" {\n",
" x[i] = 2.0f;\n",
" y[i] = 1.0f;\n",
"\n",
" }\n",
"\n",
" // Launch the addition kernel on the GPU.\n",
" // This is done with the <<<...>>> launch syntax in the code below.\n",
"#ifdef CPU\n",
" add(N, y, x);\n",
"#endif\n",
"\n",
"#ifdef GPU\n",
" // Compute the number of blocks needed\n",
" int blockSize = 256;\n",
" int numBlocks = (N + blockSize - 1)/blockSize;\n",
"\n",
" // Invoke the kernel in parallel configuration\n",
"\n",
" add_without_loop<<<numBlocks,blockSize>>>(N, y, x);\n",
" cudaDeviceSynchronize();\n",
" // Below values for 2 << 10 element size.\n",
" // <<1 block , 1 thread>> takes 139ms\n",
" // <<1 block, 32 threads>> takes 14ms\n",
" // <<1 block, 256 threads>> takes 3ms\n",
" // << 1 block, 256 threads>> changes matter only if threadIdx.x and blockDim.x are used !\n",
" // << computed block size, 256>> 3.3ms\n",
"\n",
"#endif\n",
"\n",
" // Validate the results\n",
" float maxError = 0.0f;\n",
" for (int i =0; i < N; i++)\n",
" {\n",
" maxError = fmax(maxError, fabs(y[i] - 3.0f));\n",
" }\n",
" std::cout << \"Max Error is:\" << maxError << std::endl;\n",
"\n",
" // Free the memory resources\n",
"#ifdef CPU\n",
" delete [] x;\n",
" delete [] y;\n",
"#endif\n",
"\n",
"#ifdef GPU\n",
" cudaFree(x);\n",
" cudaFree(y);\n",
"#endif\n",
"}\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"source": [
"%%shell\n",
"rm add_cuda\n",
"nvcc add.cu -o add_cuda"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Ed-YbD_cKhYu",
"outputId": "9c33d810-ec4a-48dc-cf24-2c277707caf5"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"rm: cannot remove 'add_cuda': No such file or directory\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": []
},
"metadata": {},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"source": [
"%%shell\n",
"nvprof ./add_cuda"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "bogwPHjmN6E7",
"outputId": "8c34eeed-ff41-4d12-fa19-dabdfb872d0d"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"==677== NVPROF is profiling process 677, command: ./add_cuda\n",
"Max Error is:0\n",
"==677== Profiling application: ./add_cuda\n",
"==677== Profiling result:\n",
" Type Time(%) Time Calls Avg Min Max Name\n",
" GPU activities: 100.00% 2.6142ms 1 2.6142ms 2.6142ms 2.6142ms add_without_loop(int, float*, float*)\n",
" API calls: 67.61% 237.93ms 2 118.96ms 45.594us 237.88ms cudaMallocManaged\n",
" 31.44% 110.64ms 1 110.64ms 110.64ms 110.64ms cudaLaunchKernel\n",
" 0.74% 2.6037ms 1 2.6037ms 2.6037ms 2.6037ms cudaDeviceSynchronize\n",
" 0.15% 511.35us 2 255.67us 235.67us 275.68us cudaFree\n",
" 0.06% 200.41us 114 1.7570us 205ns 82.393us cuDeviceGetAttribute\n",
" 0.00% 13.353us 1 13.353us 13.353us 13.353us cuDeviceGetName\n",
" 0.00% 9.2590us 1 9.2590us 9.2590us 9.2590us cuDeviceGetPCIBusId\n",
" 0.00% 5.1830us 1 5.1830us 5.1830us 5.1830us cuDeviceTotalMem\n",
" 0.00% 2.0120us 3 670ns 364ns 1.1910us cuDeviceGetCount\n",
" 0.00% 1.4340us 2 717ns 373ns 1.0610us cuDeviceGet\n",
" 0.00% 819ns 1 819ns 819ns 819ns cuModuleGetLoadingMode\n",
" 0.00% 382ns 1 382ns 382ns 382ns cuDeviceGetUuid\n",
"\n",
"==677== Unified Memory profiling result:\n",
"Device \"Tesla T4 (0)\"\n",
" Count Avg Size Min Size Max Size Total Size Total Time Name\n",
" 92 89.043KB 4.0000KB 976.00KB 8.000000MB 913.9070us Host To Device\n",
" 24 170.67KB 4.0000KB 0.9961MB 4.000000MB 361.1440us Device To Host\n",
" 10 - - - - 2.564402ms Gpu page fault groups\n",
"Total CPU Page faults: 36\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": []
},
"metadata": {},
"execution_count": 5
}
]
},
{
"cell_type": "code",
"source": [
"%%shell\n",
"nvidia-smi"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "p8kirdLNVe-Z",
"outputId": "cf5103af-f033-4166-8266-e4935f10917e"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Sun Jul 21 14:48:25 2024 \n",
"+---------------------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |\n",
"|-----------------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|=========================================+======================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
"| N/A 76C P0 40W / 70W | 0MiB / 15360MiB | 2% Default |\n",
"| | | N/A |\n",
"+-----------------------------------------+----------------------+----------------------+\n",
" \n",
"+---------------------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=======================================================================================|\n",
"| No running processes found |\n",
"+---------------------------------------------------------------------------------------+\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": []
},
"metadata": {},
"execution_count": 12
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment