Last active
July 21, 2024 17:46
-
-
Save sateesh12/f5d600a385bbf719711f364ed3640a93 to your computer and use it in GitHub Desktop.
21072024_Cpp_CUDA.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"provenance": [], | |
"gpuType": "T4", | |
"authorship_tag": "ABX9TyN7nQmzr9CppU7ABly48i0M", | |
"include_colab_link": true | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
}, | |
"accelerator": "GPU" | |
}, | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"id": "view-in-github", | |
"colab_type": "text" | |
}, | |
"source": [ | |
"<a href=\"https://colab.research.google.com/gist/sateesh12/f5d600a385bbf719711f364ed3640a93/21072024_cpp_cuda.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"source": [ | |
"#Author : Sateesh Kalidas\n", | |
"#Date : 21/July/2024\n", | |
"#Purpose: Trying out CUDA on an NVIDIA GPU via Colab\n", | |
"#Inspiration : https://colab.research.google.com/github/NVDLI/notebooks/blob/master/even-easier-cuda/An_Even_Easier_Introduction_to_CUDA.ipynb" | |
], | |
"metadata": { | |
"id": "cmqXm_RRE7SA" | |
} | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "tzwxlXBxE55v", | |
"outputId": "21e125f2-1c76-45a2-9fdd-164f653fb200" | |
}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Overwriting add.cu\n" | |
] | |
} | |
], | |
"source": [ | |
"%%writefile add.cu\n", | |
"// The file name ends with .cu so that it can be compiled by nvcc, which is the NVIDIA counterpart of gcc\n", | |
"\n", | |
"#include <iostream>\n", | |
"#include <math.h>\n", | |
"/*\n", | |
" * Method: add\n", | |
" * n is the number of elements in each array\n", | |
" * y is a pointer to a very large array; it is an input and also receives the sum\n", | |
" * x is a pointer to millions of elements, which is the other input\n", | |
" * Simply adds the two arrays element by element (y[i] = y[i] + x[i])\n", | |
" * return: void\n", | |
"*/\n", | |
"\n", | |
"#define GPU\n", | |
"\n", | |
"// The global compiler directive moves the so called Kernel to the GPU for handling.\n", | |
"#ifdef GPU\n", | |
"// Element-wise sum: y[i] = y[i] + x[i] for every i in [0, n).\n", | |
"//   n : number of elements to process\n", | |
"//   y : input array that also receives the result\n", | |
"//   x : input array that is added into y\n", | |
"// Grid-stride loop: each thread starts at its global index and advances by the\n", | |
"// total number of launched threads (blockDim.x * gridDim.x), so any launch\n", | |
"// configuration covers all n elements.\n", | |
"// Per-element tracing is compiled in only when ADD_TRACE is defined; printing\n", | |
"// two lines for every element unconditionally floods the output for large n.\n", | |
"__global__\n", | |
"void add(int n, float *y, float *x)\n", | |
"{\n", | |
"  const int first = blockIdx.x * blockDim.x + threadIdx.x;\n", | |
"  const int stride = blockDim.x * gridDim.x;\n", | |
"\n", | |
"  for (int i = first; i < n; i += stride)\n", | |
"  {\n", | |
"    y[i] = y[i] + x[i];\n", | |
"#ifdef ADD_TRACE\n", | |
"    printf(\"index: %d\\n\", i);\n", | |
"    printf(\"stride: %d\\n\", stride);\n", | |
"#endif\n", | |
"  }\n", | |
"}\n", | |
"// The below code is cool, there is no for loop, but addition is happening\n", | |
"// due to the parallelism of the CUDA kernel.\n", | |
"// The below code is from\n", | |
"// https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/\n", | |
"\n", | |
"// One-thread-per-element sum: the thread whose global index is i performs\n", | |
"// y[i] = y[i] + x[i]; threads whose index is n or larger do nothing.\n", | |
"//   n : number of elements to process\n", | |
"//   y : input array that also receives the result\n", | |
"//   x : input array that is added into y\n", | |
"__global__\n", | |
"void add_without_loop(int n, float *y, float *x)\n", | |
"{\n", | |
"  const int gid = threadIdx.x + blockIdx.x * blockDim.x;\n", | |
"\n", | |
"  // Surplus threads of the last block fall outside the arrays.\n", | |
"  if (gid >= n)\n", | |
"  {\n", | |
"    return;\n", | |
"  }\n", | |
"\n", | |
"  y[gid] = y[gid] + x[gid];\n", | |
"}\n", | |
"#endif\n", | |
"\n", | |
"// Host driver: fills x with 2.0f and y with 1.0f, computes y[i] = y[i] + x[i]\n", | |
"// (on the GPU when GPU is defined, otherwise on the host when CPU is defined),\n", | |
"// then prints the largest deviation of y from the expected 3.0f.\n", | |
"// Returns 0 on success and 1 when a CUDA call reports an error.\n", | |
"int main()\n", | |
"{\n", | |
"  const int N = 1 << 20;  // number of elements in each array\n", | |
"  float *x = nullptr;\n", | |
"  float *y = nullptr;\n", | |
"\n", | |
"  // Exactly one backend is compiled in; GPU takes precedence if both macros are defined.\n", | |
"#if defined(GPU)\n", | |
"  // Managed memory: the same pointers are written by the host loop below and read by the kernel.\n", | |
"  cudaError_t err = cudaMallocManaged(&x, N * sizeof(float));\n", | |
"  if (err == cudaSuccess)\n", | |
"  {\n", | |
"    err = cudaMallocManaged(&y, N * sizeof(float));\n", | |
"  }\n", | |
"  if (err != cudaSuccess)\n", | |
"  {\n", | |
"    std::cerr << \"cudaMallocManaged failed: \" << cudaGetErrorString(err) << std::endl;\n", | |
"    cudaFree(x);  // no-op when x is still nullptr\n", | |
"    return 1;\n", | |
"  }\n", | |
"#elif defined(CPU)\n", | |
"  // Assign to the pointers declared above; re-declaring them here would not compile.\n", | |
"  x = new float[N];\n", | |
"  y = new float[N];\n", | |
"#else\n", | |
"#error \"Define either GPU or CPU before main()\"\n", | |
"#endif\n", | |
"\n", | |
"  // Initialize the super large arrays\n", | |
"  for (int i = 0; i < N; i++)\n", | |
"  {\n", | |
"    x[i] = 2.0f;\n", | |
"    y[i] = 1.0f;\n", | |
"  }\n", | |
"\n", | |
"#if defined(GPU)\n", | |
"  // Compute the number of blocks needed: round up so every element gets a thread.\n", | |
"  const int blockSize = 256;\n", | |
"  const int numBlocks = (N + blockSize - 1) / blockSize;\n", | |
"\n", | |
"  // Invoke the kernel in a parallel configuration.\n", | |
"  // This is done with the <<<blocks, threads>>> marker in code.\n", | |
"  add_without_loop<<<numBlocks, blockSize>>>(N, y, x);\n", | |
"  err = cudaGetLastError();  // catches an invalid launch configuration\n", | |
"  if (err == cudaSuccess)\n", | |
"  {\n", | |
"    err = cudaDeviceSynchronize();  // wait for the kernel before the host reads y\n", | |
"  }\n", | |
"  if (err != cudaSuccess)\n", | |
"  {\n", | |
"    std::cerr << \"add_without_loop failed: \" << cudaGetErrorString(err) << std::endl;\n", | |
"    cudaFree(x);\n", | |
"    cudaFree(y);\n", | |
"    return 1;\n", | |
"  }\n", | |
"  // Timings noted by the author (recorded as \"2 << 10 element size\";\n", | |
"  // NOTE(review): N above is 1 << 20 - confirm which size was measured).\n", | |
"  // <<<1 block, 1 thread>>> takes 139ms\n", | |
"  // <<<1 block, 32 threads>>> takes 14ms\n", | |
"  // <<<1 block, 256 threads>>> takes 3ms\n", | |
"  // <<<1 block, 256 threads>>> changes matter only if threadIdx.x and blockDim.x are used!\n", | |
"  // <<<computed block count, 256 threads>>> takes 3.3ms\n", | |
"#else\n", | |
"  // Host fallback: the same element-wise sum, computed serially.\n", | |
"  for (int i = 0; i < N; i++)\n", | |
"  {\n", | |
"    y[i] = y[i] + x[i];\n", | |
"  }\n", | |
"#endif\n", | |
"\n", | |
"  // Validate the results: every element should be 1.0f + 2.0f = 3.0f\n", | |
"  float maxError = 0.0f;\n", | |
"  for (int i = 0; i < N; i++)\n", | |
"  {\n", | |
"    maxError = fmax(maxError, fabs(y[i] - 3.0f));\n", | |
"  }\n", | |
"  std::cout << \"Max Error is:\" << maxError << std::endl;\n", | |
"\n", | |
"  // Free the memory resources with the call that matches the allocation\n", | |
"#if defined(GPU)\n", | |
"  cudaFree(x);\n", | |
"  cudaFree(y);\n", | |
"#else\n", | |
"  delete [] x;\n", | |
"  delete [] y;\n", | |
"#endif\n", | |
"  return 0;\n", | |
"}\n", | |
"\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%%shell\n", | |
"# -f: stay quiet when no previous binary exists (e.g. on a fresh runtime)\n", | |
"rm -f add_cuda\n", | |
"nvcc add.cu -o add_cuda" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "Ed-YbD_cKhYu", | |
"outputId": "9c33d810-ec4a-48dc-cf24-2c277707caf5" | |
}, | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"rm: cannot remove 'add_cuda': No such file or directory\n" | |
] | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [] | |
}, | |
"metadata": {}, | |
"execution_count": 4 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%%shell\n", | |
"nvprof ./add_cuda" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "bogwPHjmN6E7", | |
"outputId": "8c34eeed-ff41-4d12-fa19-dabdfb872d0d" | |
}, | |
"execution_count": 5, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"==677== NVPROF is profiling process 677, command: ./add_cuda\n", | |
"Max Error is:0\n", | |
"==677== Profiling application: ./add_cuda\n", | |
"==677== Profiling result:\n", | |
" Type Time(%) Time Calls Avg Min Max Name\n", | |
" GPU activities: 100.00% 2.6142ms 1 2.6142ms 2.6142ms 2.6142ms add_without_loop(int, float*, float*)\n", | |
" API calls: 67.61% 237.93ms 2 118.96ms 45.594us 237.88ms cudaMallocManaged\n", | |
" 31.44% 110.64ms 1 110.64ms 110.64ms 110.64ms cudaLaunchKernel\n", | |
" 0.74% 2.6037ms 1 2.6037ms 2.6037ms 2.6037ms cudaDeviceSynchronize\n", | |
" 0.15% 511.35us 2 255.67us 235.67us 275.68us cudaFree\n", | |
" 0.06% 200.41us 114 1.7570us 205ns 82.393us cuDeviceGetAttribute\n", | |
" 0.00% 13.353us 1 13.353us 13.353us 13.353us cuDeviceGetName\n", | |
" 0.00% 9.2590us 1 9.2590us 9.2590us 9.2590us cuDeviceGetPCIBusId\n", | |
" 0.00% 5.1830us 1 5.1830us 5.1830us 5.1830us cuDeviceTotalMem\n", | |
" 0.00% 2.0120us 3 670ns 364ns 1.1910us cuDeviceGetCount\n", | |
" 0.00% 1.4340us 2 717ns 373ns 1.0610us cuDeviceGet\n", | |
" 0.00% 819ns 1 819ns 819ns 819ns cuModuleGetLoadingMode\n", | |
" 0.00% 382ns 1 382ns 382ns 382ns cuDeviceGetUuid\n", | |
"\n", | |
"==677== Unified Memory profiling result:\n", | |
"Device \"Tesla T4 (0)\"\n", | |
" Count Avg Size Min Size Max Size Total Size Total Time Name\n", | |
" 92 89.043KB 4.0000KB 976.00KB 8.000000MB 913.9070us Host To Device\n", | |
" 24 170.67KB 4.0000KB 0.9961MB 4.000000MB 361.1440us Device To Host\n", | |
" 10 - - - - 2.564402ms Gpu page fault groups\n", | |
"Total CPU Page faults: 36\n" | |
] | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [] | |
}, | |
"metadata": {}, | |
"execution_count": 5 | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"source": [ | |
"%%shell\n", | |
"nvidia-smi" | |
], | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "p8kirdLNVe-Z", | |
"outputId": "cf5103af-f033-4166-8266-e4935f10917e" | |
}, | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"name": "stdout", | |
"text": [ | |
"Sun Jul 21 14:48:25 2024 \n", | |
"+---------------------------------------------------------------------------------------+\n", | |
"| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 |\n", | |
"|-----------------------------------------+----------------------+----------------------+\n", | |
"| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n", | |
"| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n", | |
"| | | MIG M. |\n", | |
"|=========================================+======================+======================|\n", | |
"| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n", | |
"| N/A 76C P0 40W / 70W | 0MiB / 15360MiB | 2% Default |\n", | |
"| | | N/A |\n", | |
"+-----------------------------------------+----------------------+----------------------+\n", | |
" \n", | |
"+---------------------------------------------------------------------------------------+\n", | |
"| Processes: |\n", | |
"| GPU GI CI PID Type Process name GPU Memory |\n", | |
"| ID ID Usage |\n", | |
"|=======================================================================================|\n", | |
"| No running processes found |\n", | |
"+---------------------------------------------------------------------------------------+\n" | |
] | |
}, | |
{ | |
"output_type": "execute_result", | |
"data": { | |
"text/plain": [] | |
}, | |
"metadata": {}, | |
"execution_count": 12 | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment