sateesh12 · July 21, 2024 17:46
diff --git a/21072024_cpp_cuda.ipynb b/21072024_cpp_cuda.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "authorship_tag": "ABX9TyN7nQmzr9CppU7ABly48i0M",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/sateesh12/f5d600a385bbf719711f364ed3640a93/21072024_cpp_cuda.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "#Author : Sateesh Kalidas\n",
        "#Date   : 21/July/2024\n",
        "#Purpose: Trying out CUDA on NVidia GPU via co-lab\n",
        "#Inspiration : https://colab.research.google.com/github/NVDLI/notebooks/blob/master/even-easier-cuda/An_Even_Easier_Introduction_to_CUDA.ipynb"
      ],
      "metadata": {
        "id": "cmqXm_RRE7SA"
      }
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "tzwxlXBxE55v",
        "outputId": "21e125f2-1c76-45a2-9fdd-164f653fb200"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Overwriting add.cu\n"
          ]
        }
      ],
      "source": [
        "%%writefile add.cu\n",
        "// The file name is ending with .cu to vmake it be compilable by nvcc which is the Nvidia version of gcc\n",
        "\n",
        "#include <iostream>\n",
        "#include <math.h>\n",
        "/*\n",
        " * Method: add\n",
        "  * y this is a pointer to a super large array which is also where the summation is done\n",
        "  * x is a pointer to millions of elements which is the input\n",
        "  * Simply add two numbers\n",
        "  * return: void\n",
        "*/\n",
        "\n",
        "#define GPU\n",
        "\n",
        "// The global compiler directive moves the so called Kernel to the GPU for handling.\n",
        "#ifdef GPU\n",
        "__global__\n",
        "void add(int n, float *y, float *x)\n",
        "{\n",
        "    int index = blockIdx.x * blockDim.x + threadIdx.x;\n",
        "    int stride = blockDim.x * gridDim.x;\n",
        "\n",
        "\n",
        "    for(int i = index; i < n; i += stride)\n",
        "    {\n",
        "        y[i] = y[i] + x[i];\n",
        "        printf(\"index: %d\\n\", i);\n",
        "        printf(\"stride: %d\\n\", stride);\n",
        "\n",
        "    }\n",
        "}\n",
        "// The below code is cool, there is no for loop, but addition is happening\n",
        "// due to the parallemism of the CUDA kernel.\n",
        "// The below code is from\n",
        "// https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/\n",
        "\n",
        "__global__\n",
        "void add_without_loop(int n, float *y, float *x)\n",
        "{\n",
        "  int index = threadIdx.x + blockIdx.x * blockDim.x;\n",
        "  if(index < n)\n",
        "  {\n",
        "    y[index] = y[index] + x[index];\n",
        "  }\n",
        "\n",
        "}\n",
        "#endif\n",
        "\n",
        "int main()\n",
        "{\n",
        "    float *x, *y;\n",
        "    int N = 1 << 20;\n",
        "#ifdef CPU\n",
        "    float *x = new float[N];\n",
        "    float *y = new float[N];\n",
        "#endif\n",
        "\n",
        "#ifdef GPU\n",
        "    cudaMallocManaged(&x, N*sizeof(float));\n",
        "    cudaMallocManaged(&y, N*sizeof(float));\n",
        "#endif\n",
        "\n",
        "    // Initialize the super large array\n",
        "\n",
        "    for(int i = 0; i < N; i++)\n",
        "    {\n",
        "        x[i] = 2.0f;\n",
        "        y[i] = 1.0f;\n",
        "\n",
        "    }\n",
        "\n",
        "    // Add the addition kernel to the GPU\n",
        "    // This is done with the <<< marker in code.\n",
        "#ifdef CPU\n",
        "    add(N, y, x);\n",
        "#endif\n",
        "\n",
        "#ifdef GPU\n",
        "    // Compute the number of blocks needed\n",
        "    int blockSize = 256;\n",
        "    int numBlocks = (N + blockSize - 1)/blockSize;\n",
        "\n",
        "    // Invoke the kernel in paralle configuration\n",
        "\n",
        "    add_without_loop<<<numBlocks,blockSize>>>(N, y, x);\n",
        "    cudaDeviceSynchronize();\n",
        "    // Below values for 2 << 10 element size.\n",
        "    // <<1 block , 1 thread>> takes 139ms\n",
        "    // <<1 block,  32 threads>> takes 14ms\n",
        "    // <<1 block, 256 threads>> takes 3ms\n",
        "    // << 1 block, 256  threads>> changes matter only if threadIdx.x and blockDim.x are used !\n",
        "    // << computed block size, 256>> 3.3ms\n",
        "\n",
        "#endif\n",
        "\n",
        "    // Validate the results\n",
        "    float maxError = 0.0f;\n",
        "    for (int i =0; i < N; i++)\n",
        "    {\n",
        "        maxError = fmax(maxError, fabs(y[i] - 3.0f));\n",
        "    }\n",
        "    std::cout << \"Max Error is:\" << maxError << std::endl;\n",
        "\n",
        "    // Free the memory resources\n",
        "#ifdef CPU\n",
        "    delete [] x;\n",
        "    delete [] y;\n",
        "#endif\n",
        "\n",
        "#ifdef GPU\n",
        "  cudaFree(x);\n",
        "  cudaFree(y);\n",
        "#endif\n",
        "}\n",
        "\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%shell\n",
        "rm add_cuda\n",
        "nvcc add.cu -o add_cuda"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Ed-YbD_cKhYu",
        "outputId": "9c33d810-ec4a-48dc-cf24-2c277707caf5"
      },
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "rm: cannot remove 'add_cuda': No such file or directory\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": []
          },
          "metadata": {},
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%shell\n",
        "nvprof ./add_cuda"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bogwPHjmN6E7",
        "outputId": "8c34eeed-ff41-4d12-fa19-dabdfb872d0d"
      },
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "==677== NVPROF is profiling process 677, command: ./add_cuda\n",
            "Max Error is:0\n",
            "==677== Profiling application: ./add_cuda\n",
            "==677== Profiling result:\n",
            "            Type  Time(%)      Time     Calls       Avg       Min       Max  Name\n",
            " GPU activities:  100.00%  2.6142ms         1  2.6142ms  2.6142ms  2.6142ms  add_without_loop(int, float*, float*)\n",
            "      API calls:   67.61%  237.93ms         2  118.96ms  45.594us  237.88ms  cudaMallocManaged\n",
            "                   31.44%  110.64ms         1  110.64ms  110.64ms  110.64ms  cudaLaunchKernel\n",
            "                    0.74%  2.6037ms         1  2.6037ms  2.6037ms  2.6037ms  cudaDeviceSynchronize\n",
            "                    0.15%  511.35us         2  255.67us  235.67us  275.68us  cudaFree\n",
            "                    0.06%  200.41us       114  1.7570us     205ns  82.393us  cuDeviceGetAttribute\n",
            "                    0.00%  13.353us         1  13.353us  13.353us  13.353us  cuDeviceGetName\n",
            "                    0.00%  9.2590us         1  9.2590us  9.2590us  9.2590us  cuDeviceGetPCIBusId\n",
            "                    0.00%  5.1830us         1  5.1830us  5.1830us  5.1830us  cuDeviceTotalMem\n",
            "                    0.00%  2.0120us         3     670ns     364ns  1.1910us  cuDeviceGetCount\n",
            "                    0.00%  1.4340us         2     717ns     373ns  1.0610us  cuDeviceGet\n",
            "                    0.00%     819ns         1     819ns     819ns     819ns  cuModuleGetLoadingMode\n",
            "                    0.00%     382ns         1     382ns     382ns     382ns  cuDeviceGetUuid\n",
            "\n",
            "==677== Unified Memory profiling result:\n",
            "Device \"Tesla T4 (0)\"\n",
            "   Count  Avg Size  Min Size  Max Size  Total Size  Total Time  Name\n",
            "      92  89.043KB  4.0000KB  976.00KB  8.000000MB  913.9070us  Host To Device\n",
            "      24  170.67KB  4.0000KB  0.9961MB  4.000000MB  361.1440us  Device To Host\n",
            "      10         -         -         -           -  2.564402ms  Gpu page fault groups\n",
            "Total CPU Page faults: 36\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": []
          },
          "metadata": {},
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "%%shell\n",
        "nvidia-smi"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "p8kirdLNVe-Z",
        "outputId": "cf5103af-f033-4166-8266-e4935f10917e"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Sun Jul 21 14:48:25 2024       \n",
            "+---------------------------------------------------------------------------------------+\n",
            "| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |\n",
            "|-----------------------------------------+----------------------+----------------------+\n",
            "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
            "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
            "|                                         |                      |               MIG M. |\n",
            "|=========================================+======================+======================|\n",
            "|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |\n",
            "| N/A   76C    P0              40W /  70W |      0MiB / 15360MiB |      2%      Default |\n",
            "|                                         |                      |                  N/A |\n",
            "+-----------------------------------------+----------------------+----------------------+\n",
            "                                                                                         \n",
            "+---------------------------------------------------------------------------------------+\n",
            "| Processes:                                                                            |\n",
            "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
            "|        ID   ID                                                             Usage      |\n",
            "|=======================================================================================|\n",
            "|  No running processes found                                                           |\n",
            "+---------------------------------------------------------------------------------------+\n"
          ]
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": []
          },
          "metadata": {},
          "execution_count": 12
        }
      ]
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"gpuType": "T4",
	"authorship_tag": "ABX9TyN7nQmzr9CppU7ABly48i0M",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	},
	"accelerator": "GPU"
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/sateesh12/f5d600a385bbf719711f364ed3640a93/21072024_cpp_cuda.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"#Author : Sateesh Kalidas\n",
	"#Date : 21/July/2024\n",
	"#Purpose: Trying out CUDA on NVidia GPU via co-lab\n",
	"#Inspiration : https://colab.research.google.com/github/NVDLI/notebooks/blob/master/even-easier-cuda/An_Even_Easier_Introduction_to_CUDA.ipynb"
	],
	"metadata": {
	"id": "cmqXm_RRE7SA"
	}
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "tzwxlXBxE55v",
	"outputId": "21e125f2-1c76-45a2-9fdd-164f653fb200"
	},
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Overwriting add.cu\n"
	]
	}
	],
	"source": [
	"%%writefile add.cu\n",
	"// The file name is ending with .cu to vmake it be compilable by nvcc which is the Nvidia version of gcc\n",
	"\n",
	"#include <iostream>\n",
	"#include <math.h>\n",
	"/*\n",
	" * Method: add\n",
	" * y this is a pointer to a super large array which is also where the summation is done\n",
	" * x is a pointer to millions of elements which is the input\n",
	" * Simply add two numbers\n",
	" * return: void\n",
	"*/\n",
	"\n",
	"#define GPU\n",
	"\n",
	"// The global compiler directive moves the so called Kernel to the GPU for handling.\n",
	"#ifdef GPU\n",
	"__global__\n",
	"void add(int n, float y, float x)\n",
	"{\n",
	" int index = blockIdx.x * blockDim.x + threadIdx.x;\n",
	" int stride = blockDim.x * gridDim.x;\n",
	"\n",
	"\n",
	" for(int i = index; i < n; i += stride)\n",
	" {\n",
	" y[i] = y[i] + x[i];\n",
	" printf(\"index: %d\\n\", i);\n",
	" printf(\"stride: %d\\n\", stride);\n",
	"\n",
	" }\n",
	"}\n",
	"// The below code is cool, there is no for loop, but addition is happening\n",
	"// due to the parallemism of the CUDA kernel.\n",
	"// The below code is from\n",
	"// https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/\n",
	"\n",
	"__global__\n",
	"void add_without_loop(int n, float y, float x)\n",
	"{\n",
	" int index = threadIdx.x + blockIdx.x * blockDim.x;\n",
	" if(index < n)\n",
	" {\n",
	" y[index] = y[index] + x[index];\n",
	" }\n",
	"\n",
	"}\n",
	"#endif\n",
	"\n",
	"int main()\n",
	"{\n",
	" float x, y;\n",
	" int N = 1 << 20;\n",
	"#ifdef CPU\n",
	" float *x = new float[N];\n",
	" float *y = new float[N];\n",
	"#endif\n",
	"\n",
	"#ifdef GPU\n",
	" cudaMallocManaged(&x, N*sizeof(float));\n",
	" cudaMallocManaged(&y, N*sizeof(float));\n",
	"#endif\n",
	"\n",
	" // Initialize the super large array\n",
	"\n",
	" for(int i = 0; i < N; i++)\n",
	" {\n",
	" x[i] = 2.0f;\n",
	" y[i] = 1.0f;\n",
	"\n",
	" }\n",
	"\n",
	" // Add the addition kernel to the GPU\n",
	" // This is done with the <<< marker in code.\n",
	"#ifdef CPU\n",
	" add(N, y, x);\n",
	"#endif\n",
	"\n",
	"#ifdef GPU\n",
	" // Compute the number of blocks needed\n",
	" int blockSize = 256;\n",
	" int numBlocks = (N + blockSize - 1)/blockSize;\n",
	"\n",
	" // Invoke the kernel in paralle configuration\n",
	"\n",
	" add_without_loop<<<numBlocks,blockSize>>>(N, y, x);\n",
	" cudaDeviceSynchronize();\n",
	" // Below values for 2 << 10 element size.\n",
	" // <<1 block , 1 thread>> takes 139ms\n",
	" // <<1 block, 32 threads>> takes 14ms\n",
	" // <<1 block, 256 threads>> takes 3ms\n",
	" // << 1 block, 256 threads>> changes matter only if threadIdx.x and blockDim.x are used !\n",
	" // << computed block size, 256>> 3.3ms\n",
	"\n",
	"#endif\n",
	"\n",
	" // Validate the results\n",
	" float maxError = 0.0f;\n",
	" for (int i =0; i < N; i++)\n",
	" {\n",
	" maxError = fmax(maxError, fabs(y[i] - 3.0f));\n",
	" }\n",
	" std::cout << \"Max Error is:\" << maxError << std::endl;\n",
	"\n",
	" // Free the memory resources\n",
	"#ifdef CPU\n",
	" delete [] x;\n",
	" delete [] y;\n",
	"#endif\n",
	"\n",
	"#ifdef GPU\n",
	" cudaFree(x);\n",
	" cudaFree(y);\n",
	"#endif\n",
	"}\n",
	"\n",
	"\n"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"%%shell\n",
	"rm add_cuda\n",
	"nvcc add.cu -o add_cuda"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "Ed-YbD_cKhYu",
	"outputId": "9c33d810-ec4a-48dc-cf24-2c277707caf5"
	},
	"execution_count": 4,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"rm: cannot remove 'add_cuda': No such file or directory\n"
	]
	},
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": []
	},
	"metadata": {},
	"execution_count": 4
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"%%shell\n",
	"nvprof ./add_cuda"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "bogwPHjmN6E7",
	"outputId": "8c34eeed-ff41-4d12-fa19-dabdfb872d0d"
	},
	"execution_count": 5,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"==677== NVPROF is profiling process 677, command: ./add_cuda\n",
	"Max Error is:0\n",
	"==677== Profiling application: ./add_cuda\n",
	"==677== Profiling result:\n",
	" Type Time(%) Time Calls Avg Min Max Name\n",
	" GPU activities: 100.00% 2.6142ms 1 2.6142ms 2.6142ms 2.6142ms add_without_loop(int, float, float)\n",
	" API calls: 67.61% 237.93ms 2 118.96ms 45.594us 237.88ms cudaMallocManaged\n",
	" 31.44% 110.64ms 1 110.64ms 110.64ms 110.64ms cudaLaunchKernel\n",
	" 0.74% 2.6037ms 1 2.6037ms 2.6037ms 2.6037ms cudaDeviceSynchronize\n",
	" 0.15% 511.35us 2 255.67us 235.67us 275.68us cudaFree\n",
	" 0.06% 200.41us 114 1.7570us 205ns 82.393us cuDeviceGetAttribute\n",
	" 0.00% 13.353us 1 13.353us 13.353us 13.353us cuDeviceGetName\n",
	" 0.00% 9.2590us 1 9.2590us 9.2590us 9.2590us cuDeviceGetPCIBusId\n",
	" 0.00% 5.1830us 1 5.1830us 5.1830us 5.1830us cuDeviceTotalMem\n",
	" 0.00% 2.0120us 3 670ns 364ns 1.1910us cuDeviceGetCount\n",
	" 0.00% 1.4340us 2 717ns 373ns 1.0610us cuDeviceGet\n",
	" 0.00% 819ns 1 819ns 819ns 819ns cuModuleGetLoadingMode\n",
	" 0.00% 382ns 1 382ns 382ns 382ns cuDeviceGetUuid\n",
	"\n",
	"==677== Unified Memory profiling result:\n",
	"Device \"Tesla T4 (0)\"\n",
	" Count Avg Size Min Size Max Size Total Size Total Time Name\n",
	" 92 89.043KB 4.0000KB 976.00KB 8.000000MB 913.9070us Host To Device\n",
	" 24 170.67KB 4.0000KB 0.9961MB 4.000000MB 361.1440us Device To Host\n",
	" 10 - - - - 2.564402ms Gpu page fault groups\n",
	"Total CPU Page faults: 36\n"
	]
	},
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": []
	},
	"metadata": {},
	"execution_count": 5
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"%%shell\n",
	"nvidia-smi"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "p8kirdLNVe-Z",
	"outputId": "cf5103af-f033-4166-8266-e4935f10917e"
	},
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Sun Jul 21 14:48:25 2024 \n",
	"+---------------------------------------------------------------------------------------+\n",
	"\| NVIDIA-SMI 535.104.05 Driver Version: 535.104.05 CUDA Version: 12.2 \|\n",
	"\|-----------------------------------------+----------------------+----------------------+\n",
	"\| GPU Name Persistence-M \| Bus-Id Disp.A \| Volatile Uncorr. ECC \|\n",
	"\| Fan Temp Perf Pwr:Usage/Cap \| Memory-Usage \| GPU-Util Compute M. \|\n",
	"\| \| \| MIG M. \|\n",
	"\|=========================================+======================+======================\|\n",
	"\| 0 Tesla T4 Off \| 00000000:00:04.0 Off \| 0 \|\n",
	"\| N/A 76C P0 40W / 70W \| 0MiB / 15360MiB \| 2% Default \|\n",
	"\| \| \| N/A \|\n",
	"+-----------------------------------------+----------------------+----------------------+\n",
	" \n",
	"+---------------------------------------------------------------------------------------+\n",
	"\| Processes: \|\n",
	"\| GPU GI CI PID Type Process name GPU Memory \|\n",
	"\| ID ID Usage \|\n",
	"\|=======================================================================================\|\n",
	"\| No running processes found \|\n",
	"+---------------------------------------------------------------------------------------+\n"
	]
	},
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": []
	},
	"metadata": {},
	"execution_count": 12
	}
	]
	}
	]
	}