Jupyter Notebook for running CUDA kernels on Google Colab
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "# CUDA on Colab\n",
        "In this notebook we'll set up Colab so that we can run CUDA kernels online, for free. For ease of use, our kernel source is loaded from Google Drive (from a file named hello.cu at the root of our drive). The code can easily be adapted to load the kernel from another location, such as Colab's local storage; a sketch of that option appears just before the compile cell below.\n",
        "\n",
        "Before running this notebook, make sure to pick a runtime that comes with a GPU. The notebook has been tested with the free T4 GPU, although other GPUs should work too. However, you might need to adjust the `arch` argument of your nvcc invocation depending on the GPU you're using; the cells right below sketch one way to derive it automatically.\n"
      ],
      "metadata": {
        "id": "YD8R98yPedYA"
      }
    },
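    {
      "cell_type": "markdown",
      "source": [
        "If you're unsure which `arch` value your runtime needs, the following cell is a minimal sketch that derives the nvcc flag from the attached GPU's compute capability. It assumes the driver is recent enough (roughly 470+) to support nvidia-smi's `compute_cap` query field; on the T4 it prints `-arch=sm_75`.\n"
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: derive the nvcc -arch flag from the GPU's compute capability.\n",
        "# Assumes nvidia-smi supports the compute_cap query field (driver ~470+).\n",
        "import subprocess\n",
        "cap = subprocess.check_output(\n",
        "    [\"nvidia-smi\", \"--query-gpu=compute_cap\", \"--format=csv,noheader\"],\n",
        "    text=True).strip().splitlines()[0]  # e.g. \"7.5\" on a T4\n",
        "ARCH_FLAG = \"-arch=sm_\" + cap.replace(\".\", \"\")  # -> \"-arch=sm_75\"\n",
        "print(ARCH_FLAG)"
      ]
    },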
    {
      "cell_type": "code",
      "execution_count": 26,
      "metadata": {
        "id": "q3gnhLIuVAFR"
      },
      "outputs": [],
      "source": [
        "%%capture\n",
        "\n",
        "# Update packages\n",
        "!sudo apt update\n",
        "!sudo apt upgrade\n",
        "\n",
        "!sudo apt install ubuntu-drivers-common\n",
        "# !sudo ubuntu-drivers install --gpgpu"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Make sure nvcc and our Nvidia drivers have been installed properly. The CUDA version should ideally match between nvcc\n",
        "# and our driver, otherwise we might have trouble launching our kernels\n",
        "!nvcc --version\n",
        "!nvidia-smi"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gecr6wUuVSpR",
        "outputId": "977f5d42-ad18-4d4d-bebd-1248d2baf6e4"
      },
      "execution_count": 27,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "nvcc: NVIDIA (R) Cuda compiler driver\n",
            "Copyright (c) 2005-2024 NVIDIA Corporation\n",
            "Built on Thu_Jun__6_02:18:23_PDT_2024\n",
            "Cuda compilation tools, release 12.5, V12.5.82\n",
            "Build cuda_12.5.r12.5/compiler.34385749_0\n",
            "Mon Jul 21 19:51:28 2025 \n",
            "+-----------------------------------------------------------------------------------------+\n",
            "| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |\n",
            "|-----------------------------------------+------------------------+----------------------+\n",
            "| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
            "| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |\n",
            "| | | MIG M. |\n",
            "|=========================================+========================+======================|\n",
            "| 0 Tesla T4 Off | 00000000:00:04.0 Off | 0 |\n",
            "| N/A 36C P8 11W / 70W | 0MiB / 15360MiB | 0% Default |\n",
            "| | | N/A |\n",
            "+-----------------------------------------+------------------------+----------------------+\n",
            " \n",
            "+-----------------------------------------------------------------------------------------+\n",
            "| Processes: |\n",
            "| GPU GI CI PID Type Process name GPU Memory |\n",
            "| ID ID Usage |\n",
            "|=========================================================================================|\n",
            "| No running processes found |\n",
            "+-----------------------------------------------------------------------------------------+\n"
          ]
        }
      ]
    },
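    {
      "cell_type": "markdown",
      "source": [
        "Note that in the output above nvcc reports CUDA 12.5 while the driver reports 12.4; this is exactly the mismatch that forces the explicit `-arch` flag in the compile cell further down. The cell below is a small sketch (assuming the usual output formats of both tools) that extracts the two versions and warns when they differ.\n"
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: compare the toolkit (nvcc) CUDA version against the driver's.\n",
        "import re, subprocess\n",
        "nvcc_out = subprocess.check_output([\"nvcc\", \"--version\"], text=True)\n",
        "smi_out = subprocess.check_output([\"nvidia-smi\"], text=True)\n",
        "nvcc_ver = re.search(r\"release (\\d+\\.\\d+)\", nvcc_out).group(1)\n",
        "drv_ver = re.search(r\"CUDA Version: (\\d+\\.\\d+)\", smi_out).group(1)\n",
        "if nvcc_ver != drv_ver:\n",
        "    print(f\"Warning: nvcc targets CUDA {nvcc_ver} but the driver supports {drv_ver};\",\n",
        "          \"pass an explicit -arch so no PTX JIT is needed.\")"
      ]
    },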
    {
      "cell_type": "code",
      "source": [
        "# Mount our Google Drive to /content/drive\n",
        "from google.colab import drive\n",
        "drive.mount('/content/drive')"
      ],
      "metadata": {
        "id": "B72zcurYW7iS",
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "48f51a50-1f19-45ca-d872-10cd44ace6c1"
      },
      "execution_count": 28,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"
          ]
        }
      ]
    },
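    {
      "cell_type": "markdown",
      "source": [
        "As a sketch of the \"other location\" option mentioned in the introduction: instead of Drive, a kernel can be written straight into Colab's local storage with the `%%writefile` magic and then compiled with the same nvcc invocation as below, just swapping in the local path (here the hypothetical /content/hello_local.cu).\n"
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "%%writefile /content/hello_local.cu\n",
        "// Minimal illustrative kernel for the local-storage variant.\n",
        "#include <cstdio>\n",
        "\n",
        "__global__ void hello() {\n",
        "    printf(\"Hello from local storage!\\n\");  // device-side printf\n",
        "}\n",
        "\n",
        "int main() {\n",
        "    hello<<<1, 1>>>();\n",
        "    cudaDeviceSynchronize();  // wait for the kernel (and its printf) to finish\n",
        "    return 0;\n",
        "}"
      ]
    },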
    {
      "cell_type": "code",
      "source": [
        "# Compile our CUDA kernel & run it. If using the T4 GPU, -arch=sm_75 is necessary: the 12.4 driver cannot JIT the newer\n",
        "# PTX that nvcc 12.5 generates, so we embed native sm_75 machine code instead.\n",
        "!nvcc \"/content/drive/MyDrive/hello.cu\" -o hello.out -arch=sm_75 && ./hello.out"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5FrPoyE5YDIT",
        "outputId": "a9d94b1e-11eb-40de-904e-d72d10d752db"
      },
      "execution_count": 29,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Hello CUDA from CPU\n",
            "Hello CUDA from GPU!\n",
            "Exiting kernel\n"
          ]
        }
      ]
    }
  ]
}
hello.cu
// The notebook expects to load this CUDA kernel from the root of your Google Drive.
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>

__constant__ char d_message[64];

// Each thread copies one byte of the message from constant memory into global memory.
__global__ void welcome(char* msg) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    msg[idx] = d_message[idx];
}

// Print the most recent CUDA error, if any.
void printErrors(const char* label) {
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::fprintf(stderr, "%s: %s\n", label, cudaGetErrorString(err));
    }
}

int main() {
    std::printf("Hello CUDA from CPU\n");

    char* d_msg;
    char* h_msg;
    const char message[] = "Hello CUDA from GPU!";
    const int length = std::strlen(message) + 1; // include the null terminator

    // Allocate host and device memory
    h_msg = (char*)std::malloc(length * sizeof(char));
    cudaMalloc(&d_msg, length * sizeof(char));

    // Copy message to constant memory
    cudaMemcpyToSymbol(d_message, message, length);
    printErrors("Copy to constant memory failed");

    // Run the CUDA kernel: one thread per byte of the message
    welcome<<<1, length>>>(d_msg);
    printErrors("Kernel launch failed");

    // Copy the result back to the host (this also waits for the kernel to finish)
    cudaMemcpy(h_msg, d_msg, length * sizeof(char), cudaMemcpyDeviceToHost);
    printErrors("Device->Host memcpy failed");
    h_msg[length - 1] = '\0'; // defensive: guarantee termination before printing

    std::printf("%s\n", h_msg);
    std::printf("Exiting kernel\n");

    // Cleanup
    std::free(h_msg);
    cudaFree(d_msg);
    return 0;
}