apowers313 · July 25, 2024 00:46
diff --git a/cupy_profiling.ipynb b/cupy_profiling.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wed Jul 24 17:41:09 PDT 2024\n",
      "=========\n",
      "Linux 39f11d97b672 5.15.0-113-generic #123-Ubuntu SMP Mon Jun 10 08:16:17 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux\n",
      "=========\n",
      "nvcc: NVIDIA (R) Cuda compiler driver\n",
      "Copyright (c) 2005-2022 NVIDIA Corporation\n",
      "Built on Mon_Oct_24_19:12:58_PDT_2022\n",
      "Cuda compilation tools, release 12.0, V12.0.76\n",
      "Build cuda_12.0.r12.0/compiler.31968024_0\n",
      "=========\n",
      "Wed Jul 24 17:41:10 2024       \n",
      "+---------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |\n",
      "|-----------------------------------------+----------------------+----------------------+\n",
      "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                      |               MIG M. |\n",
      "|=========================================+======================+======================|\n",
      "|   0  Tesla T4                       Off | 00000000:3B:00.0 Off |                    0 |\n",
      "| N/A   40C    P8               9W /  70W |      2MiB / 15360MiB |      0%      Default |\n",
      "|                                         |                      |                  N/A |\n",
      "+-----------------------------------------+----------------------+----------------------+\n",
      "                                                                                         \n",
      "+---------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                            |\n",
      "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
      "|        ID   ID                                                             Usage      |\n",
      "|=======================================================================================|\n",
      "|  No running processes found                                                           |\n",
      "+---------------------------------------------------------------------------------------+\n",
      "=========\n",
      "CuPy version 13.2.0\n",
      "=========\n",
      "NumPy version 1.26.4\n"
     ]
    }
   ],
   "source": [
    "# Print the date, kernel, CUDA compiler, GPU status and library versions for this run.\n",
    "!date\n",
    "print(\"=========\")\n",
    "!uname -a\n",
    "print(\"=========\")\n",
    "import os\n",
    "\n",
    "CUDA_BIN = \"/usr/local/cuda/bin\"\n",
    "# Extend PATH before calling nvcc below, but only once: an unconditional `+=` would\n",
    "# append the same directory again every time this cell is re-run.\n",
    "if CUDA_BIN not in os.environ[\"PATH\"].split(os.pathsep):\n",
    "    os.environ[\"PATH\"] += os.pathsep + CUDA_BIN\n",
    "!nvcc --version\n",
    "print(\"=========\")\n",
    "!nvidia-smi\n",
    "print(\"=========\")\n",
    "import cupy as cp\n",
    "print(\"CuPy version\", cp.__version__)\n",
    "print(\"=========\")\n",
    "import numpy as np\n",
    "print(\"NumPy version\", np.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "f                   :    CPU:    79.255 us   +/-  0.927 (min:    78.492 / max:    85.846) us     GPU-0:    84.850 us   +/-  1.108 (min:    82.592 / max:    92.064) us\n"
     ]
    }
   ],
   "source": [
    "# Source: https://docs.cupy.dev/en/stable/reference/generated/cupyx.profiler.benchmark.html\n",
    "import cupy as cp\n",
    "from cupyx.profiler import benchmark\n",
    "\n",
    "\n",
    "def f(a, b):\n",
    "    \"\"\"Return 3 * sin(-a) * b, evaluated with CuPy.\"\"\"\n",
    "    scaled_sine = 3 * cp.sin(-a)\n",
    "    return scaled_sine * b\n",
    "\n",
    "\n",
    "a = 0.5 - cp.random.random((100,))\n",
    "b = cp.random.random((100,))\n",
    "f_timing = benchmark(f, (a, b), n_repeat=1000)\n",
    "print(f_timing)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "my_func             :    CPU:    97.239 us   +/-  9.314 (min:    94.165 / max:   187.152) us     GPU-0:   263.434 us   +/- 23.482 (min:   255.840 / max:   376.352) us\n"
     ]
    }
   ],
   "source": [
    "# Source: https://docs.cupy.dev/en/stable/user_guide/performance.html\n",
    "import cupy as cp\n",
    "from cupyx.profiler import benchmark\n",
    "\n",
    "\n",
    "def my_func(a):\n",
    "    \"\"\"Return the square root of the sum of squares of `a` along its last axis.\"\"\"\n",
    "    sum_of_squares = cp.sum(a**2, axis=-1)\n",
    "    return cp.sqrt(sum_of_squares)\n",
    "\n",
    "\n",
    "a = cp.random.random((256, 1024))\n",
    "my_func_timing = benchmark(my_func, (a,), n_repeat=10000)\n",
    "print(my_func_timing)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Profiling"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import cupy as cp\n",
    "from cupyx.profiler import benchmark\n",
    "\n",
    "# Host-side (NumPy) operands shared by the benchmark cells below.\n",
    "# Reference: https://github.com/cupy/cupy-benchmark/blob/master/benchmarks/numpy/bench_linalg.py\n",
    "a = np.arange(60_000_000.0).reshape((1500, 40000))\n",
    "b = np.arange(240_000_000.0).reshape((40000, 6000))\n",
    "# Integer vectors whose lengths match the columns and rows of b respectively.\n",
    "c = np.arange(b.shape[1])\n",
    "d = np.arange(b.shape[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## NumPy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "33.8 ms ± 2.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "def np_rand_max(a, b):\n",
    "    \"\"\"Evaluate dot(d, dot(b, c)) with NumPy and discard the result.\n",
    "\n",
    "    `a` is accepted but not used; `c` and `d` are read from the notebook namespace.\n",
    "    \"\"\"\n",
    "    inner = np.dot(b, c)\n",
    "    np.dot(d, inner)\n",
    "\n",
    "\n",
    "%timeit np_rand_max(a, b)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CuPy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# cp.dot(x, y) simply calls x.dot(y). With NumPy operands that is numpy.ndarray.dot,\n",
    "# so the product would run on the CPU and this cell would time NumPy a second time.\n",
    "# Copy the operands to the GPU once, outside the timed function.\n",
    "b_gpu = cp.asarray(b)\n",
    "c_gpu = cp.asarray(c)\n",
    "d_gpu = cp.asarray(d)\n",
    "\n",
    "\n",
    "def _wait_for_gpu():\n",
    "    \"\"\"Record an event on the current stream and block until it completes.\"\"\"\n",
    "    # https://github.com/cupy/cupy-benchmark/tree/master?tab=readme-ov-file#numpy-benchmark\n",
    "    # the following was taken from the @sync decorator used in CuPy's emulation\n",
    "    # of NumPy benchmarks\n",
    "    # https://github.com/cupy/cupy-benchmark/blob/master/benchmarks/utils/helper.py\n",
    "    event = cp.cuda.stream.Event()\n",
    "    event.record()\n",
    "    event.synchronize()\n",
    "\n",
    "\n",
    "def cp_rand_max(a, b):\n",
    "    \"\"\"Evaluate dot(d, dot(b, c)) on the GPU, synchronizing before and after.\n",
    "\n",
    "    `a` is accepted but not used. `b` must already be a CuPy array; `c_gpu` and\n",
    "    `d_gpu` are read from the notebook namespace.\n",
    "    \"\"\"\n",
    "    _wait_for_gpu()\n",
    "    cp.dot(d_gpu, cp.dot(b, c_gpu))\n",
    "    _wait_for_gpu()\n",
    "\n",
    "\n",
    "%timeit cp_rand_max(a, b_gpu)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Info"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Wed Jul 24 17:41:09 PDT 2024\n",
	"=========\n",
	"Linux 39f11d97b672 5.15.0-113-generic #123-Ubuntu SMP Mon Jun 10 08:16:17 UTC 2024 x86_64 x86_64 x86_64 GNU/Linux\n",
	"=========\n",
	"nvcc: NVIDIA (R) Cuda compiler driver\n",
	"Copyright (c) 2005-2022 NVIDIA Corporation\n",
	"Built on Mon_Oct_24_19:12:58_PDT_2022\n",
	"Cuda compilation tools, release 12.0, V12.0.76\n",
	"Build cuda_12.0.r12.0/compiler.31968024_0\n",
	"=========\n",
	"Wed Jul 24 17:41:10 2024       \n",
	"+---------------------------------------------------------------------------------------+\n",
	"| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |\n",
	"|-----------------------------------------+----------------------+----------------------+\n",
	"| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
	"| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
	"|                                         |                      |               MIG M. |\n",
	"|=========================================+======================+======================|\n",
	"|   0  Tesla T4                       Off | 00000000:3B:00.0 Off |                    0 |\n",
	"| N/A   40C    P8               9W /  70W |      2MiB / 15360MiB |      0%      Default |\n",
	"|                                         |                      |                  N/A |\n",
	"+-----------------------------------------+----------------------+----------------------+\n",
	"                                                                                         \n",
	"+---------------------------------------------------------------------------------------+\n",
	"| Processes:                                                                            |\n",
	"|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
	"|        ID   ID                                                             Usage      |\n",
	"|=======================================================================================|\n",
	"|  No running processes found                                                           |\n",
	"+---------------------------------------------------------------------------------------+\n",
	"=========\n",
	"CuPy version 13.2.0\n",
	"=========\n",
	"NumPy version 1.26.4\n"
	]
	}
	],
	"source": [
	"!date\n",
	"print(\"=========\")\n",
	"!uname -a\n",
	"print(\"=========\")\n",
	"import os\n",
	"os.environ[\"PATH\"] += os.pathsep + \"/usr/local/cuda/bin\"\n",
	"!nvcc --version\n",
	"print(\"=========\")\n",
	"!nvidia-smi\n",
	"print(\"=========\")\n",
	"import cupy as cp\n",
	"print(\"CuPy version\", cp.__version__)\n",
	"print(\"=========\")\n",
	"import numpy as np\n",
	"print(\"NumPy version\", np.__version__)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Examples"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"f : CPU: 79.255 us +/- 0.927 (min: 78.492 / max: 85.846) us GPU-0: 84.850 us +/- 1.108 (min: 82.592 / max: 92.064) us\n"
	]
	}
	],
	"source": [
	"# https://docs.cupy.dev/en/stable/reference/generated/cupyx.profiler.benchmark.html\n",
	"import cupy as cp\n",
	"from cupyx.profiler import benchmark\n",
	"\n",
	"def f(a, b):\n",
	" return 3 * cp.sin(-a) * b\n",
	"\n",
	"a = 0.5 - cp.random.random((100,))\n",
	"b = cp.random.random((100,))\n",
	"print(benchmark(f, (a, b), n_repeat=1000))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"my_func : CPU: 97.239 us +/- 9.314 (min: 94.165 / max: 187.152) us GPU-0: 263.434 us +/- 23.482 (min: 255.840 / max: 376.352) us\n"
	]
	}
	],
	"source": [
	"# https://docs.cupy.dev/en/stable/user_guide/performance.html\n",
	"import cupy as cp\n",
	"from cupyx.profiler import benchmark\n",
	"\n",
	"def my_func(a):\n",
	" return cp.sqrt(cp.sum(a**2, axis=-1))\n",
	"\n",
	"a = cp.random.random((256, 1024))\n",
	"print(benchmark(my_func, (a,), n_repeat=10000)) "
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Profiling"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Setup"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import cupy as cp\n",
	"from cupyx.profiler import benchmark\n",
	"\n",
	"# https://github.com/cupy/cupy-benchmark/blob/master/benchmarks/numpy/bench_linalg.py\n",
	"a = np.arange(60000000.0).reshape(1500, 40000)\n",
	"b = np.arange(240000000.0).reshape(40000, 6000)\n",
	"c = np.arange(6000)\n",
	"d = np.arange(40000)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## NumPy"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"33.8 ms ± 2.03 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
	]
	}
	],
	"source": [
	"def np_rand_max(a, b):\n",
	" np.dot(d, np.dot(b, c))\n",
	"\n",
	"%timeit np_rand_max(a, b)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## CuPy"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# cp.dot(x, y) simply calls x.dot(y). With NumPy operands that is numpy.ndarray.dot,\n",
	"# so the product would run on the CPU and this cell would time NumPy a second time.\n",
	"# Copy the operands to the GPU once, outside the timed function.\n",
	"b_gpu = cp.asarray(b)\n",
	"c_gpu = cp.asarray(c)\n",
	"d_gpu = cp.asarray(d)\n",
	"\n",
	"\n",
	"def _wait_for_gpu():\n",
	"    \"\"\"Record an event on the current stream and block until it completes.\"\"\"\n",
	"    # https://github.com/cupy/cupy-benchmark/tree/master?tab=readme-ov-file#numpy-benchmark\n",
	"    # the following was taken from the @sync decorator used in CuPy's emulation\n",
	"    # of NumPy benchmarks\n",
	"    # https://github.com/cupy/cupy-benchmark/blob/master/benchmarks/utils/helper.py\n",
	"    event = cp.cuda.stream.Event()\n",
	"    event.record()\n",
	"    event.synchronize()\n",
	"\n",
	"\n",
	"def cp_rand_max(a, b):\n",
	"    \"\"\"Evaluate dot(d, dot(b, c)) on the GPU, synchronizing before and after.\n",
	"\n",
	"    `a` is accepted but not used. `b` must already be a CuPy array; `c_gpu` and\n",
	"    `d_gpu` are read from the notebook namespace.\n",
	"    \"\"\"\n",
	"    _wait_for_gpu()\n",
	"    cp.dot(d_gpu, cp.dot(b, c_gpu))\n",
	"    _wait_for_gpu()\n",
	"\n",
	"\n",
	"%timeit cp_rand_max(a, b_gpu)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": ".venv",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.11.9"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}