Skip to content

Instantly share code, notes, and snippets.

@bwasti
Created October 10, 2021 00:15
Show Gist options
  • Save bwasti/29197f4931e99ff431a0433515b58123 to your computer and use it in GitHub Desktop.
Save bwasti/29197f4931e99ff431a0433515b58123 to your computer and use it in GitHub Desktop.
loop_tool_lowlevel_sweep.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "loop_tool_lowlevel_sweep.ipynb",
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/bwasti/29197f4931e99ff431a0433515b58123/loop_tool_lowlevel_sweep.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "DsWTNf6MT4vz",
"outputId": "9e6cd04e-f5e3-4775-f783-e414db7a9555"
},
"source": [
"!pip install loop_tool_py"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting loop_tool_py\n",
" Downloading loop_tool_py-0.0.8-3-cp37-cp37m-manylinux_2_24_x86_64.whl (378 kB)\n",
"\u001b[?25l\r\u001b[K |▉ | 10 kB 27.2 MB/s eta 0:00:01\r\u001b[K |█▊ | 20 kB 28.8 MB/s eta 0:00:01\r\u001b[K |██▋ | 30 kB 33.2 MB/s eta 0:00:01\r\u001b[K |███▌ | 40 kB 23.6 MB/s eta 0:00:01\r\u001b[K |████▎ | 51 kB 18.8 MB/s eta 0:00:01\r\u001b[K |█████▏ | 61 kB 18.7 MB/s eta 0:00:01\r\u001b[K |██████ | 71 kB 19.0 MB/s eta 0:00:01\r\u001b[K |███████ | 81 kB 20.0 MB/s eta 0:00:01\r\u001b[K |███████▉ | 92 kB 20.8 MB/s eta 0:00:01\r\u001b[K |████████▋ | 102 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████▌ | 112 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████▍ | 122 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████▎ | 133 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████ | 143 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████ | 153 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████▉ | 163 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████▊ | 174 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████████▋ | 184 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████▍ | 194 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████████▎ | 204 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████████▏ | 215 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████████████ | 225 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████████ | 235 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████████▊ | 245 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████████████▋ | 256 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████████████▌ | 266 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████████████████▍ | 276 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████████████▏ | 286 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████████████████ | 296 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████████████████ | 307 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████████████████▉ | 317 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████████████████████▊ | 327 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████████████████▌ | 337 kB 21.7 MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▍ | 348 kB 21.7 MB/s eta 0:00:01\r\u001b[K |██████████████████████████████▎ | 358 kB 21.7 MB/s eta 0:00:01\r\u001b[K |███████████████████████████████▏| 368 kB 21.7 MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 378 kB 21.7 MB/s \n",
"\u001b[?25hCollecting pybind11\n",
" Downloading pybind11-2.8.0-py2.py3-none-any.whl (207 kB)\n",
"\u001b[K |████████████████████████████████| 207 kB 42.2 MB/s \n",
"\u001b[?25hInstalling collected packages: pybind11, loop-tool-py\n",
"Successfully installed loop-tool-py-0.0.8 pybind11-2.8.0\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 248
},
"id": "zJe5zowCT3c8",
"outputId": "ff029bd9-9c1a-47f9-c4e7-e17d52e02b0e"
},
"source": [
"import loop_tool_py as lt\n",
"import numpy as np\n",
"import time\n",
"\n",
"assert \"cuda\" in lt.backends(), \"this needs to be run on a GPU enabled notebook\"\n",
"lt.set_default_hardware(\"cuda\")\n",
"\n",
"def gen_pw_add():\n",
" ir = lt.IR()\n",
" a = ir.create_var(\"a\")\n",
" r0 = ir.create_node(lt.read, [], [a])\n",
" r1 = ir.create_node(lt.read, [], [a])\n",
" add = ir.create_node(lt.add, [r0, r1], [a])\n",
" w = ir.create_node(lt.write, [add], [a])\n",
" ir.set_inputs([r0, r1])\n",
" ir.set_outputs([w])\n",
" return ir, a\n",
"\n",
"\n",
"def test_pw(size, inner_size, vec_size):\n",
" assert size >= (inner_size * vec_size)\n",
" ir, v = gen_pw_add() # v = pointwise var\n",
" size_map = {}\n",
" size_map[v] = size\n",
" for n in ir.nodes:\n",
" outer = size // (inner_size * vec_size)\n",
" outer_rem = size % (inner_size * vec_size)\n",
"\n",
" ir.set_order(\n",
" n, [(v, (outer, outer_rem)), (v, (inner_size, 0)), (v, (vec_size, 0))]\n",
" )\n",
" ir.disable_reuse(n, 2)\n",
" loop_tree = lt.LoopTree(ir)\n",
" A = lt.RawTensor(size)\n",
" B = lt.RawTensor(size)\n",
" C = lt.RawTensor(size)\n",
" Ap = np.random.randn(size)\n",
" Bp = np.random.randn(size)\n",
" A.set(Ap)\n",
" B.set(Bp)\n",
" C_ref = Ap + Bp\n",
" C.set(1337.0)\n",
" parallel = set(loop_tree.children(loop_tree.roots[0]))\n",
" c = lt.cuda(loop_tree, parallel)\n",
" c([A, B, C])\n",
" C_test = C.to_numpy()\n",
" max_diff = np.max(np.abs(C_test - C_ref))\n",
" mean_val = np.mean(np.abs(C_ref))\n",
" assert max_diff < 1e-3 * mean_val\n",
" iters = 10000\n",
" # warmup\n",
" for i in range(50):\n",
" c([A, B, C])\n",
" t = time.time()\n",
" for i in range(iters - 1):\n",
" c([A, B, C], False)\n",
" c([A, B, C])\n",
" t_ = time.time()\n",
" # 2 read 1 write, 4 bytes per float\n",
" bytes_moved = (2 + 1) * 4 * size * iters / (t_ - t) / 1e9\n",
" pct = bytes_moved / c.bandwidth\n",
" usec = (t_ - t) / iters * 1e6\n",
" # print(f\"peak: {c.bandwidth} GB/sec\")\n",
" print(\n",
" f\"{bytes_moved:.2f} GB/sec\",\n",
" f\"({100 * pct:.2f}% of peak, {usec:.2f} usec per iter)\",\n",
" )\n",
" return (\n",
" bytes_moved,\n",
" c.code,\n",
" loop_tree.dump(lambda x: \"// Threaded\" if x in parallel else \"\"),\n",
" )\n",
"\n",
"\n",
"s = 1024 * 1024\n",
"best = 0\n",
"code = \"\"\n",
"loop_tree = \"\"\n",
"inner_scale = 512 * 8\n",
"for i in range(1, s // inner_scale):\n",
" inner = i * inner_scale\n",
" for vec_pow in range(0, 3):\n",
" vec = 2 ** vec_pow\n",
" inner = inner // vec\n",
" b, c, l = test_pw(s, inner, vec)\n",
" if b > best:\n",
" best = b\n",
" code = c\n",
" loop_tree = l\n",
"print(f\"Best kernel found ({best:.2f} GB/sec):\")\n",
"print(loop_tree)\n",
"print(code)"
],
"execution_count": null,
"outputs": [
{
"output_type": "error",
"ename": "AssertionError",
"evalue": "ignored",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-2-6393985f1619>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0;34m\"cuda\"\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mlt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbackends\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"this needs to be run on a GPU enabled notebook\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mlt\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mset_default_hardware\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"cuda\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mAssertionError\u001b[0m: this needs to be run on a GPU enabled notebook"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "EnFO-MYYUZmA"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment