genkuroki · May 29, 2021 12:09
diff --git a/Octavian M=K=N=1024.ipynb b/Octavian M=K=N=1024.ipynb
diff --git a/Octavian M=K=N=128.ipynb b/Octavian M=K=N=128.ipynb
diff --git a/Octavian M=K=N=32.ipynb b/Octavian M=K=N=32.ipynb
 {
  "cells": [
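    {
      "metadata": {},
      "cell_type": "markdown",
      "source": "Matrix-multiplication benchmarks (`LinearAlgebra.mul!`, MKL `dgemm` / `dgemm_direct`, and Octavian.jl `matmul!`) for the discussion at https://discourse.julialang.org/t/intel-c-c-compiler-performance-versus-julia/61929/18"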
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "versioninfo()",
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": "Julia Version 1.7.0-DEV.1129\nCommit 9117b4d6d6 (2021-05-20 16:42 UTC)\nPlatform Info:\n  OS: Windows (x86_64-w64-mingw32)\n  CPU: Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz\n  WORD_SIZE: 64\n  LIBM: libopenlibm\n  LLVM: libLLVM-11.0.1 (ORCJIT, skylake)\nEnvironment:\n  JULIA_NUM_THREADS = 12\n  JULIA_PYTHONCALL_EXE = C:\\Users\\genkuroki\\.julia\\conda\\3\\python.exe\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "using LinearAlgebra\nusing BLASBenchmarksCPU\nusing Octavian\nusing BenchmarkHistograms",
      "execution_count": 2,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "M = K = N = 32\nA = rand(M, K)\nB = rand(K, N)\nC1 = @time(A * B)\nC0 = similar(C1);",
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": "  0.539203 seconds (2.53 M allocations: 134.135 MiB, 7.92% gc time, 99.95% compilation time)\n",
          "name": "stdout"
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "@benchmark mul!($C0, $A, $B)",
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 4,
          "data": {
            "text/plain": "samples: 10000; evals/sample: 9; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (2290.0 - 2700.0 ]  \u001b[32m██████████████████████████████ \u001b[39m9341\n (2700.0 - 3100.0 ]  \u001b[32m█▏\u001b[39m350\n (3100.0 - 3510.0 ]  \u001b[32m▏\u001b[39m8\n (3510.0 - 3920.0 ]  \u001b[32m▏\u001b[39m19\n (3920.0 - 4330.0 ]  \u001b[32m▍\u001b[39m79\n (4330.0 - 4740.0 ]  \u001b[32m▎\u001b[39m51\n (4740.0 - 5140.0 ]  \u001b[32m▏\u001b[39m38\n (5140.0 - 5550.0 ]  \u001b[32m▏\u001b[39m16\n (5550.0 - 5960.0 ]  \u001b[32m▏\u001b[39m31\n (5960.0 - 6370.0 ]  \u001b[32m▏\u001b[39m20\n (6370.0 - 6780.0 ]  \u001b[32m▏\u001b[39m14\n (6780.0 - 7180.0 ]  \u001b[32m▏\u001b[39m15\n (7180.0 - 7590.0 ]  \u001b[32m▏\u001b[39m5\n (7590.0 - 8000.0 ]  \u001b[32m▏\u001b[39m3\n (8000.0 - 10600.0]  \u001b[32m▏\u001b[39m10\n\n                  Counts\n\nmin: 2.289 μs (0.00% GC); mean: 2.426 μs (0.00% GC); median: 2.311 μs (0.00% GC); max: 10.600 μs (0.00% GC)."
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "# MKL dgemm\n@benchmark gemmmkl!($C0, $A, $B)",
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 5,
          "data": {
            "text/plain": "samples: 10000; evals/sample: 10; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (1290.0 - 1600.0]  \u001b[32m██████████████████████████████ \u001b[39m9490\n (1600.0 - 1910.0]  \u001b[32m█▎\u001b[39m356\n (1910.0 - 2220.0]  \u001b[32m▏\u001b[39m4\n (2220.0 - 2530.0]  \u001b[32m▏\u001b[39m2\n (2530.0 - 2840.0]  \u001b[32m▏\u001b[39m17\n (2840.0 - 3150.0]  \u001b[32m▎\u001b[39m40\n (3150.0 - 3470.0]  \u001b[32m▏\u001b[39m11\n (3470.0 - 3780.0]  \u001b[32m▏\u001b[39m16\n (3780.0 - 4090.0]  \u001b[32m▏\u001b[39m6\n (4090.0 - 4400.0]  \u001b[32m▏\u001b[39m10\n (4400.0 - 4710.0]  \u001b[32m▏\u001b[39m14\n (4710.0 - 5020.0]  \u001b[32m▏\u001b[39m8\n (5020.0 - 5330.0]  \u001b[32m▏\u001b[39m9\n (5330.0 - 5640.0]  \u001b[32m▏\u001b[39m7\n (5640.0 - 6950.0]  \u001b[32m▏\u001b[39m10\n\n                  Counts\n\nmin: 1.290 μs (0.00% GC); mean: 1.370 μs (0.00% GC); median: 1.300 μs (0.00% GC); max: 6.950 μs (0.00% GC)."
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "# MKL dgemm_direct\n@benchmark gemmmkl_direct!($C0, $A, $B)",
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 6,
          "data": {
            "text/plain": "samples: 10000; evals/sample: 10; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (1200.0 - 1520.0]  \u001b[32m██████████████████████████████\u001b[39m9321\n (1520.0 - 1830.0]  \u001b[32m█▌\u001b[39m430\n (1830.0 - 2150.0]  \u001b[32m▏\u001b[39m19\n (2150.0 - 2470.0]  \u001b[32m▏\u001b[39m25\n (2470.0 - 2780.0]  \u001b[32m▏\u001b[39m34\n (2780.0 - 3100.0]  \u001b[32m▎\u001b[39m47\n (3100.0 - 3420.0]  \u001b[32m▏\u001b[39m29\n (3420.0 - 3730.0]  \u001b[32m▏\u001b[39m31\n (3730.0 - 4050.0]  \u001b[32m▏\u001b[39m14\n (4050.0 - 4360.0]  \u001b[32m▏\u001b[39m7\n (4360.0 - 4680.0]  \u001b[32m▏\u001b[39m7\n (4680.0 - 5000.0]  \u001b[32m▏\u001b[39m8\n (5000.0 - 5310.0]  \u001b[32m▏\u001b[39m10\n (5310.0 - 5630.0]  \u001b[32m▏\u001b[39m8\n (5630.0 - 7420.0]  \u001b[32m▏\u001b[39m10\n\n                  Counts\n\nmin: 1.200 μs (0.00% GC); mean: 1.302 μs (0.00% GC); median: 1.220 μs (0.00% GC); max: 7.420 μs (0.00% GC)."
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "# Octavian.jl\n@benchmark matmul!($C0, $A, $B)",
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 7,
          "data": {
            "text/plain": "samples: 10000; evals/sample: 142; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (700.0  - 790.0 ]  \u001b[32m██████████████████████████████ \u001b[39m8213\n (790.0  - 870.0 ]  \u001b[32m██▌\u001b[39m661\n (870.0  - 960.0 ]  \u001b[32m█\u001b[39m249\n (960.0  - 1040.0]  \u001b[32m█▏\u001b[39m283\n (1040.0 - 1130.0]  \u001b[32m▋\u001b[39m152\n (1130.0 - 1210.0]  \u001b[32m▋\u001b[39m137\n (1210.0 - 1300.0]  \u001b[32m▍\u001b[39m82\n (1300.0 - 1380.0]  \u001b[32m▍\u001b[39m78\n (1380.0 - 1470.0]  \u001b[32m▎\u001b[39m48\n (1470.0 - 1550.0]  \u001b[32m▏\u001b[39m31\n (1550.0 - 1640.0]  \u001b[32m▏\u001b[39m10\n (1640.0 - 1720.0]  \u001b[32m▏\u001b[39m13\n (1720.0 - 1800.0]  \u001b[32m▏\u001b[39m20\n (1800.0 - 1890.0]  \u001b[32m▏\u001b[39m13\n (1890.0 - 2380.0]  \u001b[32m▏\u001b[39m10\n\n                  Counts\n\nmin: 704.225 ns (0.00% GC); mean: 781.582 ns (0.00% GC); median: 727.465 ns (0.00% GC); max: 2.385 μs (0.00% GC)."
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "using StaticArrays\nAm = MMatrix{M, K}(A)\nBm = MMatrix{K, N}(B)\nCm = similar(Am);",
      "execution_count": 8,
      "outputs": []
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "@benchmark mul!($Cm, $Am, $Bm)",
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 9,
          "data": {
            "text/plain": "samples: 10000; evals/sample: 9; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (2270.0 - 2700.0 ]  \u001b[32m██████████████████████████████▏\u001b[39m7222\n (2700.0 - 3130.0 ]  \u001b[32m███▌\u001b[39m826\n (3130.0 - 3570.0 ]  \u001b[32m█▍\u001b[39m329\n (3570.0 - 4000.0 ]  \u001b[32m███▉\u001b[39m913\n (4000.0 - 4440.0 ]  \u001b[32m█▋\u001b[39m362\n (4440.0 - 4870.0 ]  \u001b[32m▋\u001b[39m123\n (4870.0 - 5310.0 ]  \u001b[32m▎\u001b[39m52\n (5310.0 - 5740.0 ]  \u001b[32m▎\u001b[39m32\n (5740.0 - 6170.0 ]  \u001b[32m▏\u001b[39m27\n (6170.0 - 6610.0 ]  \u001b[32m▎\u001b[39m31\n (6610.0 - 7040.0 ]  \u001b[32m▏\u001b[39m23\n (7040.0 - 7480.0 ]  \u001b[32m▏\u001b[39m11\n (7480.0 - 7910.0 ]  \u001b[32m▏\u001b[39m25\n (7910.0 - 8340.0 ]  \u001b[32m▏\u001b[39m14\n (8340.0 - 33570.0]  \u001b[32m▏\u001b[39m10\n\n                  Counts\n\nmin: 2.267 μs (0.00% GC); mean: 2.850 μs (0.00% GC); median: 2.511 μs (0.00% GC); max: 33.567 μs (0.00% GC)."
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "# Octavian.jl with StaticArrays.MMatrix arguments, which are statically sized\n@benchmark matmul!($Cm, $Am, $Bm)",
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 10,
          "data": {
            "text/plain": "samples: 10000; evals/sample: 157; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (670.0  - 760.0 ]  \u001b[32m██████████████████████████████ \u001b[39m7928\n (760.0  - 850.0 ]  \u001b[32m█▍\u001b[39m360\n (850.0  - 940.0 ]  \u001b[32m█▏\u001b[39m277\n (940.0  - 1030.0]  \u001b[32m█▊\u001b[39m441\n (1030.0 - 1130.0]  \u001b[32m█\u001b[39m257\n (1130.0 - 1220.0]  \u001b[32m█▋\u001b[39m417\n (1220.0 - 1310.0]  \u001b[32m▌\u001b[39m119\n (1310.0 - 1400.0]  \u001b[32m▍\u001b[39m71\n (1400.0 - 1490.0]  \u001b[32m▎\u001b[39m55\n (1490.0 - 1580.0]  \u001b[32m▏\u001b[39m23\n (1580.0 - 1680.0]  \u001b[32m▏\u001b[39m17\n (1680.0 - 1770.0]  \u001b[32m▏\u001b[39m14\n (1770.0 - 1860.0]  \u001b[32m▏\u001b[39m7\n (1860.0 - 1950.0]  \u001b[32m▏\u001b[39m4\n (1950.0 - 2730.0]  \u001b[32m▏\u001b[39m10\n\n                  Counts\n\nmin: 668.790 ns (0.00% GC); mean: 785.391 ns (0.00% GC); median: 714.013 ns (0.00% GC); max: 2.734 μs (0.00% GC)."
          },
          "metadata": {}
        }
      ]
    },
    {
      "metadata": {
        "trusted": true
      },
      "cell_type": "code",
      "source": "",
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "@webio": {
      "lastKernelId": null,
      "lastCommId": null
    },
    "kernelspec": {
      "name": "julia-1.7-depwarn-o3",
      "display_name": "Julia 1.7.0-DEV depwarn -O3",
      "language": "julia"
    },
    "language_info": {
      "file_extension": ".jl",
      "name": "julia",
      "mimetype": "application/julia",
      "version": "1.7.0"
    },
    "toc": {
      "nav_menu": {},
      "number_sections": true,
      "sideBar": true,
      "skip_h1_title": false,
      "base_numbering": 1,
      "title_cell": "Table of Contents",
      "title_sidebar": "Contents",
      "toc_cell": false,
      "toc_position": {},
      "toc_section_display": true,
      "toc_window_display": false
    },
    "gist": {
      "id": "6123aef79488bc20b52047656fc6f015",
      "data": {
        "description": "Octavian",
        "public": true
      }
    },
    "_draft": {
      "nbviewer_url": "https://gist.github.com/6123aef79488bc20b52047656fc6f015"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 5
 }
diff --git a/Octavian M=K=N=512.ipynb b/Octavian M=K=N=512.ipynb
diff --git a/Octavian M=K=N=8.ipynb b/Octavian M=K=N=8.ipynb
	{
	"cells": [
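	{
	"metadata": {},
	"cell_type": "markdown",
	"source": "Matrix-multiplication benchmarks (`LinearAlgebra.mul!`, MKL `dgemm` / `dgemm_direct`, and Octavian.jl `matmul!`) for the discussion at https://discourse.julialang.org/t/intel-c-c-compiler-performance-versus-julia/61929/18"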
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "versioninfo()",
	"execution_count": 1,
	"outputs": [
	{
	"output_type": "stream",
	"text": "Julia Version 1.7.0-DEV.1129\nCommit 9117b4d6d6 (2021-05-20 16:42 UTC)\nPlatform Info:\n OS: Windows (x86_64-w64-mingw32)\n CPU: Intel(R) Core(TM) i7-10750H CPU @ 2.60GHz\n WORD_SIZE: 64\n LIBM: libopenlibm\n LLVM: libLLVM-11.0.1 (ORCJIT, skylake)\nEnvironment:\n JULIA_NUM_THREADS = 12\n JULIA_PYTHONCALL_EXE = C:\\Users\\genkuroki\\.julia\\conda\\3\\python.exe\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "using LinearAlgebra\nusing BLASBenchmarksCPU\nusing Octavian\nusing BenchmarkHistograms",
	"execution_count": 2,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "M = K = N = 32\nA = rand(M, K)\nB = rand(K, N)\nC1 = @time(A * B)\nC0 = similar(C1);",
	"execution_count": 3,
	"outputs": [
	{
	"output_type": "stream",
	"text": " 0.539203 seconds (2.53 M allocations: 134.135 MiB, 7.92% gc time, 99.95% compilation time)\n",
	"name": "stdout"
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "@benchmark mul!($C0, $A, $B)",
	"execution_count": 4,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 4,
	"data": {
	"text/plain": "samples: 10000; evals/sample: 9; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (2290.0 - 2700.0 ] \u001b[32m██████████████████████████████ \u001b[39m9341\n (2700.0 - 3100.0 ] \u001b[32m█▏\u001b[39m350\n (3100.0 - 3510.0 ] \u001b[32m▏\u001b[39m8\n (3510.0 - 3920.0 ] \u001b[32m▏\u001b[39m19\n (3920.0 - 4330.0 ] \u001b[32m▍\u001b[39m79\n (4330.0 - 4740.0 ] \u001b[32m▎\u001b[39m51\n (4740.0 - 5140.0 ] \u001b[32m▏\u001b[39m38\n (5140.0 - 5550.0 ] \u001b[32m▏\u001b[39m16\n (5550.0 - 5960.0 ] \u001b[32m▏\u001b[39m31\n (5960.0 - 6370.0 ] \u001b[32m▏\u001b[39m20\n (6370.0 - 6780.0 ] \u001b[32m▏\u001b[39m14\n (6780.0 - 7180.0 ] \u001b[32m▏\u001b[39m15\n (7180.0 - 7590.0 ] \u001b[32m▏\u001b[39m5\n (7590.0 - 8000.0 ] \u001b[32m▏\u001b[39m3\n (8000.0 - 10600.0] \u001b[32m▏\u001b[39m10\n\n Counts\n\nmin: 2.289 μs (0.00% GC); mean: 2.426 μs (0.00% GC); median: 2.311 μs (0.00% GC); max: 10.600 μs (0.00% GC)."
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "# MKL dgemm\n@benchmark gemmmkl!($C0, $A, $B)",
	"execution_count": 5,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 5,
	"data": {
	"text/plain": "samples: 10000; evals/sample: 10; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (1290.0 - 1600.0] \u001b[32m██████████████████████████████ \u001b[39m9490\n (1600.0 - 1910.0] \u001b[32m█▎\u001b[39m356\n (1910.0 - 2220.0] \u001b[32m▏\u001b[39m4\n (2220.0 - 2530.0] \u001b[32m▏\u001b[39m2\n (2530.0 - 2840.0] \u001b[32m▏\u001b[39m17\n (2840.0 - 3150.0] \u001b[32m▎\u001b[39m40\n (3150.0 - 3470.0] \u001b[32m▏\u001b[39m11\n (3470.0 - 3780.0] \u001b[32m▏\u001b[39m16\n (3780.0 - 4090.0] \u001b[32m▏\u001b[39m6\n (4090.0 - 4400.0] \u001b[32m▏\u001b[39m10\n (4400.0 - 4710.0] \u001b[32m▏\u001b[39m14\n (4710.0 - 5020.0] \u001b[32m▏\u001b[39m8\n (5020.0 - 5330.0] \u001b[32m▏\u001b[39m9\n (5330.0 - 5640.0] \u001b[32m▏\u001b[39m7\n (5640.0 - 6950.0] \u001b[32m▏\u001b[39m10\n\n Counts\n\nmin: 1.290 μs (0.00% GC); mean: 1.370 μs (0.00% GC); median: 1.300 μs (0.00% GC); max: 6.950 μs (0.00% GC)."
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "# MKL dgemm_direct\n@benchmark gemmmkl_direct!($C0, $A, $B)",
	"execution_count": 6,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 6,
	"data": {
	"text/plain": "samples: 10000; evals/sample: 10; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (1200.0 - 1520.0] \u001b[32m██████████████████████████████\u001b[39m9321\n (1520.0 - 1830.0] \u001b[32m█▌\u001b[39m430\n (1830.0 - 2150.0] \u001b[32m▏\u001b[39m19\n (2150.0 - 2470.0] \u001b[32m▏\u001b[39m25\n (2470.0 - 2780.0] \u001b[32m▏\u001b[39m34\n (2780.0 - 3100.0] \u001b[32m▎\u001b[39m47\n (3100.0 - 3420.0] \u001b[32m▏\u001b[39m29\n (3420.0 - 3730.0] \u001b[32m▏\u001b[39m31\n (3730.0 - 4050.0] \u001b[32m▏\u001b[39m14\n (4050.0 - 4360.0] \u001b[32m▏\u001b[39m7\n (4360.0 - 4680.0] \u001b[32m▏\u001b[39m7\n (4680.0 - 5000.0] \u001b[32m▏\u001b[39m8\n (5000.0 - 5310.0] \u001b[32m▏\u001b[39m10\n (5310.0 - 5630.0] \u001b[32m▏\u001b[39m8\n (5630.0 - 7420.0] \u001b[32m▏\u001b[39m10\n\n Counts\n\nmin: 1.200 μs (0.00% GC); mean: 1.302 μs (0.00% GC); median: 1.220 μs (0.00% GC); max: 7.420 μs (0.00% GC)."
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "# Octavian.jl\n@benchmark matmul!($C0, $A, $B)",
	"execution_count": 7,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 7,
	"data": {
	"text/plain": "samples: 10000; evals/sample: 142; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (700.0 - 790.0 ] \u001b[32m██████████████████████████████ \u001b[39m8213\n (790.0 - 870.0 ] \u001b[32m██▌\u001b[39m661\n (870.0 - 960.0 ] \u001b[32m█\u001b[39m249\n (960.0 - 1040.0] \u001b[32m█▏\u001b[39m283\n (1040.0 - 1130.0] \u001b[32m▋\u001b[39m152\n (1130.0 - 1210.0] \u001b[32m▋\u001b[39m137\n (1210.0 - 1300.0] \u001b[32m▍\u001b[39m82\n (1300.0 - 1380.0] \u001b[32m▍\u001b[39m78\n (1380.0 - 1470.0] \u001b[32m▎\u001b[39m48\n (1470.0 - 1550.0] \u001b[32m▏\u001b[39m31\n (1550.0 - 1640.0] \u001b[32m▏\u001b[39m10\n (1640.0 - 1720.0] \u001b[32m▏\u001b[39m13\n (1720.0 - 1800.0] \u001b[32m▏\u001b[39m20\n (1800.0 - 1890.0] \u001b[32m▏\u001b[39m13\n (1890.0 - 2380.0] \u001b[32m▏\u001b[39m10\n\n Counts\n\nmin: 704.225 ns (0.00% GC); mean: 781.582 ns (0.00% GC); median: 727.465 ns (0.00% GC); max: 2.385 μs (0.00% GC)."
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "using StaticArrays\nAm = MMatrix{M, K}(A)\nBm = MMatrix{K, N}(B)\nCm = similar(Am);",
	"execution_count": 8,
	"outputs": []
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "@benchmark mul!($Cm, $Am, $Bm)",
	"execution_count": 9,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 9,
	"data": {
	"text/plain": "samples: 10000; evals/sample: 9; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (2270.0 - 2700.0 ] \u001b[32m██████████████████████████████▏\u001b[39m7222\n (2700.0 - 3130.0 ] \u001b[32m███▌\u001b[39m826\n (3130.0 - 3570.0 ] \u001b[32m█▍\u001b[39m329\n (3570.0 - 4000.0 ] \u001b[32m███▉\u001b[39m913\n (4000.0 - 4440.0 ] \u001b[32m█▋\u001b[39m362\n (4440.0 - 4870.0 ] \u001b[32m▋\u001b[39m123\n (4870.0 - 5310.0 ] \u001b[32m▎\u001b[39m52\n (5310.0 - 5740.0 ] \u001b[32m▎\u001b[39m32\n (5740.0 - 6170.0 ] \u001b[32m▏\u001b[39m27\n (6170.0 - 6610.0 ] \u001b[32m▎\u001b[39m31\n (6610.0 - 7040.0 ] \u001b[32m▏\u001b[39m23\n (7040.0 - 7480.0 ] \u001b[32m▏\u001b[39m11\n (7480.0 - 7910.0 ] \u001b[32m▏\u001b[39m25\n (7910.0 - 8340.0 ] \u001b[32m▏\u001b[39m14\n (8340.0 - 33570.0] \u001b[32m▏\u001b[39m10\n\n Counts\n\nmin: 2.267 μs (0.00% GC); mean: 2.850 μs (0.00% GC); median: 2.511 μs (0.00% GC); max: 33.567 μs (0.00% GC)."
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "# Octavian.jl with StaticArrays.MMatrix arguments, which are statically sized\n@benchmark matmul!($Cm, $Am, $Bm)",
	"execution_count": 10,
	"outputs": [
	{
	"output_type": "execute_result",
	"execution_count": 10,
	"data": {
	"text/plain": "samples: 10000; evals/sample: 157; memory estimate: 0 bytes; allocs estimate: 0\nns\n\n (670.0 - 760.0 ] \u001b[32m██████████████████████████████ \u001b[39m7928\n (760.0 - 850.0 ] \u001b[32m█▍\u001b[39m360\n (850.0 - 940.0 ] \u001b[32m█▏\u001b[39m277\n (940.0 - 1030.0] \u001b[32m█▊\u001b[39m441\n (1030.0 - 1130.0] \u001b[32m█\u001b[39m257\n (1130.0 - 1220.0] \u001b[32m█▋\u001b[39m417\n (1220.0 - 1310.0] \u001b[32m▌\u001b[39m119\n (1310.0 - 1400.0] \u001b[32m▍\u001b[39m71\n (1400.0 - 1490.0] \u001b[32m▎\u001b[39m55\n (1490.0 - 1580.0] \u001b[32m▏\u001b[39m23\n (1580.0 - 1680.0] \u001b[32m▏\u001b[39m17\n (1680.0 - 1770.0] \u001b[32m▏\u001b[39m14\n (1770.0 - 1860.0] \u001b[32m▏\u001b[39m7\n (1860.0 - 1950.0] \u001b[32m▏\u001b[39m4\n (1950.0 - 2730.0] \u001b[32m▏\u001b[39m10\n\n Counts\n\nmin: 668.790 ns (0.00% GC); mean: 785.391 ns (0.00% GC); median: 714.013 ns (0.00% GC); max: 2.734 μs (0.00% GC)."
	},
	"metadata": {}
	}
	]
	},
	{
	"metadata": {
	"trusted": true
	},
	"cell_type": "code",
	"source": "",
	"execution_count": null,
	"outputs": []
	}
	],
	"metadata": {
	"@webio": {
	"lastKernelId": null,
	"lastCommId": null
	},
	"kernelspec": {
	"name": "julia-1.7-depwarn-o3",
	"display_name": "Julia 1.7.0-DEV depwarn -O3",
	"language": "julia"
	},
	"language_info": {
	"file_extension": ".jl",
	"name": "julia",
	"mimetype": "application/julia",
	"version": "1.7.0"
	},
	"toc": {
	"nav_menu": {},
	"number_sections": true,
	"sideBar": true,
	"skip_h1_title": false,
	"base_numbering": 1,
	"title_cell": "Table of Contents",
	"title_sidebar": "Contents",
	"toc_cell": false,
	"toc_position": {},
	"toc_section_display": true,
	"toc_window_display": false
	},
	"gist": {
	"id": "6123aef79488bc20b52047656fc6f015",
	"data": {
	"description": "Octavian",
	"public": true
	}
	},
	"_draft": {
	"nbviewer_url": "https://gist.github.com/6123aef79488bc20b52047656fc6f015"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}