opt_gemm.diff
diff --git a/tutorials/optimize/opt_gemm.py b/tutorials/optimize/opt_gemm.py
index 44ee53a7..c9785cbf 100644
--- a/tutorials/optimize/opt_gemm.py
+++ b/tutorials/optimize/opt_gemm.py
@@ -44,24 +44,24 @@ import timeit
# The size of the matrix
# (M, K) x (K, N)
# You are free to try out different shapes, sometimes TVM optimization outperforms numpy with MKL.
-M = 1024
-K = 1024
-N = 1024
-
+M = 768
+K = 768
+N = 768
+FLOPS = 2 * M * N * K
# The default tensor type in tvm
dtype = "float32"
# using Intel AVX2(Advanced Vector Extensions) ISA for SIMD
# To get the best performance, please change the following line
# to llvm -mcpu=core-avx2, or specific type of CPU you use
-target = 'llvm'
+target = 'llvm -mcpu=core-avx2'
ctx = tvm.context(target, 0)
-
+print("asdf")
# Random generated tensor for testing
a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), ctx)
b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), ctx)
-np_repeat = 100
+REPEAT = 10
np_runing_time = timeit.timeit(setup='import numpy\n'
'M = ' + str(M) + '\n'
'K = ' + str(K) + '\n'
@@ -70,8 +70,9 @@ np_runing_time = timeit.timeit(setup='import numpy\n'
'a = numpy.random.rand(M, K).astype(dtype)\n'
'b = numpy.random.rand(K, N).astype(dtype)\n',
stmt='answer = numpy.dot(a, b)',
- number=np_repeat)
-print("Numpy running time: %f" % (np_runing_time / np_repeat))
+ number=REPEAT)
+
+print("Numpy running time: %f" % (FLOPS * REPEAT / np_runing_time / 1.0E9)) | |
answer = numpy.dot(a.asnumpy(), b.asnumpy())
@@ -94,7 +95,7 @@ func(a, b, c)
numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
-print('Baseline: %f' % evaluator(a, b, c).mean)
+print('Baseline flops: %f' % (FLOPS / evaluator(a, b, c).mean / 1E9))
################################################################################################
# In TVM, we can always inspect lower level IR to debug or optimize our schedule.
@@ -130,8 +131,8 @@ numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
# By simply tiling the loop 32x32, and hoisting ko, ki outside the blocking loops,
# we can see big speedup compared with the baseline.
-evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
-print('Opt1: %f' % evaluator(a, b, c).mean)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=REPEAT)
+print('Opt1: %f' % (FLOPS / evaluator(a, b, c).mean / 1E9))
################################################################################################
# Here is the generated IR after blocking.
@@ -164,8 +165,8 @@ c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
func(a, b, c)
numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
-evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
-print('Opt2: %f' % evaluator(a, b, c).mean)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=REPEAT)
+print('Opt2: %f' % (FLOPS / evaluator(a, b, c).mean / 1E9))
################################################################################################
# Here is the generated IR after vectorization.
@@ -197,8 +198,8 @@ c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
func(a, b, c)
numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
-evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
-print('Opt3: %f' % evaluator(a, b, c).mean)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=REPEAT)
+print('Opt3: %f' % (FLOPS / evaluator(a, b, c).mean / 1E9))
################################################################################################
# Here is the generated IR after loop permutation.
@@ -252,8 +253,8 @@ c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
func(a, b, c)
numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
-evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
-print('Opt4: %f' % evaluator(a, b, c).mean)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=REPEAT)
+print('Opt4: %f' % (FLOPS / evaluator(a, b, c).mean / 1E9))
################################################################################################
# Here is the generated IR after array packing.
@@ -298,8 +299,8 @@ c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
func(a, b, c)
numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
-evaluator = func.time_evaluator(func.entry_name, ctx, number=10)
-print('Opt5: %f' % evaluator(a, b, c).mean)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=REPEAT)
+print('Opt5: %f' % (FLOPS / evaluator(a, b, c).mean / 1E9))
################################################################################################
# Here is the generated IR after blocking.
@@ -341,9 +342,8 @@ c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
func(a, b, c)
numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
-evaluator = func.time_evaluator(func.entry_name, ctx, number=50)
-opt6_time = evaluator(a, b, c).mean
-print('Opt6: %f' % opt6_time)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=REPEAT)
+print('Opt6: %f' % (FLOPS / evaluator(a, b, c).mean / 1E9))
################################################################################################
# Here is the generated IR after parallelization.
@@ -360,3 +360,118 @@ print(tvm.lower(s, [A, B, C], simple_mode=True))
# Note that the outputs on the web page reflect the running times on a non-exclusive
# Docker container, thereby they are *unreliable*. It is highly encouraged to run the
# tutorial by yourself to observe the performance gain acheived by TVM.
+
+from tvm.contrib import cblas
+
+C = cblas.matmul(A, B)
+s = tvm.create_schedule(C.op)
+func = tvm.build(s, [A, B, C], target=target, name="blasgemm")
+assert func
+
+c = tvm.nd.array(numpy.zeros((M, N), dtype = dtype), ctx)
+func(a, b, c)
+numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+
+evaluator = func.time_evaluator(func.entry_name, ctx, number=REPEAT)
+print('OptBLAS: %f' % (FLOPS / evaluator(a, b, c).mean / 1E9))
+
+
+
+BITCODE_PATHS = [
+ "gemmMxN__avx2.bc"
+]
+
+@tvm.register_func("tvm_callback_llvm_bitcode_path")
+def bitcode_paths():
+ global BITCODE_PATHS
+ return BITCODE_PATHS
+
+# Tensorized
+def intrin_gemm(M, N, K):
+ assert M == 4
+ assert N == 24
+ dtype = 'float32'
+ A = tvm.placeholder((K, M), dtype=dtype, name='A')
+ B = tvm.placeholder((K, N), dtype=dtype, name='B')
+ k = tvm.reduce_axis((0, K), name='k')
+ C = tvm.compute((M, N), lambda m, n:
+ tvm.sum(A[k, m] * B[k, n], axis=[k]), name='C')
+
+ Ab = tvm.decl_buffer(A.shape, A.dtype,
+ name="A",
+ offset_factor=4,
+ strides=[M, 1])
+ Bb = tvm.decl_buffer(B.shape, B.dtype,
+ name="B",
+ offset_factor=24,
+ strides=[N, 1])
+ Cb = tvm.decl_buffer(C.shape, C.dtype,
+ name="C",
+ offset_factor=1,
+ strides=[tvm.var('ldc'), 1])
+
+ def intrin_func(ins, outs):
+ aa, bb = ins
+ cc = outs[0]
+ irb = tvm.ir_builder.create()
+ extern_call = tvm.call_extern(
+ "int32",
+ "sgemm_only_4x24__avx2",
+ K,
+ irb.buffer_ptr(aa),
+ aa.elem_offset,
+ irb.buffer_ptr(bb),
+ bb.elem_offset,
+ irb.buffer_ptr(cc),
+ cc.elem_offset,
+ cc.strides[0])
+ irb.emit(extern_call)
+ return irb.get()
+
+ with tvm.build_config():
+ return tvm.decl_tensor_intrin(C.op,
+ intrin_func,
+ binds={A: Ab, B: Bb, C: Cb})
+
+MTile = 4
+NTile = 24
+
+assert M % MTile == 0
+assert N % NTile == 0
+
+APanel = tvm.compute(
+ (M / MTile, K, MTile), lambda mtile, k, m: A[m + mtile * MTile, k], name='APanel')
+BPanel = tvm.compute(
+ (N / NTile, K, NTile), lambda ntile, k, n: B[k, n + ntile * NTile], name='BPanel')
+print("APanel, ", APanel.shape)
+print("BPanel, ", BPanel.shape)
+k = tvm.reduce_axis((0, K), name='k')
+C = tvm.compute(
+ (M, N),
+ lambda m, n: tvm.sum(
+ APanel[m / MTile, k, m % MTile] * BPanel[n / NTile, k, n % NTile],
+ axis=[k]),
+ name='C')
+print("C", C.shape, M, N)
+s = tvm.create_schedule(C.op)
+x, y, z = BPanel.op.axis
+# s[BPanel].vectorize(z)
+x, y, z = APanel.op.axis
+s[APanel].unroll(z)
+xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, NTile)
+s[C].reorder(xo, yo, xi, yi)
+xii, xiii = s[C].split(xi, factor=MTile)
+gemm_intrinsic_function = intrin_gemm(M=MTile, N=NTile, K=K)
+s[C].tensorize(xiii, gemm_intrinsic_function)
+
+print(tvm.lower(s, [A, B, C], simple_mode=True))
+func = tvm.build(s, [A, B, C], target=target)
+assert func
+
+c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), ctx)
+func(a, b, c)
+print("C shape", c.shape)
+numpy.testing.assert_allclose(c.asnumpy(), answer, rtol=1e-5)
+
+evaluator = func.time_evaluator(func.entry_name, ctx, number=REPEAT)
+print('OptTensorize: %f' % (FLOPS / evaluator(a, b, c).mean / 1E9))
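
The tensorized version above packs A and B into APanel and BPanel so that the 4x24 micro-kernel walks contiguous panels along the reduction axis. The following numpy-only sketch mirrors those packing rules for illustration; it assumes the same 768x768x768 shapes and random inputs, and it is not TVM code from the patch.

import numpy as np

# Numpy illustration of the APanel / BPanel layouts defined in the patch:
#   APanel[mtile, k, m] = A[m + mtile * MTile, k]
#   BPanel[ntile, k, n] = B[k, n + ntile * NTile]
M = K = N = 768
MTile, NTile = 4, 24
A = np.random.rand(M, K).astype("float32")
B = np.random.rand(K, N).astype("float32")

APanel = A.reshape(M // MTile, MTile, K).transpose(0, 2, 1)  # (M/MTile, K, MTile)
BPanel = B.reshape(K, N // NTile, NTile).transpose(1, 0, 2)  # (N/NTile, K, NTile)

# Spot-check a couple of elements against the packing rules above.
assert APanel[3, 10, 2] == A[2 + 3 * MTile, 10]
assert BPanel[5, 7, 11] == B[7, 11 + 5 * NTile]

Keeping the 4 rows (or 24 columns) of each tile adjacent for every k step gives the sgemm_only_4x24__avx2 kernel unit-stride loads, which is the usual motivation for this style of GEMM packing.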
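For reference, every performance print in the patch converts the measured mean runtime into GFLOPS using the standard 2 * M * N * K flop count for a dense GEMM. Below is a small standalone sketch of that conversion in plain Python; the gflops helper name and the 5 ms example runtime are illustrative, not part of the patch.

# Standalone sketch of the GFLOPS conversion used throughout the patch above.
M = K = N = 768
FLOPS = 2 * M * N * K  # one multiply and one add per inner-product term

def gflops(mean_seconds):
    """Convert a mean per-call runtime in seconds into achieved GFLOPS."""
    return FLOPS / mean_seconds / 1.0e9

# For example, a 5 ms run of this 768x768x768 GEMM is roughly 181 GFLOPS.
print(gflops(5e-3))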