import timeit

import numpy as np
import torch
import tvm
from tvm import auto_scheduler

import mnm
from mnm.testing.utils import ir_fusion, ir_simplify, get_vm_executor, get_vm_profiler
from mnm.utils.tuner import run_tuning
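
# NOTE (comment added for clarity; assumptions are flagged as such): this script
# appears to micro-benchmark individual ResNet-50 training subgraphs with mnm (Meta).
# Each workload function below returns an (mnm.Model, inputs) pair, and profile_wkl
# runs it on the mnm VM to report per-kernel and end-to-end latency.
# "sch_resnet_50.json" is presumably a TVM auto-scheduler tuning log produced by an
# earlier run of run_tuning, which is imported but never called in this file.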


def randn(
    shape, *, ctx="cuda", dtype="float32", std=1.0, mean=0.0, requires_grad=False, positive=False
):
    """Generate a random tensor of the given shape as both an mnm.array and a torch.Tensor."""
    if positive:
        x = np.abs(np.random.randn(*shape)) * std + mean
    else:
        x = np.random.randn(*shape) * std + mean
    if not isinstance(x, np.ndarray):
        x = np.array(x)
    assert list(x.shape) == list(shape)
    x = x.astype(dtype)
    m_x = mnm.array(x, ctx=ctx)
    if requires_grad:
        m_x.requires_grad = True
    t_x = torch.tensor(x, requires_grad=requires_grad, device=ctx)  # pylint: disable=not-callable
    return m_x, t_x
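

# Workload definitions: each builds an mnm.Model for one small subgraph and returns
# (model, inputs). The "Task N" docstrings presumably refer to task IDs in the
# auto-scheduler tuning log referenced above.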


def mul_conv2d_dw_add(ctx):
    """Task 83: conv2d weight gradient fused with a momentum-style update (new_v = momentum * v + dw)."""

    class Model(mnm.Model):
        def build(self, shape, stride, padding, dilation, groups, momentum):
            self.shape = shape
            self.stride = stride
            self.padding = padding
            self.dilation = dilation
            self.groups = groups
            self.momentum = mnm.array(momentum, dtype="float32", ctx="cpu")

        @mnm.model.trace
        def forward(self, x, y, dy, v):
            dw = mnm.conv2d_dw(
                x,
                y,
                dy,
                shape=self.shape,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=self.groups,
            )
            new_v = mnm.add(mnm.multiply(self.momentum, v), dw)
            return new_v

    # x_shape = (32, 3, 32, 32)
    # y_shape = (32, 64, 32, 32)
    # dy_shape = (32, 64, 32, 32)
    # shape = (64, 32, 3, 3)
    x_shape = (32, 256, 32, 32)
    y_shape = (32, 128, 32, 32)
    dy_shape = (32, 128, 32, 32)
    shape = (128, 256, 1, 1)
    stride = 1
    padding = 0
    dilation = 1
    groups = 1
    m_model = Model(shape, stride, padding, dilation, groups, momentum=0.01)
    m_model.to(ctx=ctx)
    m_x, t_x = randn(x_shape, ctx=ctx, requires_grad=True)
    m_y, t_y = randn(y_shape, ctx=ctx, requires_grad=True)
    m_dy, t_dy = randn(dy_shape, ctx=ctx)
    m_v, t_v = randn(shape, ctx=ctx)
    return m_model, [m_x, m_y, m_dy, m_v]


def conv2d_dx_relu_dx(ctx):
    class Model(mnm.Model):
        # fwd = x -> relu --(y_relu)-> conv -> y_conv
        # bwd = conv_dx -> relu_dx
        def build(self, shape, stride, padding, dilation, groups):
            self.shape = shape
            self.stride = stride
            self.padding = padding
            self.dilation = dilation
            self.groups = groups

        @mnm.model.trace
        def forward(self, x, y_relu, y_conv, w, dy_conv):
            dy_relu = mnm.conv2d_dx(
                w,
                y_conv,
                dy_conv,
                shape=self.shape,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=self.groups,
            )
            dx = mnm.relu_dx(x, y_relu, dy_relu)
            return dx

    w_shape = (256, 256, 3, 3)
    y_shape = (32, 256, 8, 8)
    dy_shape = (32, 256, 8, 8)
    shape = (32, 256, 16, 16)
    x_shape = shape
    stride = 2
    padding = 1
    dilation = 1
    groups = 1
    m_model = Model(shape, stride, padding, dilation, groups)
    m_model.to(ctx=ctx)
    x, _ = randn(x_shape, ctx=ctx)
    y_relu, _ = randn(x_shape, ctx=ctx)  # y_relu = mnm.relu(x)
    w, _ = randn(w_shape, ctx=ctx)
    y_conv, _ = randn(y_shape, ctx=ctx)  # y_conv = mnm.conv2d(y_relu, w)
    dy_conv, _ = randn(dy_shape, ctx=ctx)
    return m_model, [x, y_relu, y_conv, w, dy_conv]


def conv2d(ctx):
    """Task 7"""

    class Model(mnm.Model):
        def build(self, stride, padding, dilation, groups):
            self.stride = stride
            self.padding = padding
            self.dilation = dilation
            self.groups = groups

        @mnm.model.trace
        def forward(self, x, y):
            out = mnm.conv2d(
                x,
                y,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=self.groups,
            )
            return out

    x_shape = (32, 256, 32, 32)
    y_shape = (128, 256, 1, 1)
    stride = 1
    padding = 0
    dilation = 1
    groups = 1
    m_model = Model(stride, padding, dilation, groups)
    m_model.to(ctx=ctx)
    m_x, _ = randn(x_shape, ctx=ctx, requires_grad=True)
    m_y, _ = randn(y_shape, ctx=ctx, requires_grad=True)
    return m_model, [m_x, m_y]


def batch_norm(ctx):
    """Task ?"""

    class Model(mnm.Model):
        def build(self):
            pass

        @mnm.model.trace
        def forward(self, m_x, m_m, m_v, m_w, m_b):
            out = mnm.batch_norm_infer(m_x, m_m, m_v, m_w, m_b, 0.1, 1e-5)
            return out

    shape = (32, 128, 32, 32)
    s_shape = (shape[1],)
    m_m, _ = randn(s_shape, ctx=ctx)
    m_v, _ = randn(s_shape, ctx=ctx, positive=True)
    m_x, _ = randn(shape, ctx=ctx)
    m_w, _ = randn(s_shape, ctx=ctx)
    m_b, _ = randn(s_shape, ctx=ctx)
    m_model = Model()
    m_model.to(ctx=ctx)
    return m_model, [m_x, m_m, m_v, m_w, m_b]


def batch_norm_train_dwxwb(ctx):
    """Task 66"""

    class Model(mnm.Model):
        def build(self):
            pass

        @mnm.model.trace
        def forward(self, m_y, m_x, m_w, m_b):
            out = mnm.batch_norm_train_dxwb(m_y, m_x, m_w, m_b, 1e-5)
            return out

    shape = (32, 128, 32, 32)
    s_shape = (shape[1],)
    m_y, _ = randn(shape, ctx=ctx)
    m_x, _ = randn(shape, ctx=ctx)
    m_w, _ = randn(s_shape, ctx=ctx)
    m_b, _ = randn(s_shape, ctx=ctx)
    m_model = Model()
    m_model.to(ctx=ctx)
    return m_model, [m_y, m_x, m_w, m_b]
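

# Profiling driver: optionally applies simplify + fusion to the workload IR, then
# measures per-kernel latency with the VM profiler and end-to-end latency with timeit.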


def profile_wkl(wkl_func, ctx, fuse, log_file, number=100, warmup=100):
    name = wkl_func.__qualname__
    m_model, inputs = wkl_func(ctx)

    def ir_optimizer(func):
        func = ir_simplify(func)
        if fuse:
            func = ir_fusion(func)
        return func

    print("Profiling workload %s with %s" % (name, "fused_tvmjit" if fuse else "unfused_cudnn"))

    print("Individual latency")

    def m_profiler():
        profiler, vm_inputs = get_vm_profiler(m_model, ctx, inputs, ir_optimizer)
        vm = profiler.make_executor(sch_file=log_file)
        # skip first several executions
        for _ in range(warmup):
            vm(*vm_inputs)
        profiler.reset()
        for _ in range(number):
            vm(*vm_inputs)
        print("\n{}".format(profiler.get_stat()))

    m_profiler()

    print("End-to-end latency")
    executor, vm_inputs = get_vm_executor(m_model, ctx, inputs, ir_optimizer, sch_file=log_file)

    def m_setup():
        # skip first several executions
        for _ in range(warmup):
            executor(*vm_inputs)

    def m_stmt():
        executor(*vm_inputs)

    m_time = (
        timeit.Timer(
            stmt="m_stmt()", setup="m_setup();", globals={"m_stmt": m_stmt, "m_setup": m_setup}
        ).timeit(number)
        / number
        * 1e3
    )
    print("Latency: %.2f ms" % m_time)


if __name__ == "__main__":
    for fuse in [False]:
        profile_wkl(mul_conv2d_dw_add, "cuda", fuse, "sch_resnet_50.json")
        # profile_wkl(conv2d, "cuda", fuse, "sch_resnet_50.json")
        # profile_wkl(conv2d_dx_relu_dx, "cuda", fuse, "sch_resnet_50.json")
        # profile_wkl(batch_norm_train_dwxwb, "cuda", fuse, "sch_resnet_50.json")
        # profile_wkl(batch_norm, "cuda", fuse, "sch_resnet_50.json")
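
# Usage note (an assumption, not part of the original run): to compare the unfused
# cuDNN path against fused TVM kernels from the same schedule log, iterate over
# [False, True] in the loop above; as written, only the unfused path is measured.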