import timeit

import numpy as np
import torch

import tvm
from tvm import auto_scheduler

import mnm
from mnm.testing.utils import ir_fusion, ir_simplify, get_vm_executor, get_vm_profiler
from mnm.utils.tuner import run_tuning
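
# Benchmark a few mnm (Meta) operator workloads on CUDA, comparing the per-op and
# end-to-end latency of fused TVM-generated kernels ("fused_tvmjit") against
# unfused cuDNN kernels ("unfused_cudnn"). Kernel schedules are loaded from a
# log file ("sch_resnet_50.json" here), presumably produced beforehand with the
# auto-scheduler via run_tuning, which is why tvm, auto_scheduler, and run_tuning
# are imported even though this script does not call them directly.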


def randn(
    shape, *, ctx="cuda", dtype="float32", std=1.0, mean=0.0, requires_grad=False, positive=False
):
    """Generate a random tensor of the given shape and return it as a pair
    of (mnm array, torch tensor) on the target device."""
    if positive:
        x = np.abs(np.random.randn(*shape)) * std + mean
    else:
        x = np.random.randn(*shape) * std + mean
    if not isinstance(x, np.ndarray):
        x = np.array(x)
    assert list(x.shape) == list(shape)
    x = x.astype(dtype)
    m_x = mnm.array(x, ctx=ctx)
    if requires_grad:
        m_x.requires_grad = True
    t_x = torch.tensor(x, requires_grad=requires_grad, device=ctx)  # pylint: disable=not-callable
    return m_x, t_x


def mul_conv2d_dw_add(ctx):
    """Task 83: conv2d weight gradient followed by a momentum-style update
    (new_v = momentum * v + dw)."""

    class Model(mnm.Model):
        def build(self, shape, stride, padding, dilation, groups, momentum):
            self.shape = shape
            self.stride = stride
            self.padding = padding
            self.dilation = dilation
            self.groups = groups
            self.momentum = mnm.array(momentum, dtype="float32", ctx="cpu")

        @mnm.model.trace
        def forward(self, x, y, dy, v):
            dw = mnm.conv2d_dw(
                x,
                y,
                dy,
                shape=self.shape,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=self.groups,
            )
            new_v = mnm.add(mnm.multiply(self.momentum, v), dw)
            return new_v

    # Alternative (smaller) shapes:
    # x_shape = (32, 3, 32, 32)
    # y_shape = (32, 64, 32, 32)
    # dy_shape = (32, 64, 32, 32)
    # shape = (64, 32, 3, 3)
    x_shape = (32, 256, 32, 32)
    y_shape = (32, 128, 32, 32)
    dy_shape = (32, 128, 32, 32)
    shape = (128, 256, 1, 1)
    stride = 1
    padding = 0
    dilation = 1
    groups = 1
    m_model = Model(shape, stride, padding, dilation, groups, momentum=0.01)
    m_model.to(ctx=ctx)
    m_x, t_x = randn(x_shape, ctx=ctx, requires_grad=True)
    m_y, t_y = randn(y_shape, ctx=ctx, requires_grad=True)
    m_dy, t_dy = randn(dy_shape, ctx=ctx)
    m_v, t_v = randn(shape, ctx=ctx)
    return m_model, [m_x, m_y, m_dy, m_v]
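

# A rough PyTorch reference for the Task 83 workload above (a sketch added for
# clarity, not part of the original gist, and not called anywhere):
# torch.nn.grad.conv2d_weight is the closest analogue to mnm.conv2d_dw, and the
# momentum update is plain elementwise arithmetic on the torch tensors that
# randn() already returns.
def torch_mul_conv2d_dw_add_ref(
    t_x, t_dy, t_v, w_shape, stride=1, padding=0, dilation=1, groups=1, momentum=0.01
):
    # Weight gradient of conv2d w.r.t. the kernel of shape w_shape.
    dw = torch.nn.grad.conv2d_weight(
        t_x, w_shape, t_dy, stride=stride, padding=padding, dilation=dilation, groups=groups
    )
    # Momentum-style update: new_v = momentum * v + dw.
    return momentum * t_v + dw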


def conv2d_dx_relu_dx(ctx):
    class Model(mnm.Model):
        # fwd = x -> relu --(y_relu)-> conv -> y_conv
        # bwd = conv_dx -> relu_dx
        def build(self, shape, stride, padding, dilation, groups):
            self.shape = shape
            self.stride = stride
            self.padding = padding
            self.dilation = dilation
            self.groups = groups

        @mnm.model.trace
        def forward(self, x, y_relu, y_conv, w, dy_conv):
            dy_relu = mnm.conv2d_dx(
                w,
                y_conv,
                dy_conv,
                shape=self.shape,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=self.groups,
            )
            dx = mnm.relu_dx(x, y_relu, dy_relu)
            return dx

    w_shape = (256, 256, 3, 3)
    y_shape = (32, 256, 8, 8)
    dy_shape = (32, 256, 8, 8)
    shape = (32, 256, 16, 16)
    x_shape = shape
    stride = 2
    padding = 1
    dilation = 1
    groups = 1
    m_model = Model(shape, stride, padding, dilation, groups)
    m_model.to(ctx=ctx)
    x, _ = randn(x_shape, ctx=ctx)
    y_relu, _ = randn(x_shape, ctx=ctx)  # y_relu = mnm.relu(x)
    w, _ = randn(w_shape, ctx=ctx)
    y_conv, _ = randn(y_shape, ctx=ctx)  # y_conv = mnm.conv2d(y_relu, w)
    dy_conv, _ = randn(dy_shape, ctx=ctx)
    return m_model, [x, y_relu, y_conv, w, dy_conv]


def conv2d(ctx):
    """Task 7"""

    class Model(mnm.Model):
        def build(self, stride, padding, dilation, groups):
            self.stride = stride
            self.padding = padding
            self.dilation = dilation
            self.groups = groups

        @mnm.model.trace
        def forward(self, x, y):
            out = mnm.conv2d(
                x,
                y,
                stride=self.stride,
                padding=self.padding,
                dilation=self.dilation,
                groups=self.groups,
            )
            return out

    x_shape = (32, 256, 32, 32)
    y_shape = (128, 256, 1, 1)
    stride = 1
    padding = 0
    dilation = 1
    groups = 1
    m_model = Model(stride, padding, dilation, groups)
    m_model.to(ctx=ctx)
    m_x, _ = randn(x_shape, ctx=ctx, requires_grad=True)
    m_y, _ = randn(y_shape, ctx=ctx, requires_grad=True)
    return m_model, [m_x, m_y]


def batch_norm(ctx):
    """Task ?"""

    class Model(mnm.Model):
        def build(self):
            pass

        @mnm.model.trace
        def forward(self, m_x, m_m, m_v, m_w, m_b):
            out = mnm.batch_norm_infer(m_x, m_m, m_v, m_w, m_b, 0.1, 1e-5)
            return out

    shape = (32, 128, 32, 32)
    s_shape = (shape[1],)
    m_m, _ = randn(s_shape, ctx=ctx)
    m_v, _ = randn(s_shape, ctx=ctx, positive=True)
    m_x, _ = randn(shape, ctx=ctx)
    m_w, _ = randn(s_shape, ctx=ctx)
    m_b, _ = randn(s_shape, ctx=ctx)
    m_model = Model()
    m_model.to(ctx=ctx)
    return m_model, [m_x, m_m, m_v, m_w, m_b]


def batch_norm_train_dwxwb(ctx):
    """Task 66"""

    class Model(mnm.Model):
        def build(self):
            pass

        @mnm.model.trace
        def forward(self, m_y, m_x, m_w, m_b):
            out = mnm.batch_norm_train_dxwb(m_y, m_x, m_w, m_b, 1e-5)
            return out

    shape = (32, 128, 32, 32)
    s_shape = (shape[1],)
    m_y, _ = randn(shape, ctx=ctx)
    m_x, _ = randn(shape, ctx=ctx)
    m_w, _ = randn(s_shape, ctx=ctx)
    m_b, _ = randn(s_shape, ctx=ctx)
    m_model = Model()
    m_model.to(ctx=ctx)
    return m_model, [m_y, m_x, m_w, m_b]


def profile_wkl(wkl_func, ctx, fuse, log_file, number=100, warmup=100):
    """Profile a workload both per-op and end-to-end, with or without fusion."""
    name = wkl_func.__qualname__
    m_model, inputs = wkl_func(ctx)

    def ir_optimizer(func):
        func = ir_simplify(func)
        if fuse:
            func = ir_fusion(func)
        return func

    print("Profiling workload %s with %s" % (name, "fused_tvmjit" if fuse else "unfused_cudnn"))

    print("Individual latency")

    def m_profiler():
        profiler, vm_inputs = get_vm_profiler(m_model, ctx, inputs, ir_optimizer)
        vm = profiler.make_executor(sch_file=log_file)
        # Skip the first several executions to warm up.
        for _ in range(warmup):
            vm(*vm_inputs)
        profiler.reset()
        for _ in range(number):
            vm(*vm_inputs)
        print("\n{}".format(profiler.get_stat()))

    m_profiler()

    print("End-to-end latency")
    executor, vm_inputs = get_vm_executor(m_model, ctx, inputs, ir_optimizer, sch_file=log_file)

    def m_setup():
        # Skip the first several executions to warm up.
        for _ in range(warmup):
            executor(*vm_inputs)

    def m_stmt():
        executor(*vm_inputs)

    m_time = (
        timeit.Timer(
            stmt="m_stmt()", setup="m_setup();", globals={"m_stmt": m_stmt, "m_setup": m_setup}
        ).timeit(number)
        / number
        * 1e3
    )
    print("Latency: %.2f ms" % m_time)


if __name__ == "__main__":
    for fuse in [False]:
        profile_wkl(mul_conv2d_dw_add, "cuda", fuse, "sch_resnet_50.json")
        # profile_wkl(conv2d, "cuda", fuse, "sch_resnet_50.json")
        # profile_wkl(conv2d_dx_relu_dx, "cuda", fuse, "sch_resnet_50.json")
        # profile_wkl(batch_norm_train_dwxwb, "cuda", fuse, "sch_resnet_50.json")
        # profile_wkl(batch_norm, "cuda", fuse, "sch_resnet_50.json")
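
# Note: only the unfused (cuDNN) path is exercised above. Switching the loop to
# `for fuse in [False, True]` would also profile the fused TVM path, assuming
# "sch_resnet_50.json" contains tuned schedules for the fused kernels
# (e.g. produced beforehand with mnm.utils.tuner.run_tuning).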