Skip to content

Instantly share code, notes, and snippets.

@comaniac
Last active April 8, 2021 01:32
Show Gist options
  • Save comaniac/4d15553d73486a298634ab24f4fd1500 to your computer and use it in GitHub Desktop.
Save comaniac/4d15553d73486a298634ab24f4fd1500 to your computer and use it in GitHub Desktop.
# Pytorch reference
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#train-the-network
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import tvm
import mnm
from mnm._core.executor import VMExecutor
from mnm.model.trace import _get_func_inputs
from mnm.testing import randn_torch, one_hot_torch, randint, asnumpy
from mnm.testing.utils import ir_fusion
import benchmark
# Configuration
run_pytorch = False  # True: run the PyTorch reference branch; False: run the Meta branch.
n_epoch = 2  # Change to a larger number for better accuracy.
batch_size = 32
image_size = (32, 32)  # CIFAR-10: 32x32, ImageNet: 224x224

# Per-image preprocessing: convert to a tensor, then normalize each of the
# three channels with mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

# Training split (shuffled) and its loader.
trainset = torchvision.datasets.CIFAR10(
    transform=transform,
    download=True,
    train=True,
    root="./cifar10",
)
trainloader = torch.utils.data.DataLoader(
    trainset,
    num_workers=4,
    shuffle=True,
    batch_size=batch_size,
)

# Test split (fixed order) and its loader.
testset = torchvision.datasets.CIFAR10(
    transform=transform,
    download=True,
    train=False,
    root="./cifar10",
)
testloader = torch.utils.data.DataLoader(
    testset,
    num_workers=4,
    shuffle=False,
    batch_size=batch_size,
)
if run_pytorch:
    # PyTorch reference path: train the "resnet50" benchmark model on the
    # training loader, then measure its accuracy on the test loader.
    bencher = benchmark.get_model_bencher(
        "resnet50", batch_size=batch_size, shape=image_size, include_orig_model=True
    )
    model = bencher.ref_bencher.model
    model.to(device="cuda")
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.01)

    # Report the mean loss once every `print_period` mini-batches.
    print_period = 500

    for epoch in range(n_epoch):
        running_loss = []
        for idx, (inputs, labels) in enumerate(trainloader, 0):
            inputs = inputs.to(device="cuda")
            labels = labels.to(device="cuda")

            optimizer.zero_grad()
            t_y = model(inputs)
            # The model may return a tuple; the first element is used as the output.
            if isinstance(t_y, tuple):
                t_y = t_y[0]
            t_ypred = torch.log_softmax(t_y, dim=-1)
            t_loss = torch.nn.functional.nll_loss(t_ypred, labels)
            # Defensive only: nll_loss is documented to return a tensor. If a
            # tuple ever shows up, pick the element that supports backward().
            if isinstance(t_loss, tuple):
                if hasattr(t_loss[0], "backward"):
                    t_loss = t_loss[0]
                else:
                    assert hasattr(t_loss[1], "backward")
                    t_loss = t_loss[1]
            t_loss.backward()
            optimizer.step()
            torch.cuda.synchronize()

            running_loss.append(t_loss.item())
            if idx % print_period == print_period - 1:
                print("[%d, %5d] loss: %.3f" % (epoch + 1, idx + 1, np.mean(running_loss)))
                running_loss = []
        # Flush the losses gathered since the last periodic report.
        if running_loss:
            print("[%d, final] loss: %.3f" % (epoch + 1, np.mean(running_loss)))

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs = inputs.to(device="cuda")
            labels = labels.to(device="cuda")
            outputs = model(inputs)
            # Unwrap tuple outputs exactly as the training loop does; without
            # this, torch.max() fails when the model returns a tuple.
            if isinstance(outputs, tuple):
                outputs = outputs[0]
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print("Accuracy of the network on the 10000 test images: %d %%" % (100 * correct / total))
else:
def torch_to_meta(inputs, labels, batch_size, image_size, one_hot=True, num_classes=1000):
    """Convert PyTorch tensor inputs and labels to Meta arrays on CUDA.

    Parameters
    ----------
    inputs:
        PyTorch tensor of images; after conversion its shape must be
        ``(batch_size, 3, *image_size)``.
    labels:
        PyTorch tensor with one class index per sample.
    batch_size:
        Required number of samples in the batch.
    image_size:
        ``(height, width)`` of each image.
    one_hot:
        If True, labels are encoded as a float32 one-hot matrix of shape
        ``(batch_size, num_classes)``; otherwise the class indices are used as is.
    num_classes:
        Width of the one-hot encoding. Defaults to 1000, the value that used
        to be hard-coded here.

    Returns
    -------
    ``(inputs, labels)`` as Meta arrays, or ``(None, None)`` when the batch
    holds fewer than ``batch_size`` samples.
    """
    inputs = inputs.numpy()
    targets = labels.numpy()

    # FIXME: We don't support dynamic shape, so a short (last) batch cannot be
    # used as is. The intended handling is to pad it up to `batch_size` (inputs
    # with all-zero images, targets with an unused label such as 10), but doing
    # so results in a CUDA error at the last batch for a reason that is not yet
    # known. Until that is resolved the short batch is dropped.
    if targets.shape[0] < batch_size:
        return None, None

    inputs = mnm.array(inputs, device="cuda")
    assert inputs.shape == (batch_size, 3, *image_size)

    # FIXME: make nll_loss accept index in addition to one hot.
    if one_hot:
        labels = np.zeros((batch_size, num_classes), dtype="float32")
        labels[np.arange(batch_size), targets] = 1
        assert labels.shape == (batch_size, num_classes)
    else:
        labels = targets
        assert labels.shape == (batch_size,)
    labels = mnm.array(labels, device="cuda")
    return inputs, labels
def get_vm_executor(model, args):
    """Trace the model, fuse its module at opt level 1 and build a CUDA VM executor.

    Returns the executor together with the trace record obtained from
    ``model._internal(*args)``.
    """
    record = model._internal(*args)
    fused_mod = ir_fusion(record.mod, fuse_opt_level=1)
    return VMExecutor(fused_mod, "cuda").make_executor(), record
def run_executor(executor, record, args):
    """Build the VM inputs from the record and the given arguments, then run the executor."""
    # The VM inputs are derived from the traced record plus the call arguments.
    return executor(*_get_func_inputs(record, args, {}, get_handle=False))
bencher = benchmark.get_model_bencher(
    "resnet50",
    batch_size=batch_size,
    shape=image_size,
    include_orig_model=False,
)

# Inference model: the network without the loss function.
model = bencher.model
model.to(device="cuda")
model.infer_mode()

# Training model: the network with the loss function attached.
# model_w_loss and model are different models but bind to the same NDArrays,
# so model can be used for inference directly once model_w_loss is trained.
model_w_loss = bencher.model_w_loss
model_w_loss.to(device="cuda")
model_w_loss.train_mode()

sgd_wrapper = mnm.optim.sgd.with_sgd(learning_rate=0.1, momentum=0.01)
optimizer = sgd_wrapper(model_w_loss)

# Both are created lazily on the first mini-batch.
record, executor = None, None
for epoch in range(n_epoch):
    # Report the mean loss once every `print_period` mini-batches.
    print_period = 500
    running_loss = []
    for idx, (images, targets) in enumerate(trainloader):
        inputs, labels = torch_to_meta(images, targets, batch_size, image_size)
        if inputs is None and labels is None:
            # torch_to_meta could not produce a batch; stop this epoch.
            break

        # FIXME: Make optimizer accept default.
        # NOTE(review): presumably a scalar 1.0 used as the incoming gradient
        # (std=0.0, mean=1.0) -- confirm against randn_torch.
        dy, _ = randn_torch((), std=0.0, mean=1.0, requires_grad=False)
        args = [dy, inputs, labels]

        # Initialize the VM executor in the first run.
        if record is None and executor is None:
            executor, record = get_vm_executor(optimizer, args)

        # Train a mini-batch; the loss is the first leaf of the (possibly nested) result.
        loss = run_executor(executor, record, args)
        while isinstance(loss, (tuple, tvm.ir.container.Array, mnm._core.value.TupleValue)):
            loss = loss[0]
        running_loss.append(asnumpy(loss))

        if (idx + 1) % print_period == 0:
            print("[%d, %5d] loss: %.3f" % (epoch + 1, idx + 1, np.mean(running_loss)))
            running_loss = []

    # Flush the losses gathered since the last periodic report.
    if running_loss:
        print("[%d, final] loss: %.3f" % (epoch + 1, np.mean(running_loss)))
# Evaluate the inference model on the test set. The executor is reset so that
# a new one is built for `model` rather than reusing the training one.
record = None
executor = None
correct = 0
total = 0
for inputs, labels in testloader:
    inputs, _ = torch_to_meta(inputs, labels, batch_size, image_size, one_hot=False)
    if inputs is None:
        # torch_to_meta could not produce a batch (e.g. a short last batch),
        # so these samples are not evaluated.
        break
    labels = labels.numpy()
    args = [inputs]

    # Initialize the VM executor in the first run.
    if record is None and executor is None:
        executor, record = get_vm_executor(model, args)

    outputs = asnumpy(run_executor(executor, record, args)[0])
    predicted = np.argmax(outputs, axis=1)
    total += predicted.shape[0]
    correct += (predicted == labels).sum()

# Report the number of images that were actually scored: skipped batches are
# not part of `total`, so a fixed "10000" would overstate the sample count.
if total:
    print(
        "Accuracy of the network on the %d test images: %d %%"
        % (total, 100 * correct / total)
    )
else:
    print("Accuracy of the network: no full test batch was evaluated")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment