# PyTorch reference:
# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#train-the-network
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms

import tvm
import mnm
from mnm._core.executor import VMExecutor
from mnm.model.trace import _get_func_inputs
from mnm.testing import randn_torch, one_hot_torch, randint, asnumpy
from mnm.testing.utils import ir_fusion

import benchmark
# Configuration
run_pytorch = False
n_epoch = 2  # Change to a larger number for better accuracy.
batch_size = 32
image_size = (32, 32)  # CIFAR-10: 32x32, ImageNet: 224x224

transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))]
)
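# The transform above scales pixel values to [0, 1] (ToTensor) and then maps
# each channel to [-1, 1] (Normalize with mean=std=0.5).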
trainset = torchvision.datasets.CIFAR10(
    root="./cifar10", train=True, download=True, transform=transform
)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=batch_size, shuffle=True, num_workers=4
)
testset = torchvision.datasets.CIFAR10(
    root="./cifar10", train=False, download=True, transform=transform
)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=batch_size, shuffle=False, num_workers=4
)
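
# Quick sanity-check sketch (commented out; purely illustrative): each CIFAR-10
# batch from the loaders above is a (32, 3, 32, 32) image tensor plus a (32,)
# label tensor, except for the shorter final batch.
# images, labels = next(iter(trainloader))
# assert images.shape == (batch_size, 3, *image_size)
# assert labels.shape == (batch_size,)
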
if run_pytorch:
    bencher = benchmark.get_model_bencher(
        "resnet50", batch_size=batch_size, shape=image_size, include_orig_model=True
    )
    model = bencher.ref_bencher.model
    model.to(device="cuda")
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.01)

    for epoch in range(n_epoch):
        running_loss = []
        print_period = 500
        for idx, (inputs, labels) in enumerate(trainloader, 0):
            inputs = inputs.to(device="cuda")
            labels = labels.to(device="cuda")
            optimizer.zero_grad()
            t_y = model(inputs)
            # Some benchmark models return a tuple; take the logits.
            if isinstance(t_y, tuple):
                t_y = t_y[0]
            t_ypred = torch.log_softmax(t_y, dim=-1)
            t_loss = torch.nn.functional.nll_loss(t_ypred, labels)
            # If the loss comes back as a tuple, keep the differentiable element.
            if isinstance(t_loss, tuple):
                if hasattr(t_loss[0], "backward"):
                    t_loss = t_loss[0]
                else:
                    assert hasattr(t_loss[1], "backward")
                    t_loss = t_loss[1]
            t_loss.backward()
            optimizer.step()
            torch.cuda.synchronize()

            running_loss.append(t_loss.item())
            if idx % print_period == print_period - 1:
                print("[%d, %5d] loss: %.3f" % (epoch + 1, idx + 1, np.mean(running_loss)))
                running_loss = []
        if running_loss:
            print("[%d, final] loss: %.3f" % (epoch + 1, np.mean(running_loss)))

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs = inputs.to(device="cuda")
            labels = labels.to(device="cuda")
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print("Accuracy of the network on the 10000 test images: %d %%" % (100 * correct / total))
else:
    def torch_to_meta(inputs, labels, batch_size, image_size, one_hot=True):
        """Convert PyTorch tensor inputs and labels to Meta."""
        inputs = inputs.numpy()
        targets = labels.numpy()
        # FIXME: We don't support dynamic shape, so we need to pad the last batch.
        if targets.shape[0] < batch_size:
            # FIXME: Removing this early return results in a CUDA error at the last
            # batch for an unknown reason, so the padding below is unreachable for now.
            return None, None
            pad = batch_size - targets.shape[0]
            # Pad inputs with all 0s.
            inputs = np.append(inputs, np.zeros([pad, 3, *image_size], dtype="float32"), axis=0)
            # Pad targets with the unused label.
            targets = np.append(targets, [10 for _ in range(pad)])
        inputs = mnm.array(inputs, device="cuda")
        assert inputs.shape == (batch_size, 3, *image_size)
        # FIXME: Make nll_loss accept a class index in addition to one-hot labels.
        if one_hot:
            labels = np.zeros((batch_size, 1000), dtype="float32")
            labels[range(batch_size), targets] = 1
            assert labels.shape == (batch_size, 1000)
        else:
            labels = targets
            assert labels.shape == (batch_size,)
        labels = mnm.array(labels, device="cuda")
        return inputs, labels
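
    # Usage sketch (commented out; the tensors are hypothetical): a full batch
    # converts to mnm NDArrays on CUDA, while a short final batch yields
    # (None, None) because of the padding FIXME above.
    # xs, ys = torch_to_meta(torch.zeros(32, 3, 32, 32),
    #                        torch.zeros(32, dtype=torch.long),
    #                        batch_size, image_size)
    # assert xs.shape == (batch_size, 3, *image_size)
    # assert ys.shape == (batch_size, 1000)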

    def get_vm_executor(model, args):
        """Apply fusion level 1 and make a VM executor."""
        record = model._internal(*args)
        mod = record.mod
        mod = ir_fusion(mod, fuse_opt_level=1)
        executor = VMExecutor(mod, "cuda").make_executor()
        return executor, record

    def run_executor(executor, record, args):
        """Form VM inputs from the given inputs and the model parameters, then run."""
        # Prepare VM inputs.
        vm_inputs = _get_func_inputs(record, args, {}, get_handle=False)
        return executor(*vm_inputs)
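
    # Minimal usage sketch (commented out; `m` and `xs` are hypothetical): trace
    # and compile once, then reuse the executor for every batch. Caching the
    # executor is safe here because all (padded) batches share the same static shapes.
    # executor, record = get_vm_executor(m, [xs])
    # out = run_executor(executor, record, [xs])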

    bencher = benchmark.get_model_bencher(
        "resnet50", batch_size=batch_size, shape=image_size, include_orig_model=False
    )
    # The model without loss for inference.
    model = bencher.model
    model.to(device="cuda")
    model.infer_mode()
    # The model with the loss function for training.
    # Note that model_w_loss and model are different models but bind to the same
    # NDArrays, so model can be directly used for inference after training model_w_loss.
    model_w_loss = bencher.model_w_loss
    model_w_loss.to(device="cuda")
    model_w_loss.train_mode()
    optimizer = mnm.optim.sgd.with_sgd(learning_rate=0.1, momentum=0.01)(model_w_loss)

    record = None
    executor = None
    for epoch in range(n_epoch):
        running_loss = []
        print_period = 500
        for idx, (inputs, labels) in enumerate(trainloader, 0):
            inputs, labels = torch_to_meta(inputs, labels, batch_size, image_size)
            if inputs is None and labels is None:
                break
            # FIXME: Make the optimizer accept a default dy so callers need not pass one.
            dy, _ = randn_torch((), std=0.0, mean=1.0, requires_grad=False)
            args = [dy, inputs, labels]
            # Initialize the VM executor in the first run.
            if record is None and executor is None:
                executor, record = get_vm_executor(optimizer, args)
            # Train a mini-batch.
            loss = run_executor(executor, record, args)
            # Unwrap nested tuples until we reach the scalar loss.
            while isinstance(loss, (tuple, tvm.ir.container.Array, mnm._core.value.TupleValue)):
                loss = loss[0]
            running_loss.append(asnumpy(loss))
            if idx % print_period == print_period - 1:
                print("[%d, %5d] loss: %.3f" % (epoch + 1, idx + 1, np.mean(running_loss)))
                running_loss = []
        if running_loss:
            print("[%d, final] loss: %.3f" % (epoch + 1, np.mean(running_loss)))

    # Reset so the inference model gets its own executor.
    record = None
    executor = None
    correct = 0
    total = 0
    for inputs, labels in testloader:
        inputs, _ = torch_to_meta(inputs, labels, batch_size, image_size, one_hot=False)
        labels = labels.numpy()
        if inputs is None:
            break
        args = [inputs]
        # Initialize the VM executor in the first run.
        if record is None and executor is None:
            executor, record = get_vm_executor(model, args)
        outputs = asnumpy(run_executor(executor, record, args)[0])
        predicted = np.argmax(outputs, axis=1)
        total += predicted.shape[0]
        correct += (predicted == labels).sum()
    print("Accuracy of the network on the 10000 test images: %d %%" % (100 * correct / total))