comaniac · April 8, 2021 01:32
diff --git a/meta_train_e2e_resnet.py b/meta_train_e2e_resnet.py
 # Pytorch reference
 # https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#train-the-network

 import numpy as np

 import torch
 import torchvision
 import torchvision.transforms as transforms

 import tvm

 import mnm
 from mnm._core.executor import VMExecutor
 from mnm.model.trace import _get_func_inputs
 from mnm.testing import randn_torch, one_hot_torch, randint, asnumpy
 from mnm.testing.utils import ir_fusion

 import benchmark

 # Configuration
 # Selects the training path: True runs the PyTorch branch, False runs the Meta branch.
 run_pytorch = False
 # Number of training epochs. Change to a larger number for better accuracy.
 n_epoch = 2
 # Mini-batch size.
 batch_size = 32
 # Input image size. CIFAR-10: 32x32, ImageNet: 224x224
 image_size = (32, 32)

 # Turn each image into a tensor, then normalize every channel with mean 0.5 and std 0.5.
 transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
 )

 # Training split, reshuffled by the loader.
 trainset = torchvision.datasets.CIFAR10(
    root="./cifar10",
    train=True,
    download=True,
    transform=transform,
 )
 trainloader = torch.utils.data.DataLoader(
    trainset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
 )

 # Test split, read in a fixed order.
 testset = torchvision.datasets.CIFAR10(
    root="./cifar10",
    train=False,
    download=True,
    transform=transform,
 )
 testloader = torch.utils.data.DataLoader(
    testset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
 )


 if run_pytorch:
    # Reference path: train and evaluate the PyTorch model provided by the bencher.
    bencher = benchmark.get_model_bencher(
        "resnet50", batch_size=batch_size, shape=image_size, include_orig_model=True
    )
    model = bencher.ref_bencher.model
    model.to(device="cuda")
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.01)

    # Print the mean loss once every `print_period` mini-batches.
    print_period = 500
    for epoch in range(n_epoch):
        running_loss = []
        for idx, (inputs, labels) in enumerate(trainloader):
            inputs = inputs.to(device="cuda")
            labels = labels.to(device="cuda")

            optimizer.zero_grad()
            t_y = model(inputs)
            # Some models return a tuple; only its first element is used as the prediction.
            if isinstance(t_y, tuple):
                t_y = t_y[0]

            # nll_loss returns a single tensor, so the loss can be back-propagated directly.
            t_ypred = torch.log_softmax(t_y, dim=-1)
            t_loss = torch.nn.functional.nll_loss(t_ypred, labels)
            t_loss.backward()
            optimizer.step()
            # Wait for the GPU work of this step to finish before reading the loss value.
            torch.cuda.synchronize()
            running_loss.append(t_loss.item())

            if idx % print_period == print_period - 1:
                print("[%d, %5d] loss: %.3f" % (epoch + 1, idx + 1, np.mean(running_loss)))
                running_loss = []

        # Report whatever is left after the last full print period.
        if running_loss:
            print("[%d, final] loss: %.3f" % (epoch + 1, np.mean(running_loss)))

    model.eval()

    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in testloader:
            inputs = inputs.to(device="cuda")
            labels = labels.to(device="cuda")

            outputs = model(inputs)
            # Keep only the first element of a tuple output, exactly as the training loop does.
            if isinstance(outputs, tuple):
                outputs = outputs[0]
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print("Accuracy of the network on the 10000 test images: %d %%" % (100 * correct / total))
 else:

    def torch_to_meta(inputs, labels, batch_size, image_size, one_hot=True, num_classes=1000):
        """Convert PyTorch tensor inputs and labels to Meta arrays on "cuda".

        Parameters
        ----------
        inputs : torch.Tensor
            Image batch. After conversion it must have shape
            ``(batch_size, 3, *image_size)``.
        labels : torch.Tensor
            Class indices, one per sample.
        batch_size : int
            The fixed batch size the model was built for.
        image_size : tuple of int
            Spatial size of one image.
        one_hot : bool
            When True, labels are expanded to a float32 one-hot matrix of shape
            ``(batch_size, num_classes)``; otherwise the class indices are passed through.
        num_classes : int
            Width of the one-hot encoding. Defaults to 1000, the value that used to be
            hard-coded here.

        Returns
        -------
        tuple
            ``(inputs, labels)`` as Meta arrays, or ``(None, None)`` when the batch holds
            fewer than ``batch_size`` samples.
        """
        inputs = inputs.numpy()
        targets = labels.numpy()

        # FIXME: We don't support dynamic shape, so a short (last) batch cannot be fed as-is.
        # Padding it up to batch_size (all-zero images plus a filler label) was the intended
        # fix, but running the padded batch results in a CUDA error at the last batch for a
        # yet unknown reason. Until that is understood the short batch is dropped and the
        # caller is told via (None, None).
        if targets.shape[0] < batch_size:
            return None, None

        inputs = mnm.array(inputs, device="cuda")
        assert inputs.shape == (batch_size, 3, *image_size)

        # FIXME: make nll_loss accept index in addition to one hot.
        if one_hot:
            labels = np.zeros((batch_size, num_classes), dtype="float32")
            # Set exactly one entry per row: the column given by that sample's class index.
            labels[np.arange(batch_size), targets] = 1
        else:
            labels = targets
            assert labels.shape == (batch_size,)

        labels = mnm.array(labels, device="cuda")
        return inputs, labels

    def get_vm_executor(model, args):
        """Build a CUDA VM executor for `model` with fusion level 1 applied.

        The record obtained from ``model._internal(*args)`` supplies the module to fuse.
        It is returned next to the executor because it is needed again when the VM
        inputs are formed.
        """
        record = model._internal(*args)
        fused_mod = ir_fusion(record.mod, fuse_opt_level=1)
        return VMExecutor(fused_mod, "cuda").make_executor(), record

    def run_executor(executor, record, args):
        """Form the VM inputs from `record` and `args` (inputs and parameters), then run the executor."""
        return executor(*_get_func_inputs(record, args, {}, get_handle=False))

    # Fetch the ResNet-50 bencher for the configured batch and image size; the original
    # (reference) model is not requested on this path.
    bencher = benchmark.get_model_bencher(
        "resnet50", batch_size=batch_size, shape=image_size, include_orig_model=False
    )

    # The model without loss, used for inference. It is moved to the GPU and switched to
    # inference mode up front.
    model = bencher.model
    model.to(device="cuda")
    model.infer_mode()

    # The model with loss function for training.
    # Note that model_w_loss and model are different models but bind to the same NDArrays,
    # so model can be directly used for inference after training model_w_loss.
    model_w_loss = bencher.model_w_loss
    model_w_loss.to(device="cuda")
    model_w_loss.train_mode()
    # Wrap the training model with SGD, using the same learning rate and momentum values
    # as the PyTorch branch.
    optimizer = mnm.optim.sgd.with_sgd(learning_rate=0.1, momentum=0.01)(model_w_loss)

    # The executor and its record are built on the first mini-batch, because building them
    # needs a concrete argument list.
    executor = record = None
    for epoch in range(n_epoch):
        print_period = 500
        running_loss = []
        for idx, (images, targets) in enumerate(trainloader):
            inputs, labels = torch_to_meta(images, targets, batch_size, image_size)
            # torch_to_meta hands back (None, None) for a short batch; stop this epoch there.
            if inputs is None and labels is None:
                break

            # FIXME: Make optimizer accept default.
            dy, _ = randn_torch((), std=0.0, mean=1.0, requires_grad=False)
            args = [dy, inputs, labels]

            # Initialize the VM executor in the first run.
            if record is None and executor is None:
                executor, record = get_vm_executor(optimizer, args)

            # Train a mini-batch, then peel nested containers until the first leaf is reached.
            loss = run_executor(executor, record, args)
            while isinstance(loss, (tuple, tvm.ir.container.Array, mnm._core.value.TupleValue)):
                loss = loss[0]
            running_loss.append(asnumpy(loss))

            # Print the mean loss of every full window of `print_period` mini-batches.
            if (idx + 1) % print_period == 0:
                print("[%d, %5d] loss: %.3f" % (epoch + 1, idx + 1, np.mean(running_loss)))
                running_loss = []

        # Report the losses collected since the last periodic print, if any.
        if running_loss:
            print("[%d, final] loss: %.3f" % (epoch + 1, np.mean(running_loss)))

    # Inference runs a different model than training, so discard the training executor and
    # build a new one on the first test batch.
    record = None
    executor = None
    correct = 0
    total = 0
    for inputs, labels in testloader:
        inputs, _ = torch_to_meta(inputs, labels, batch_size, image_size, one_hot=False)
        # torch_to_meta returns None for a batch shorter than batch_size; stop there.
        if inputs is None:
            break
        labels = labels.numpy()

        args = [inputs]

        # Initialize the VM executor in the first run.
        if record is None and executor is None:
            executor, record = get_vm_executor(model, args)

        outputs = asnumpy(run_executor(executor, record, args)[0])
        predicted = np.argmax(outputs, axis=1)
        total += predicted.shape[0]
        correct += (predicted == labels).sum()

    # Report the number of images actually evaluated: a dropped short batch makes it smaller
    # than the size of the test set, and it is zero when no full batch was available.
    if total:
        print(
            "Accuracy of the network on the %d test images: %d %%"
            % (total, 100 * correct / total)
        )
    else:
        print("No full test batch was evaluated; accuracy is undefined.")
	# Pytorch reference
	# https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#train-the-network

	import numpy as np

	import torch
	import torchvision
	import torchvision.transforms as transforms

	import tvm

	import mnm
	from mnm._core.executor import VMExecutor
	from mnm.model.trace import _get_func_inputs
	from mnm.testing import randn_torch, one_hot_torch, randint, asnumpy
	from mnm.testing.utils import ir_fusion

	import benchmark

	# Configuration
	# Selects the training path: True runs the PyTorch branch, False runs the Meta branch.
	run_pytorch = False
	# Number of training epochs. Change to a larger number for better accuracy.
	n_epoch = 2
	# Mini-batch size.
	batch_size = 32
	# Input image size. CIFAR-10: 32x32, ImageNet: 224x224
	image_size = (32, 32)

	# Turn each image into a tensor, then normalize every channel with mean 0.5 and std 0.5.
	transform = transforms.Compose(
	    [
	        transforms.ToTensor(),
	        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
	    ]
	)

	# Training split, reshuffled by the loader.
	trainset = torchvision.datasets.CIFAR10(
	    root="./cifar10",
	    train=True,
	    download=True,
	    transform=transform,
	)
	trainloader = torch.utils.data.DataLoader(
	    trainset,
	    batch_size=batch_size,
	    shuffle=True,
	    num_workers=4,
	)

	# Test split, read in a fixed order.
	testset = torchvision.datasets.CIFAR10(
	    root="./cifar10",
	    train=False,
	    download=True,
	    transform=transform,
	)
	testloader = torch.utils.data.DataLoader(
	    testset,
	    batch_size=batch_size,
	    shuffle=False,
	    num_workers=4,
	)


	if run_pytorch:
	    # Reference path: train and evaluate the PyTorch model provided by the bencher.
	    bencher = benchmark.get_model_bencher(
	        "resnet50", batch_size=batch_size, shape=image_size, include_orig_model=True
	    )
	    model = bencher.ref_bencher.model
	    model.to(device="cuda")
	    model.train()
	    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.01)

	    # Print the mean loss once every `print_period` mini-batches.
	    print_period = 500
	    for epoch in range(n_epoch):
	        running_loss = []
	        for idx, (inputs, labels) in enumerate(trainloader):
	            inputs = inputs.to(device="cuda")
	            labels = labels.to(device="cuda")

	            optimizer.zero_grad()
	            t_y = model(inputs)
	            # Some models return a tuple; only its first element is used as the prediction.
	            if isinstance(t_y, tuple):
	                t_y = t_y[0]

	            # nll_loss returns a single tensor, so the loss can be back-propagated directly.
	            t_ypred = torch.log_softmax(t_y, dim=-1)
	            t_loss = torch.nn.functional.nll_loss(t_ypred, labels)
	            t_loss.backward()
	            optimizer.step()
	            # Wait for the GPU work of this step to finish before reading the loss value.
	            torch.cuda.synchronize()
	            running_loss.append(t_loss.item())

	            if idx % print_period == print_period - 1:
	                print("[%d, %5d] loss: %.3f" % (epoch + 1, idx + 1, np.mean(running_loss)))
	                running_loss = []

	        # Report whatever is left after the last full print period.
	        if running_loss:
	            print("[%d, final] loss: %.3f" % (epoch + 1, np.mean(running_loss)))

	    model.eval()

	    correct = 0
	    total = 0
	    with torch.no_grad():
	        for inputs, labels in testloader:
	            inputs = inputs.to(device="cuda")
	            labels = labels.to(device="cuda")

	            outputs = model(inputs)
	            # Keep only the first element of a tuple output, exactly as the training loop does.
	            if isinstance(outputs, tuple):
	                outputs = outputs[0]
	            _, predicted = torch.max(outputs, 1)
	            total += labels.size(0)
	            correct += (predicted == labels).sum().item()

	    print("Accuracy of the network on the 10000 test images: %d %%" % (100 * correct / total))
	else:

	    def torch_to_meta(inputs, labels, batch_size, image_size, one_hot=True, num_classes=1000):
	        """Convert PyTorch tensor inputs and labels to Meta arrays on "cuda".

	        Returns ``(inputs, labels)`` as Meta arrays, or ``(None, None)`` when the batch
	        holds fewer than ``batch_size`` samples. With ``one_hot=True`` the labels become a
	        float32 one-hot matrix of shape ``(batch_size, num_classes)``; otherwise the class
	        indices are passed through. ``num_classes`` defaults to 1000, the value that used
	        to be hard-coded here.
	        """
	        inputs = inputs.numpy()
	        targets = labels.numpy()

	        # FIXME: We don't support dynamic shape, so a short (last) batch cannot be fed as-is.
	        # Padding it up to batch_size (all-zero images plus a filler label) was the intended
	        # fix, but running the padded batch results in a CUDA error at the last batch for a
	        # yet unknown reason. Until that is understood the short batch is dropped and the
	        # caller is told via (None, None).
	        if targets.shape[0] < batch_size:
	            return None, None

	        inputs = mnm.array(inputs, device="cuda")
	        assert inputs.shape == (batch_size, 3, *image_size)

	        # FIXME: make nll_loss accept index in addition to one hot.
	        if one_hot:
	            labels = np.zeros((batch_size, num_classes), dtype="float32")
	            # Set exactly one entry per row: the column given by that sample's class index.
	            labels[np.arange(batch_size), targets] = 1
	        else:
	            labels = targets
	            assert labels.shape == (batch_size,)

	        labels = mnm.array(labels, device="cuda")
	        return inputs, labels

	    def get_vm_executor(model, args):
	        """Apply fusion level 1 and make a VM executor; also return the model's record."""
	        record = model._internal(*args)
	        mod = record.mod
	        mod = ir_fusion(mod, fuse_opt_level=1)
	        executor = VMExecutor(mod, "cuda").make_executor()
	        return executor, record

	    def run_executor(executor, record, args):
	        """Form VM inputs with inputs and parameters and run the executor."""
	        # Prepare VM inputs.
	        vm_inputs = _get_func_inputs(record, args, {}, get_handle=False)
	        return executor(*vm_inputs)

	    bencher = benchmark.get_model_bencher(
	        "resnet50", batch_size=batch_size, shape=image_size, include_orig_model=False
	    )

	    # The model without loss for inference.
	    model = bencher.model
	    model.to(device="cuda")
	    model.infer_mode()

	    # The model with loss function for training.
	    # Note that model_w_loss and model are different models but bind to the same NDArrays,
	    # so model can be directly used for inference after training model_w_loss.
	    model_w_loss = bencher.model_w_loss
	    model_w_loss.to(device="cuda")
	    model_w_loss.train_mode()
	    optimizer = mnm.optim.sgd.with_sgd(learning_rate=0.1, momentum=0.01)(model_w_loss)

	    # The executor and its record are built on the first mini-batch, because building them
	    # needs a concrete argument list.
	    record = None
	    executor = None
	    for epoch in range(n_epoch):
	        running_loss = []
	        print_period = 500
	        for idx, (inputs, labels) in enumerate(trainloader):
	            inputs, labels = torch_to_meta(inputs, labels, batch_size, image_size)
	            # torch_to_meta hands back (None, None) for a short batch; stop this epoch there.
	            if inputs is None and labels is None:
	                break

	            # FIXME: Make optimizer accept default.
	            dy, _ = randn_torch((), std=0.0, mean=1.0, requires_grad=False)

	            args = [dy, inputs, labels]

	            # Initialize the VM executor in the first run.
	            if record is None and executor is None:
	                executor, record = get_vm_executor(optimizer, args)

	            # Train a mini-batch, then peel nested containers until the first leaf is reached.
	            loss = run_executor(executor, record, args)
	            while isinstance(loss, (tuple, tvm.ir.container.Array, mnm._core.value.TupleValue)):
	                loss = loss[0]

	            running_loss.append(asnumpy(loss))
	            if idx % print_period == print_period - 1:
	                print("[%d, %5d] loss: %.3f" % (epoch + 1, idx + 1, np.mean(running_loss)))
	                running_loss = []

	        # Report the losses collected since the last periodic print, if any.
	        if running_loss:
	            print("[%d, final] loss: %.3f" % (epoch + 1, np.mean(running_loss)))

	    # Inference runs a different model than training, so discard the training executor and
	    # build a new one on the first test batch.
	    record = None
	    executor = None
	    correct = 0
	    total = 0
	    for inputs, labels in testloader:
	        inputs, _ = torch_to_meta(inputs, labels, batch_size, image_size, one_hot=False)
	        # torch_to_meta returns None for a batch shorter than batch_size; stop there.
	        if inputs is None:
	            break
	        labels = labels.numpy()

	        args = [inputs]

	        # Initialize the VM executor in the first run.
	        if record is None and executor is None:
	            executor, record = get_vm_executor(model, args)

	        outputs = asnumpy(run_executor(executor, record, args)[0])
	        predicted = np.argmax(outputs, axis=1)
	        total += predicted.shape[0]
	        correct += (predicted == labels).sum()

	    # Report the number of images actually evaluated: a dropped short batch makes it smaller
	    # than the size of the test set, and it is zero when no full batch was available.
	    if total:
	        print(
	            "Accuracy of the network on the %d test images: %d %%"
	            % (total, 100 * correct / total)
	        )
	    else:
	        print("No full test batch was evaluated; accuracy is undefined.")