Lorien: A Hyper-Automated Tuning System for Tensor Operators

Lorien is a system built on top of TVM to massively explore and benchmark the best schedule configurations for TOPI schedules.

Motivation

Although TVM already ships TOPI (TVM Operator Inventory) with algorithm and schedule implementations for commonly used operators such as conv2d and dense, there is a challenge that makes TOPI hard to improve efficiently.

The snippets below use the last conv2d layer in ResNet as the running workload:

import numpy as np
import tvm
from tvm import auto_scheduler, te, topi
from tvm.te import schedule

# The last layer in ResNet
H, W, CO, CI, KH, KW, strides, padding = 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)

def conv2d(N, H, W, CO, CI, KH, KW, stride, padding):
    # The original snippet breaks off here; the body below follows the
    # standard TOPI conv2d workload definition (assumption).
    data = te.placeholder((N, CI, H, W), name="data")
    kernel = te.placeholder((CO, CI, KH, KW), name="kernel")
    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1)
    return [data, kernel, conv]
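To make "exploring and benchmarking schedule configs" concrete, here is a minimal sketch of tuning the workload above with TVM's auto_scheduler. It is not part of the original gist and not Lorien's own API; it assumes the tvm.auto_scheduler interface (SearchTask, TuningOptions, apply_best), and the log file name "conv2d.json", the trial budget, and N=1 are placeholders.

# Minimal tuning sketch (assumption), not part of the original gist.
target = tvm.target.Target("llvm")

# Register the workload so auto_scheduler can reconstruct it from its key.
auto_scheduler.register_workload("conv2d", f=conv2d)

task = auto_scheduler.SearchTask(
    func="conv2d",
    args=(1, H, W, CO, CI, KH, KW, strides, padding),  # N=1
    target=target,
)
tune_option = auto_scheduler.TuningOptions(
    num_measure_trials=64,  # tiny budget, for illustration only
    measure_callbacks=[auto_scheduler.RecordToFile("conv2d.json")],
)
task.tune(tune_option)
sch, args = task.apply_best("conv2d.json")
func = tvm.build(sch, args, target)

Lorien's role, per the description above, is to run this kind of exploration at scale across many workloads and targets and to manage the resulting schedule configurations.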

"""BYOC Demo using TensorRT."""
# pylint: disable=invalid-name,redefined-outer-name,missing-function-docstring
#
# Build TVM with the following settings in config.cmake:
#   set(USE_TENSORRT_CODEGEN ON)
#   set(USE_TENSORRT_RUNTIME ON)
# Add TensorRT to LD_LIBRARY_PATH if you installed it from the tarball:
#   export LD_LIBRARY_PATH=/path/to/tensorrt/lib:$LD_LIBRARY_PATH
import numpy as np
import tvm
from tvm import relay
from tvm.runtime.vm import VirtualMachine

target = "cuda"
data_shape = (relay.Any(), 3, 224, 224)  # dynamic batch dimension
weight_shape = (32, 3, 3, 3)
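The demo file is cut off after these definitions. As a rough sketch of where it is headed, the snippet below builds a one-convolution Relay module with a dynamic batch size, partitions it for TensorRT, and runs it on the Relay VM. The partition_for_tensorrt call and its (mod, config) return value follow the TVM 0.8-era BYOC tutorial and may differ in other TVM versions; the network, weights, and batch size are illustrative assumptions.

from tvm.relay.op.contrib.tensorrt import partition_for_tensorrt

# A toy network: a single conv2d matching data_shape/weight_shape above (assumption).
data = relay.var("data", shape=data_shape, dtype="float32")
weight = relay.var("weight", shape=weight_shape, dtype="float32")
out = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1))
mod = tvm.IRModule.from_expr(relay.Function([data, weight], out))
params = {"weight": np.random.uniform(size=weight_shape).astype("float32")}

# Offload supported operators to TensorRT (TVM 0.8-era API; assumption).
mod, config = partition_for_tensorrt(mod, params)
with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}):
    vm_exec = relay.vm.compile(mod, target=target, params=params)

vm = VirtualMachine(vm_exec, tvm.cuda(0))
x = np.random.uniform(size=(1, 3, 224, 224)).astype("float32")
print(vm.run(x).shape)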
import time
import logging

import numpy as np
import tvm
from tvm import relay, te, topi, transform, auto_scheduler
from tvm.contrib import graph_runtime
from tvm.relay.backend import compile_engine

# logging.basicConfig(level=logging.INFO)

# The lines below are an excerpt of an auto_scheduler-generated TE schedule for
# the conv2d gradient workload (most likely printed from a tuned state via
# ComputeDAG). The surrounding code that defines the stages (extracted_reduction,
# pad_temp, compute_kernel_grad, ...) and the schedule `s` is not part of the excerpt.
extracted_reduction_ax0, extracted_reduction_ax1, extracted_reduction_ax2, extracted_reduction_ax3, extracted_reduction_n0_n0_k1_shifted_shifted, extracted_reduction_n1_n1_k2_shifted_shifted, extracted_reduction_n2_n2_k3_shifted_shifted = tuple(extracted_reduction.op.axis) + tuple(extracted_reduction.op.reduce_axis)
pad_temp_data_grad_ax0, pad_temp_data_grad_ax1, pad_temp_data_grad_ax2, pad_temp_data_grad_ax3 = tuple(pad_temp_data_grad.op.axis) + tuple(pad_temp_data_grad.op.reduce_axis)
pad_temp_i0, pad_temp_i1, pad_temp_i2, pad_temp_i3 = tuple(pad_temp.op.axis) + tuple(pad_temp.op.reduce_axis)
compute_kernel_grad_ax0, compute_kernel_grad_ax1, compute_kernel_grad_ax2, compute_kernel_grad_ax3, compute_kernel_grad_n0_n0_k0_shifted_shifted, compute_kernel_grad_n1_n1_k2_shifted_shifted, compute_kernel_grad_n2_n2_k3_shifted_shifted = tuple(compute_kernel_grad.op.axis) + tuple(compute_kernel_grad.op.reduce_axis)
compute_kernel_grad_local, = s.cache_write([compute_kernel_grad], "local")
# The original text breaks off mid-statement here; the unpacking below is
# reconstructed by analogy with the compute_kernel_grad line above (assumption).
compute_kernel_grad_local_ax0, compute_kernel_grad_local_ax1, compute_kernel_grad_local_ax2, compute_kernel_grad_local_ax3, compute_kernel_grad_local_n0_n0_k0_shifted_shifted, compute_kernel_grad_local_n1_n1_k2_shifted_shifted, compute_kernel_grad_local_n2_n2_k3_shifted_shifted = tuple(compute_kernel_grad_local.op.axis) + tuple(compute_kernel_grad_local.op.reduce_axis)
import os
import logging

import numpy as np
import tvm
from tvm import auto_scheduler, te, topi
from tvm.topi.nn.util import get_pad_tuple
from tvm.auto_scheduler.compute_dag import ComputeDAG

# The last layer in ResNet
H, W, CO, CI, KH, KW, strides, padding = 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)

def conv2d_diff(N, H, W, CO, CI, KH, KW, stride, padding):
    data = te.placeholder((N, CI, H, W), name="data")
    # The original snippet is truncated after the line above; the rest of the
    # body is an assumed reconstruction: declare the forward conv2d and
    # differentiate it with te.gradient to obtain the *_grad stages that the
    # generated schedule above refers to.
    kernel = te.placeholder((CO, CI, KH, KW), name="kernel")
    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1)
    [data_grad, kernel_grad] = te.gradient(conv, [data, kernel])
    return [data, kernel, data_grad, kernel_grad]
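Given the ComputeDAG import above, a natural next step is to inspect the differentiated compute directly. A minimal sketch, assuming that intent (N=1 and the printing calls are illustrative, not from the original file):

# Build the auto_scheduler ComputeDAG for the gradient workload and print
# its compute definition plus the naive initial schedule state.
tensors = conv2d_diff(1, H, W, CO, CI, KH, KW, strides, padding)
dag = ComputeDAG(tensors)
print(dag)                   # fused compute, including the *_grad stages
print(dag.get_init_state())  # the schedule state auto_scheduler starts from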
# Excerpt: graph-level tuning of a PyTorch/TorchVision model with autotvm's
# DPTuner (the body of this snippet is truncated).
import numpy as np
import tvm
from tvm import relay
from tvm.autotvm.graph_tuner import DPTuner
from tvm.contrib import graph_runtime
import torch
import torchvision

# Excerpt: compiling a Gluon CV model (the body of this snippet is truncated).
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_runtime
from tvm.relay.backend import compile_engine
import gluoncv as gcv

model_name = 'MobileNet1.0'
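The Gluon CV excerpt stops right after naming the model. For completeness, here is a minimal sketch of the usual next steps: import the model into Relay via the MXNet frontend, build it, and run one inference with the graph runtime. The input shape, CPU target, and random input are illustrative assumptions, not from the original file.

# Load the pretrained Gluon CV model and convert it to Relay.
net = gcv.model_zoo.get_model(model_name, pretrained=True)
input_shape = (1, 3, 224, 224)  # assumed NCHW input shape for MobileNet1.0
mod, params = relay.frontend.from_mxnet(net, shape={"data": input_shape})

# Compile and run one inference on CPU with the graph runtime.
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target="llvm", params=params)
dev = tvm.cpu(0)
module = graph_runtime.GraphModule(lib["default"](dev))
module.set_input("data", np.random.uniform(size=input_shape).astype("float32"))
module.run()
print(module.get_output(0).shape)  # (1, 1000) class scores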