eqy

Motivation

	"""
	Auto-tuning a convolutional network for NVIDIA GPU
	====================================================
	Author: `Lianmin Zheng <https://github.com/merrymercy>`_

	Auto-tuning for specific devices and workloads is critical for getting the
	best performance. This is a tutorial on how to tune a whole convolutional
	network for NVIDIA GPU.

	The operator implementation for NVIDIA GPU in TVM is written in template form.

	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#

	import logging

	import mxnet as mx
	import tvm
	import nnvm.frontend
	import nnvm.compiler
	from mxnet import gluon
	from mxnet.gluon.model_zoo import vision
	from tvm import relay
	from tvm.contrib import ndk

	#!/bin/bash
	# Watchdog for the TVM RPC tracker: start the tracker on 0.0.0.0:9190, then
	# poll it with query_rpc_tracker and start a new tracker whenever the query
	# reports "Cannot connect to tracker".
	#
	# PYTHONPATH is given as a per-command environment prefix so that it is
	# always passed to python3, even when PYTHONPATH was not already exported
	# by the calling environment.
	PYTHONPATH=/tvm/python:$PYTHONPATH python3 -m tvm.exec.rpc_tracker --host 0.0.0.0 --port 9190 &
	# NOTE(review): there is no delay between polls; consider adding a sleep if
	# back-to-back queries (or back-to-back restarts) are a problem.
	while true; do
		# Capture stdout and stderr of the query and keep only the failure line;
		# an empty result means the tracker answered.
		res=$(PYTHONPATH=/tvm/python:$PYTHONPATH python3 -m tvm.exec.query_rpc_tracker --host 0.0.0.0 --port 9190 2>&1 | grep 'Cannot connect to tracker')
		if [ -z "$res" ]; then
			echo "OK..."
		else
			echo "RESTARTING @ " $(date)
			PYTHONPATH=/tvm/python:$PYTHONPATH python3 -m tvm.exec.rpc_tracker --host 0.0.0.0 --port 9190 &
		fi
	done

	# This isn't supposed to run as a bash script; I named it with ".sh" for syntax highlighting.

	# https://developer.nvidia.com/nsight-systems
	# https://docs.nvidia.com/nsight-systems/profiling/index.html

	# My preferred nsys (command line executable used to create profiles) commands
	#
	# In your script, write
	# torch.cuda.nvtx.range_push("region name")
	# ...

	import torch
	import time

	# Setup for a convolution timing experiment comparing fp16 and bf16 modules.
	# Enable cuDNN's benchmark mode (it tries several convolution algorithms and
	# keeps the fastest one for each configuration it sees).
	torch.backends.cudnn.benchmark = True

	# Not used within the visible lines — presumably the number of timed
	# iterations; confirm against the rest of the script.
	iters = 10

	# Two otherwise identical Conv2d modules that differ only in dtype (half vs
	# bfloat16). Positional arguments are in_channels=64, out_channels=64,
	# kernel_size=3, stride=3; groups=64 equals the channel count, so every
	# channel is convolved on its own (depthwise).
	conv = torch.nn.Conv2d(64, 64, 3, 3, groups=64, dtype=torch.half, device='cuda')
	convb = torch.nn.Conv2d(64, 64, 3, 3, groups=64, dtype=torch.bfloat16, device='cuda')
	# fp16 input of shape (16, 64, 1024, 1024): 2**30 elements, i.e. 2 GiB on the GPU.
	# NOTE(review): no bfloat16 input for `convb` is created in the visible lines.
	data = torch.randn(16, 64, 1024, 1024, dtype=torch.half, device='cuda')

	import torch
	from torch.nn.functional import scaled_dot_product_attention
	from torch.nn.attention import SDPBackend, sdpa_kernel

	# Problem sizes for the attention inputs built below. seq_len_kv is not used
	# within the visible lines — presumably it sizes the key/value tensors; confirm.
	batch = 4
	seq_len_q = 512
	seq_len_kv = 1024
	D = 128
	# Sample call to SDPA - GQ
	# NOTE(review): "GQ" above presumably means GQA (grouped-query attention) — confirm.
	# Query tensor of shape (batch, 32, seq_len_q, D) = (4, 32, 512, 128), created
	# with torch.rand as bfloat16 on the CUDA device. Presumably 32 is the number
	# of query heads and D the per-head dimension — TODO confirm.
	query = torch.rand(batch, 32, seq_len_q, D, device='cuda', dtype=torch.bfloat16)