December 18, 2024 18:29 · December 13, 2024 01:12 · November 25, 2024 22:47 · November 25, 2024 22:45 · November 25, 2024 19:33 · November 4, 2024 18:34
 """
 Original kernel is from https://github.com/triton-lang/triton/issues/4906.

 This kernel is modified to use dot_scaled and fp4. It _should_ be faster than int4 because it skips the int->float conversion, but it's not.
 """

 # pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly;
 # OMP_NUM_THREADS=16 CUDA_VISIBLE_DEVICES=0 ipython3 A100_vs_4090_test.py
 ##########################################################################
 import torch
 # Results:
 #
 # Vertical indices ms:  2.8862898349761963
 # Horizontal indices ms:  0.3734990060329437

 import torch
 import triton
 import triton.language as tl

 BLOCK_SIZE = 64
 --- mobicham.py	2024-11-25 14:02:15.355460967 -0800
 +++ mobicham_fp4.py	2024-11-25 14:44:09.015276420 -0800
 @@ -42,6 +42,7 @@
     a_ptr, b_ptr, c_ptr,
     M, N, K,
     elements_per_sample: tl.constexpr,
 +    b_type: tl.constexpr,
     stride_am, stride_ak,
     stride_bk, stride_bn,
     stride_cm, stride_cn,
 # pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly;
 # OMP_NUM_THREADS=16 CUDA_VISIBLE_DEVICES=0 ipython3 A100_vs_4090_test.py
 ##########################################################################
 import torch
 import triton
 import triton.language as tl
 from triton.testing import do_bench

 import itertools
 # AOT ID: ['0_inference']
 from ctypes import c_void_p, c_long, c_int
 import torch
 import math
 import random
 import os
 import tempfile
 from math import inf, nan
 from torch._inductor.hooks import run_intermediate_hooks
 from torch._inductor.utils import maybe_profile
 # AOT ID: ['0_inference']
 from ctypes import c_void_p, c_long, c_int
 import torch
 import math
 import random
 import os
 import tempfile
 from math import inf, nan
 from torch._inductor.hooks import run_intermediate_hooks
 from torch._inductor.utils import maybe_profile
 # AOT ID: ['0_inference']
 from ctypes import c_void_p, c_long, c_int
 import torch
 import math
 import random
 import os
 import tempfile
 from math import inf, nan
 from torch._inductor.hooks import run_intermediate_hooks
 from torch._inductor.utils import maybe_profile
 import torch
 import torch._functorch.config

 def fn(values, offsets, w):
    """Run ten rounds of nested-tensor self-attention, ``cos`` and a matmul.

    Each round wraps ``values``/``offsets`` as a jagged nested tensor,
    attends that tensor to itself, unwraps the packed values again,
    applies ``cos`` element-wise and right-multiplies the result by ``w``.

    Args:
        values: Packed values buffer for the jagged nested tensor.
            Presumably 2-D with a trailing dimension of 64, since it is
            viewed as ``4 x 16`` and later as ``64`` -- TODO confirm.
        offsets: Offsets handed to ``nested_tensor_from_jagged`` together
            with ``min_seqlen=1`` and ``max_seqlen=4``.
        w: Right-hand operand of the final matmul. Presumably ``(64, 64)``
            so the product can feed the next round -- TODO confirm.

    Returns:
        The ``values`` tensor produced by the last round.
    """
    for _ in range(10):
        # Build the nested tensor, split the trailing dim into (4, 16) and
        # swap dims 1 and 2 before attention (presumably heads-first layout
        # for SDPA -- verify against the caller).
        nt = torch.nested.nested_tensor_from_jagged(values, offsets, min_seqlen=1, max_seqlen=4).view(-1, -1, 4, 16).transpose(1, 2)
        # Self-attention: the same nested tensor is query, key and value.
        nt = torch.nn.functional.scaled_dot_product_attention(nt, nt, nt)
        # Undo the transpose, merge the last two dims back into 64, and drop
        # down to the underlying values buffer before the element-wise cos.
        values = nt.transpose(1, 2).view(-1, -1, 64).values().cos()
        values = values @ w
    return values
 /home/dberard/local/pytorch/torch/backends/cudnn/__init__.py:106: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.
  warnings.warn(
 /home/dberard/local/pytorch/torch/backends/cudnn/__init__.py:106: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.
  warnings.warn(
 /home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/z3/z3core.py:5: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
  import pkg_resources
 /home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('ruamel')`.
 Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See ht
 E
 ======================================================================
 ERROR: test_torch_function_call_to_size_within_aot_autograd_graph (__main__.TestNestedTensor.test_torch_function_call_to_size_within_aot_autograd_graph)
 ----------------------------------------------------------------------
 Traceback (most recent call last):
  File "/data/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 2739, in wrapper
    method(*args, **kwargs)
  File "/data/users/dberard/pytorch/test/dynamo/test_subclasses.py", line 1403, in test_torch_function_call_to_size_within_aot_autograd_graph
    compiled_fn(x, y)
  File "/data/users/dberard/pytorch/torch/_dynamo/eval_frame.py", line 451, in _fn
	"""
	Original kernel is from https://github.com/triton-lang/triton/issues/4906.

	This kernel is modified to use dot_scaled and fp4. It _should_ be faster than int4 because it skips the int->float conversion, but it's not.
	"""

	# pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly;
	# OMP_NUM_THREADS=16 CUDA_VISIBLE_DEVICES=0 ipython3 A100_vs_4090_test.py
	##########################################################################
	import torch
	# Results:
	#
	# Vertical indices ms: 2.8862898349761963
	# Horizontal indices ms: 0.3734990060329437

	import torch
	import triton
	import triton.language as tl

	BLOCK_SIZE = 64
	--- mobicham.py 2024-11-25 14:02:15.355460967 -0800
	+++ mobicham_fp4.py 2024-11-25 14:44:09.015276420 -0800
	@@ -42,6 +42,7 @@
	a_ptr, b_ptr, c_ptr,
	M, N, K,
	elements_per_sample: tl.constexpr,
	+ b_type: tl.constexpr,
	stride_am, stride_ak,
	stride_bk, stride_bn,
	stride_cm, stride_cn,
	# AOT ID: ['0_inference']
	from ctypes import c_void_p, c_long, c_int
	import torch
	import math
	import random
	import os
	import tempfile
	from math import inf, nan
	from torch._inductor.hooks import run_intermediate_hooks
	from torch._inductor.utils import maybe_profile
	import torch
	import torch._functorch.config

	def fn(values, offsets, w):
	for _ in range(10):
	nt = torch.nested.nested_tensor_from_jagged(values, offsets, min_seqlen=1, max_seqlen=4).view(-1, -1, 4, 16).transpose(1, 2)
	nt = torch.nn.functional.scaled_dot_product_attention(nt, nt, nt)
	values = nt.transpose(1, 2).view(-1, -1, 64).values().cos()
	values = values @ w
	return values
	/home/dberard/local/pytorch/torch/backends/cudnn/__init__.py:106: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.
	warnings.warn(
	/home/dberard/local/pytorch/torch/backends/cudnn/__init__.py:106: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.
	warnings.warn(
	/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/z3/z3core.py:5: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
	import pkg_resources
	/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('ruamel')`.
	Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See ht
	E
	======================================================================
	ERROR: test_torch_function_call_to_size_within_aot_autograd_graph (__main__.TestNestedTensor.test_torch_function_call_to_size_within_aot_autograd_graph)
	----------------------------------------------------------------------
	Traceback (most recent call last):
	File "/data/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 2739, in wrapper
	method(*args, **kwargs)
	File "/data/users/dberard/pytorch/test/dynamo/test_subclasses.py", line 1403, in test_torch_function_call_to_size_within_aot_autograd_graph
	compiled_fn(x, y)
	File "/data/users/dberard/pytorch/torch/_dynamo/eval_frame.py", line 451, in _fn