Skip to content

Instantly share code, notes, and snippets.

View davidberard98's full-sized avatar

David Berard davidberard98

  • PyTorch
  • Menlo Park, CA
View GitHub Profile
"""
Original kernel is from https://github.com/triton-lang/triton/issues/4906.
This kernel is modified to use dot_scaled and fp4. It _should_ be faster than int4 because it skips the int->float conversion, but it's not.
"""
# pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly;
# OMP_NUM_THREADS=16 CUDA_VISIBLE_DEVICES=0 ipython3 A100_vs_4090_test.py
##########################################################################
import torch
# Results:
#
# Vertical indices ms: 2.8862898349761963
# Horizontal indices ms: 0.3734990060329437
import torch
import triton
import triton.language as tl
BLOCK_SIZE = 64
--- mobicham.py 2024-11-25 14:02:15.355460967 -0800
+++ mobicham_fp4.py 2024-11-25 14:44:09.015276420 -0800
@@ -42,6 +42,7 @@
a_ptr, b_ptr, c_ptr,
M, N, K,
elements_per_sample: tl.constexpr,
+ b_type: tl.constexpr,
stride_am, stride_ak,
stride_bk, stride_bn,
stride_cm, stride_cn,
# pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly;
# OMP_NUM_THREADS=16 CUDA_VISIBLE_DEVICES=0 ipython3 A100_vs_4090_test.py
##########################################################################
import torch
import triton
import triton.language as tl
from triton.testing import do_bench
import itertools
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
import torch
import torch._functorch.config
def fn(values, offsets, w):
    """Run ten rounds of jagged self-attention followed by a cos and a matmul.

    Each round wraps ``values``/``offsets`` in a jagged nested tensor, applies
    ``scaled_dot_product_attention`` with the same tensor as query, key and
    value, flattens the result back to a dense values buffer, takes the
    element-wise cosine and multiplies by ``w``.

    NOTE(review): the indentation of this snippet was lost in the source it
    was copied from. The layout below (everything except ``return`` inside the
    loop) is the most plausible reading -- confirm against the original.

    Args:
        values: Packed values buffer passed to ``nested_tensor_from_jagged``.
            Presumably its trailing dimension is 64, since it is viewed as
            4 x 16 below -- TODO confirm.
        offsets: Offsets passed to ``nested_tensor_from_jagged``.
        w: Right-hand operand of the ``@`` applied to the values each round.
            Presumably maps 64 features back to 64 so that the next round's
            ``view`` still applies -- TODO confirm.

    Returns:
        The values tensor produced by the final round.
    """
    for _ in range(10):
        # Build the jagged nested tensor (sequence lengths declared as 1..4),
        # split the trailing dimension into 4 x 16, and swap dims 1 and 2
        # (presumably heads x head_dim, moving heads ahead of the ragged
        # sequence dimension -- verify).
        nt = torch.nested.nested_tensor_from_jagged(values, offsets, min_seqlen=1, max_seqlen=4).view(-1, -1, 4, 16).transpose(1, 2)
        # Self-attention: the same nested tensor is used as query, key and value.
        nt = torch.nn.functional.scaled_dot_product_attention(nt, nt, nt)
        # Undo the transpose, merge the 4 x 16 split back into 64, then drop
        # the nested wrapper via .values() before applying cos.
        values = nt.transpose(1, 2).view(-1, -1, 64).values().cos()
        values = values @ w
    return values
/home/dberard/local/pytorch/torch/backends/cudnn/__init__.py:106: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.
warnings.warn(
/home/dberard/local/pytorch/torch/backends/cudnn/__init__.py:106: UserWarning: PyTorch was compiled without cuDNN/MIOpen support. To use cuDNN/MIOpen, rebuild PyTorch making sure the library is visible to the build system.
warnings.warn(
/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/z3/z3core.py:5: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
import pkg_resources
/home/dberard/local/miniconda3/envs/pytorch/lib/python3.10/site-packages/pkg_resources/__init__.py:2871: DeprecationWarning: Deprecated call to `pkg_resources.declare_namespace('ruamel')`.
Implementing implicit namespace packages (as specified in PEP 420) is preferred to `pkg_resources.declare_namespace`. See ht
E
======================================================================
ERROR: test_torch_function_call_to_size_within_aot_autograd_graph (__main__.TestNestedTensor.test_torch_function_call_to_size_within_aot_autograd_graph)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/data/users/dberard/pytorch/torch/testing/_internal/common_utils.py", line 2739, in wrapper
method(*args, **kwargs)
File "/data/users/dberard/pytorch/test/dynamo/test_subclasses.py", line 1403, in test_torch_function_call_to_size_within_aot_autograd_graph
compiled_fn(x, y)
File "/data/users/dberard/pytorch/torch/_dynamo/eval_frame.py", line 451, in _fn