Add torch.cuda.cudart().cudaProfilerStart() and torch.cuda.cudart().cudaProfilerStop()
at the points where profiling should start and stop.
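For example, a minimal sketch of where those calls could sit in a benchmark script (the shapes and iteration counts are placeholders, not the actual benchmark settings):

import torch
import torch.nn.functional as F

q = k = v = torch.randn(2, 16, 4096, 64, device="cuda", dtype=torch.bfloat16)

for _ in range(10):  # warmup, kept outside the profiled region
    F.scaled_dot_product_attention(q, k, v)
torch.cuda.synchronize()

torch.cuda.cudart().cudaProfilerStart()  # profiled region begins
for _ in range(20):
    F.scaled_dot_product_attention(q, k, v)
torch.cuda.synchronize()
torch.cuda.cudart().cudaProfilerStop()  # profiled region ends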
Then launch the profiler with:
CUDA_VISIBLE_DEVICES=0,1,2,3 \
nsys profile \
-w true \
-t cuda,nvtx,osrt,cudnn,cublas \
torchrun --nproc-per-node 4 benchmark_sdpa.py
# or with a different number of processes:
# torchrun --nproc-per-node 1 benchmark_sdpa.py
# torchrun --nproc-per-node 2 benchmark_sdpa.py
# torchrun --nproc-per-node 8 benchmark_sdpa.py
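Note that with just the flags above nsys traces the whole run; if the report should only cover the region between cudaProfilerStart() and cudaProfilerStop(), add --capture-range=cudaProfilerApi (and optionally --capture-range-end=stop) to the nsys invocation.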
import torch
import torch.nn.functional as F
from torch.distributed.tensor.experimental import context_parallel
from torch.distributed.tensor.experimental._attention import _cp_options
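These imports set up PyTorch's experimental context-parallel SDPA. A rough sketch of how that API is typically wired up, following the PyTorch context-parallel tutorial (the mesh setup, shapes, and sequence-dim indices below are assumptions, not the benchmark's actual configuration):

import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.experimental import context_parallel

# run under torchrun so each process owns one GPU
dist.init_process_group("nccl")
torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())
mesh = init_device_mesh("cuda", (dist.get_world_size(),))

# q, k, v laid out as [batch, heads, seq, head_dim]; seq is dim 2 for each buffer
q = torch.randn(2, 16, 8192, 64, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)

# context_parallel shards the listed buffers along their sequence dims across the mesh
# and runs SDPA as distributed attention inside the context; the _cp_options import
# above exposes experimental knobs (e.g. load balancing) for this path
with context_parallel(mesh, buffers=[q, k, v], buffer_seq_dims=[2, 2, 2]):
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)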
import torch
import torch.nn.functional as F
import torch.profiler
def benchmark_forward_pass(q, k, v, num_warmup=10, num_timed_runs=20):
    """Benchmarks the forward pass of torch.nn.functional.scaled_dot_product_attention; returns the average time per call in ms."""
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    for _ in range(num_warmup):  # warmup calls are excluded from the timing
        F.scaled_dot_product_attention(q, k, v)
    start_event.record()
    for _ in range(num_timed_runs):
        F.scaled_dot_product_attention(q, k, v)
    end_event.record()
    torch.cuda.synchronize()
    return start_event.elapsed_time(end_event) / num_timed_runs
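A hypothetical call, just to show the usual (batch, heads, seq, head_dim) SDPA layout (shapes and dtype are made up):

q = torch.randn(2, 16, 8192, 64, device="cuda", dtype=torch.bfloat16)
k, v = torch.randn_like(q), torch.randn_like(q)
print(f"sdpa forward: {benchmark_forward_pass(q, k, v):.2f} ms per call")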
import os
from tqdm import tqdm
import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch.distributed.tensor.experimental import context_parallel
from torch.distributed.tensor.experimental._attention import _cp_options
from torch.nn.attention import SDPBackend, sdpa_kernel
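The SDPBackend / sdpa_kernel imports are typically used to pin SDPA to one backend so every timing hits the same kernel; a minimal sketch (the backend choice and tensor shapes are just examples):

q = k = v = torch.randn(2, 16, 8192, 64, device="cuda", dtype=torch.bfloat16)  # placeholder tensors
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):  # restrict SDPA to the flash-attention kernel inside this block
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)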
import functools
import torch
import math
def taylor_seer_approximation(WARMUP_STEPS=1, SKIP_INTERVAL_STEPS=1, compute_step_map=None, n_derivatives=2):
    """
    A decorator that approximates the forward pass of an nn.Module to reduce computation.
    Args:
        WARMUP_STEPS: Number of steps to compute the actual forward pass before starting approximation
    """
"""
test performance and correctness of ring attention vs. single gpu attention
torchrun --nproc-per-node 4 ring_attn.py
using 4 H100s I get:
Rank 0 single gpu attention: 261.78 ms
Rank 0 ring attention: 73.34 ms
"""
import os
import math
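(For reference, 261.78 ms / 73.34 ms is roughly a 3.6x speedup from ring attention across the 4 GPUs.)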
""" | |
test performance and correctness of ulysses parallel attention vs single gpu attention | |
torchrun --nproc-per-node 2 benchmark_attn.py | |
using two H100s I get: | |
Rank 0 single gpu attention: 1698.14 ms | |
Rank 0 ulysses attention: 912.84 ms | |
running pip install para-attn should install everything needed | |
""" |