Add torch.cuda.cudart().cudaProfilerStart() and torch.cuda.cudart().cudaProfilerStop() calls where profiling should start and stop, then launch the profiler with:

CUDA_VISIBLE_DEVICES=0,1,2,3 \
nsys profile \
    -w true \
    -t cuda,nvtx,osrt,cudnn,cublas \
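The command above is truncated here; a typical completion (my assumptions: the capture window is keyed to the cudaProfilerStart/Stop calls above, and the report name and target script are placeholders) looks like:

CUDA_VISIBLE_DEVICES=0,1,2,3 \
nsys profile \
    -w true \
    -t cuda,nvtx,osrt,cudnn,cublas \
    --capture-range=cudaProfilerApi \
    --capture-range-end=stop \
    -o report \
    torchrun --nproc-per-node 4 your_script.py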
| """ | |
| test performance and correctness of ulysses parallel attention vs single gpu attention | |
| torchrun --nproc-per-node 2 benchmark_attn.py | |
| using two H100s I get: | |
| Rank 0 single gpu attention: 1698.14 ms | |
| Rank 0 ulysses attention: 912.84 ms | |
| running pip install para-attn should install everything needed | |
| """ |
| """ | |
| test performance and correctness of ring attention vs. single gpu attention | |
| torchrun --nproc-per-node 4 ring_attn.py | |
| using 4 H100s I get: | |
| Rank 0 single gpu attention: 261.78 ms | |
| Rank 0 ring attention: 73.34 ms | |
| """ | |
import functools
import math
import os

import torch
def taylor_seer_approximation(warmup_steps=1, skip_interval_steps=1, compute_step_map=None, n_derivatives=2):
    """
    A decorator that approximates the forward pass of an nn.Module to reduce computation.

    Args:
        warmup_steps: Number of steps to compute the actual forward pass before starting approximation
        skip_interval_steps: Number of steps to approximate between full computations
        compute_step_map: Optional explicit map of which steps run the real forward pass
        n_derivatives: Order of the Taylor expansion used for the approximation
    """
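# A minimal sketch of the extrapolation idea behind such a decorator (assumed
# mechanics, not necessarily this decorator's internals): keep the last few real
# outputs, estimate derivatives by finite differences, and evaluate a Taylor
# expansion to predict outputs on skipped steps.
import math
import torch

def taylor_extrapolate(history, elapsed_steps, n_derivatives=2):
    """history: real outputs from consecutive compute steps, oldest first."""
    # Finite-difference estimates of f, f', f'', ... at the latest compute step.
    derivatives, diffs = [history[-1]], list(history)
    for _ in range(n_derivatives):
        if len(diffs) < 2:
            break
        diffs = [b - a for a, b in zip(diffs, diffs[1:])]
        derivatives.append(diffs[-1])
    # f(t + dt) ~= sum_k f^(k)(t) * dt^k / k!
    out = torch.zeros_like(history[-1])
    for k, d in enumerate(derivatives):
        out = out + d * (elapsed_steps ** k / math.factorial(k))
    return out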
import os

from tqdm import tqdm

import torch
import torch.distributed as dist
import torch.nn.functional as F
from torch.distributed.tensor.experimental import context_parallel
from torch.distributed.tensor.experimental._attention import _cp_options
from torch.nn.attention import SDPBackend, sdpa_kernel
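# A minimal sketch of how these pieces typically fit together (the mesh setup,
# function, and shapes are assumptions, not this file's exact code): context_parallel
# shards the sequence dimension of the listed buffers across the mesh and routes SDPA
# through the experimental ring-attention path.
from torch.distributed.device_mesh import init_device_mesh

_cp_options.enable_load_balance = False  # plain ring order, no head/tail balancing

def context_parallel_sdpa(q, k, v, world_size):
    mesh = init_device_mesh("cuda", (world_size,))
    with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
        # q, k, v are [B, H, S, D]; dim 2 is the sequence dimension being sharded
        with context_parallel(mesh, buffers=(q, k, v), buffer_seq_dims=(2, 2, 2)):
            return F.scaled_dot_product_attention(q, k, v, is_causal=True)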
import torch
import torch.nn.functional as F
import torch.profiler

def benchmark_forward_pass(q, k, v, num_warmup=10, num_timed_runs=20):
    """Benchmark F.scaled_dot_product_attention; returns average latency in ms."""
    for _ in range(num_warmup):  # warm up so one-time init does not skew the timing
        F.scaled_dot_product_attention(q, k, v)
    torch.cuda.synchronize()
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    start_event.record()
    for _ in range(num_timed_runs):
        F.scaled_dot_product_attention(q, k, v)
    end_event.record()
    torch.cuda.synchronize()
    return start_event.elapsed_time(end_event) / num_timed_runs
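# Hypothetical usage (shapes and dtype are my choice, not from the original file):
if __name__ == "__main__":
    q = torch.randn(2, 32, 8192, 128, device="cuda", dtype=torch.bfloat16)
    k, v = torch.randn_like(q), torch.randn_like(q)
    print(f"sdpa forward: {benchmark_forward_pass(q, k, v):.2f} ms")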
# Usage, varying the degree of context parallelism:
#   torchrun --nproc-per-node 1 benchmark_sdpa.py
#   torchrun --nproc-per-node 2 benchmark_sdpa.py
#   torchrun --nproc-per-node 4 benchmark_sdpa.py
#   torchrun --nproc-per-node 8 benchmark_sdpa.py
import torch
import torch.nn.functional as F
from torch.distributed.tensor.experimental import context_parallel
from torch.distributed.tensor.experimental._attention import _cp_options
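# Sketch of the distributed setup a torchrun script like this needs (standard
# torchrun environment variables; not necessarily this file's exact code):
import os
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
dist.init_process_group("nccl")  # torchrun supplies RANK / WORLD_SIZE via env vars
mesh = init_device_mesh("cuda", (dist.get_world_size(),))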
# mypy: allow-untyped-decorators
# mypy: allow-untyped-defs
import functools
import itertools
import logging
import operator
from collections import Counter, defaultdict
from typing import Any, Callable, Optional, TypeVar, Union
from typing_extensions import ParamSpec