Aryan (a-r-r-o-w)
a-r-r-o-w / attempt_eager_layernorm_linear_activation.py
Created June 20, 2025 13:13
Attempt to make a fused LayerNorm + Linear + Activation
import pathlib
import torch
import torch._dynamo.config
import triton
import triton.language as tl
torch._dynamo.config.cache_size_limit = 10000
import argparse
import contextlib
import math
import pathlib
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import argparse
import contextlib
import math
import pathlib
from typing import List, Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
import torch.profiler._utils
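The preview above keeps only the gist's imports. As a rough illustration of the idea in the title, here is a minimal sketch, not the gist's code: it expresses LayerNorm + Linear + activation as one module and leans on torch.compile, which may let Inductor fuse the normalization with the matmul epilogue and the activation (the gist itself appears to write a Triton kernel directly, given its triton imports).

import torch
import torch.nn as nn
import torch.nn.functional as F

class LayerNormLinearAct(nn.Module):
    # LayerNorm -> Linear -> GELU expressed as a single graph so the compiler can fuse it
    def __init__(self, dim: int, out_dim: int) -> None:
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.linear = nn.Linear(dim, out_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.gelu(self.linear(self.norm(x)))

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    module = LayerNormLinearAct(1024, 4096).to(device)
    compiled = torch.compile(module)  # on GPU, Inductor may emit a fused Triton kernel
    x = torch.randn(2, 256, 1024, device=device)
    print(compiled(x).shape)  # torch.Size([2, 256, 4096])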
a-r-r-o-w / ring_attention_when_you_forget_to_do_the_rotations.py
Created June 18, 2025 05:17
Ring attention when you forget to do the rotations
import argparse
import contextlib
import math
import pathlib
from typing import List, Optional, Tuple
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
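In ring attention, K/V shards are rotated around the ranks so every query shard eventually sees every key/value block; the title jokes about skipping those rotations. As a hedged, single-process sketch (not the gist's torch.distributed code), the loop below walks K/V chunks as if they arrived from ring neighbours and merges the partial outputs with log-sum-exp rescaling, which reproduces full attention:

import torch

def attention_chunk(q, k, v):
    # partial attention over one K/V block, plus its log-sum-exp for later merging
    scale = q.shape[-1] ** -0.5
    scores = (q @ k.transpose(-2, -1)) * scale            # [B, H, Sq, Skv]
    lse = torch.logsumexp(scores, dim=-1, keepdim=True)   # [B, H, Sq, 1]
    return torch.softmax(scores, dim=-1) @ v, lse

def ring_attention_local(q, k, v, num_chunks=4):
    out, lse = None, None
    for k_blk, v_blk in zip(k.chunk(num_chunks, dim=-2), v.chunk(num_chunks, dim=-2)):
        blk_out, blk_lse = attention_chunk(q, k_blk, v_blk)
        if out is None:
            out, lse = blk_out, blk_lse
        else:
            new_lse = torch.logaddexp(lse, blk_lse)
            out = out * (lse - new_lse).exp() + blk_out * (blk_lse - new_lse).exp()
            lse = new_lse
    return out

if __name__ == "__main__":
    q = torch.randn(1, 8, 128, 64)
    k = torch.randn(1, 8, 512, 64)
    v = torch.randn(1, 8, 512, 64)
    ref = torch.nn.functional.scaled_dot_product_attention(q, k, v)
    print(torch.allclose(ring_attention_local(q, k, v), ref, atol=1e-5))  # True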
a-r-r-o-w / benchmark_attention.py
Created June 17, 2025 02:11
SDPA benchmark for torch, FlashAttention-2, FlashAttention-3, Transformer Engine, xFormers, SageAttention, and the HF kernels library
#!/usr/bin/env python3
# Benchmarking common shapes for Flux 1024x1024px image + varying text sequence lengths
import functools
import os
import pathlib
import matplotlib.pyplot as plt
import torch
import torch._dynamo.config
import argparse
import contextlib
import math
from typing import List, Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
import torch._inductor.config
import torch._higher_order_ops.auto_functionalize as af
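Only the benchmark's imports survive in this preview. As a reduced sketch that covers just the built-in PyTorch SDPA backends (the gist's full matrix of FlashAttention-2/3, Transformer Engine, xFormers, SageAttention and HF kernels is omitted, and the Flux-like shape of 4096 image tokens + 512 text tokens with 24 heads of dim 128 is an assumption here):

import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from torch.utils.benchmark import Timer

def bench(backend, q, k, v):
    # force one SDPA backend and report the median runtime in milliseconds
    with sdpa_kernel(backend):
        timer = Timer(
            stmt="torch.nn.functional.scaled_dot_product_attention(q, k, v)",
            globals={"torch": torch, "q": q, "k": k, "v": v},
        )
        return timer.blocked_autorange(min_run_time=1.0).median * 1e3

if __name__ == "__main__":
    device, dtype = "cuda", torch.bfloat16
    q = k = v = torch.randn(1, 24, 4096 + 512, 128, device=device, dtype=dtype)
    for backend in (SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.FLASH_ATTENTION):
        print(backend.name, f"{bench(backend, q, k, v):.3f} ms")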
a-r-r-o-w / attention.py
Created May 30, 2025 04:31
Copy-pastable implementation supporting various attention backends
import contextlib
import functools
import inspect
import os
from enum import Enum
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
FINETRAINERS_ATTN_CHECKS = os.environ.get("FINETRAINERS_ATTN_CHECKS", "0").lower() in ("1", "true", "yes")
FINETRAINERS_ATTN_PROVIDER = os.environ.get("FINETRAINERS_ATTN_PROVIDER", "native").lower()
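The two environment variables above suggest the backend is chosen from FINETRAINERS_ATTN_PROVIDER at dispatch time. A minimal registry-style dispatcher in that spirit (an assumption, not finetrainers' actual implementation) could look like:

import os
from typing import Callable, Dict

import torch
import torch.nn.functional as F

_ATTENTION_PROVIDERS: Dict[str, Callable] = {}

def register_provider(name: str):
    # decorator that records a backend implementation under a provider name
    def decorator(fn: Callable) -> Callable:
        _ATTENTION_PROVIDERS[name] = fn
        return fn
    return decorator

@register_provider("native")
def _native(q, k, v, **kwargs):
    return F.scaled_dot_product_attention(q, k, v, **kwargs)

@register_provider("math")
def _math(q, k, v, **kwargs):
    from torch.nn.attention import SDPBackend, sdpa_kernel
    with sdpa_kernel(SDPBackend.MATH):
        return F.scaled_dot_product_attention(q, k, v, **kwargs)

def attention_dispatch(q, k, v, **kwargs):
    provider = os.environ.get("FINETRAINERS_ATTN_PROVIDER", "native").lower()
    return _ATTENTION_PROVIDERS[provider](q, k, v, **kwargs)

if __name__ == "__main__":
    q = k = v = torch.randn(1, 8, 64, 64)
    print(attention_dispatch(q, k, v).shape)  # torch.Size([1, 8, 64, 64])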
# Reference: https://github.com/arcee-ai/mergekit/blob/488957e8e67c82861ecf63ef761f6bc59122dc74/mergekit/scripts/extract_lora.py
import argparse
import torch
from safetensors.torch import load_file, save_file
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.preferred_linalg_library("cusolver")
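The referenced mergekit script recovers a LoRA from the difference between a fine-tuned and a base checkpoint. The core step is a truncated SVD of each weight delta; a minimal sketch of that idea (the function name and rank split are illustrative, not mergekit's exact code):

import torch

def extract_lora(base_weight: torch.Tensor, tuned_weight: torch.Tensor, rank: int = 16):
    # factor the fine-tuning delta into low-rank up/down matrices via truncated SVD
    delta = (tuned_weight - base_weight).float()
    u, s, vh = torch.linalg.svd(delta, full_matrices=False)
    sqrt_s = s[:rank].sqrt()                  # split the singular values between both factors
    lora_up = u[:, :rank] * sqrt_s            # [out_features, rank]
    lora_down = sqrt_s[:, None] * vh[:rank]   # [rank, in_features]
    return lora_up, lora_down

if __name__ == "__main__":
    base = torch.randn(512, 256)
    tuned = base + 0.01 * torch.randn(512, 4) @ torch.randn(4, 256)  # low-rank perturbation
    up, down = extract_lora(base, tuned, rank=4)
    print(torch.dist(up @ down, tuned - base))  # close to zero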
import torch
import torch.distributed as dist
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.utils import export_to_video
from finetrainers._metadata import ParamId, CPInput, CPOutput
from finetrainers.parallel.ptd import apply_context_parallel
from finetrainers.models.attention_dispatch import attention_provider, attention_dispatch
torch.nn.functional.scaled_dot_product_attention = attention_dispatch
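This last preview swaps finetrainers' attention_dispatch in for torch's built-in SDPA before running WanPipeline under context parallelism. A single-GPU sketch of the same monkeypatch pattern (the finetrainers context-parallel wiring is omitted, and the checkpoint id, prompt, and generation settings are assumptions):

import torch
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.utils import export_to_video

_original_sdpa = torch.nn.functional.scaled_dot_product_attention

def patched_sdpa(query, key, value, *args, **kwargs):
    # stand-in dispatcher: forwards to the original SDPA; a real dispatcher would select a backend here
    return _original_sdpa(query, key, value, *args, **kwargs)

torch.nn.functional.scaled_dot_product_attention = patched_sdpa

model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"  # assumed checkpoint id
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")

video = pipe(prompt="a cat surfing a wave", num_frames=33, num_inference_steps=30).frames[0]
export_to_video(video, "output.mp4", fps=16)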