Aryan (a-r-r-o-w)
a-r-r-o-w / attempt_eager_layernorm_linear_activation.py
Created June 20, 2025 13:13
Attempt to make a fused LayerNorm + Linear + Activation
import pathlib
import torch
import torch._dynamo.config
import triton
import triton.language as tl
torch._dynamo.config.cache_size_limit = 10000
import argparse
import contextlib
import math
import pathlib
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
import argparse
import contextlib
import math
import pathlib
from typing import List, Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
import torch.profiler._utils
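The preview above keeps only the gist's imports. As a rough illustration of the idea in the title, here is a minimal sketch, not the gist's code: it expresses LayerNorm + Linear + activation as one module and leans on torch.compile, which may let Inductor fuse the normalization with the matmul epilogue and the activation (the gist itself appears to write a Triton kernel directly, given its triton imports).

import torch
import torch.nn as nn
import torch.nn.functional as F

class LayerNormLinearAct(nn.Module):
    # LayerNorm -> Linear -> GELU expressed as a single graph so the compiler can fuse it
    def __init__(self, dim: int, out_dim: int) -> None:
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.linear = nn.Linear(dim, out_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return F.gelu(self.linear(self.norm(x)))

if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    module = LayerNormLinearAct(1024, 4096).to(device)
    compiled = torch.compile(module)  # on GPU, Inductor may emit a fused Triton kernel
    x = torch.randn(2, 256, 1024, device=device)
    print(compiled(x).shape)  # torch.Size([2, 256, 4096])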
a-r-r-o-w / ring_attention_when_you_forget_to_do_the_rotations.py
Created June 18, 2025 05:17
Ring attention when you forget to do the rotations
import argparse
import contextlib
import math
import pathlib
from typing import List, Optional, Tuple
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
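In ring attention, K/V shards are rotated around the ranks so every query shard eventually sees every key/value block; the title jokes about skipping those rotations. As a hedged, single-process sketch (not the gist's torch.distributed code), the loop below walks K/V chunks as if they arrived from ring neighbours and merges the partial outputs with log-sum-exp rescaling, which reproduces full attention:

import torch

def attention_chunk(q, k, v):
    # partial attention over one K/V block, plus its log-sum-exp for later merging
    scale = q.shape[-1] ** -0.5
    scores = (q @ k.transpose(-2, -1)) * scale            # [B, H, Sq, Skv]
    lse = torch.logsumexp(scores, dim=-1, keepdim=True)   # [B, H, Sq, 1]
    return torch.softmax(scores, dim=-1) @ v, lse

def ring_attention_local(q, k, v, num_chunks=4):
    out, lse = None, None
    for k_blk, v_blk in zip(k.chunk(num_chunks, dim=-2), v.chunk(num_chunks, dim=-2)):
        blk_out, blk_lse = attention_chunk(q, k_blk, v_blk)
        if out is None:
            out, lse = blk_out, blk_lse
        else:
            new_lse = torch.logaddexp(lse, blk_lse)
            out = out * (lse - new_lse).exp() + blk_out * (blk_lse - new_lse).exp()
            lse = new_lse
    return out

if __name__ == "__main__":
    q = torch.randn(1, 8, 128, 64)
    k = torch.randn(1, 8, 512, 64)
    v = torch.randn(1, 8, 512, 64)
    ref = torch.nn.functional.scaled_dot_product_attention(q, k, v)
    print(torch.allclose(ring_attention_local(q, k, v), ref, atol=1e-5))  # True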
a-r-r-o-w / benchmark_attention.py
Created June 17, 2025 02:11
SDPA benchmark for torch, FlashAttention-2, FlashAttention-3, Transformer Engine, xFormers, SageAttention, and the HF kernels library
#!/usr/bin/env python3
# Benchmarking common shapes for Flux 1024x1024px image + varying text sequence lengths
import functools
import os
import pathlib
import matplotlib.pyplot as plt
import torch
import torch._dynamo.config
import argparse
import contextlib
import math
from typing import List, Optional, Tuple
import numpy as np
import torch
import torch.nn as nn
import torch._inductor.config
import torch._higher_order_ops.auto_functionalize as af
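Only the benchmark's imports survive in this preview. As a reduced sketch that covers just the built-in PyTorch SDPA backends (the gist's full matrix of FlashAttention-2/3, Transformer Engine, xFormers, SageAttention and HF kernels is omitted, and the Flux-like shape of 4096 image tokens + 512 text tokens with 24 heads of dim 128 is an assumption here):

import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from torch.utils.benchmark import Timer

def bench(backend, q, k, v):
    # force one SDPA backend and report the median runtime in milliseconds
    with sdpa_kernel(backend):
        timer = Timer(
            stmt="torch.nn.functional.scaled_dot_product_attention(q, k, v)",
            globals={"torch": torch, "q": q, "k": k, "v": v},
        )
        return timer.blocked_autorange(min_run_time=1.0).median * 1e3

if __name__ == "__main__":
    device, dtype = "cuda", torch.bfloat16
    q = k = v = torch.randn(1, 24, 4096 + 512, 128, device=device, dtype=dtype)
    for backend in (SDPBackend.MATH, SDPBackend.EFFICIENT_ATTENTION, SDPBackend.FLASH_ATTENTION):
        print(backend.name, f"{bench(backend, q, k, v):.3f} ms")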
a-r-r-o-w / attention.py
Created May 30, 2025 04:31
Copy-pastable implementation supporting various attention backends
import contextlib
import functools
import inspect
import os
from enum import Enum
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
FINETRAINERS_ATTN_CHECKS = os.environ.get("FINETRAINERS_ATTN_CHECKS", "0").lower() in ("1", "true", "yes")
FINETRAINERS_ATTN_PROVIDER = os.environ.get("FINETRAINERS_ATTN_PROVIDER", "native").lower()
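The two environment variables above suggest the backend is chosen from FINETRAINERS_ATTN_PROVIDER at dispatch time. A minimal registry-style dispatcher in that spirit (an assumption, not finetrainers' actual implementation) could look like:

import os
from typing import Callable, Dict

import torch
import torch.nn.functional as F

_ATTENTION_PROVIDERS: Dict[str, Callable] = {}

def register_provider(name: str):
    # decorator that records a backend implementation under a provider name
    def decorator(fn: Callable) -> Callable:
        _ATTENTION_PROVIDERS[name] = fn
        return fn
    return decorator

@register_provider("native")
def _native(q, k, v, **kwargs):
    return F.scaled_dot_product_attention(q, k, v, **kwargs)

@register_provider("math")
def _math(q, k, v, **kwargs):
    from torch.nn.attention import SDPBackend, sdpa_kernel
    with sdpa_kernel(SDPBackend.MATH):
        return F.scaled_dot_product_attention(q, k, v, **kwargs)

def attention_dispatch(q, k, v, **kwargs):
    provider = os.environ.get("FINETRAINERS_ATTN_PROVIDER", "native").lower()
    return _ATTENTION_PROVIDERS[provider](q, k, v, **kwargs)

if __name__ == "__main__":
    q = k = v = torch.randn(1, 8, 64, 64)
    print(attention_dispatch(q, k, v).shape)  # torch.Size([1, 8, 64, 64])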
# Reference: https://github.com/arcee-ai/mergekit/blob/488957e8e67c82861ecf63ef761f6bc59122dc74/mergekit/scripts/extract_lora.py
import argparse
import torch
from safetensors.torch import load_file, save_file
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.preferred_linalg_library("cusolver")
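The referenced mergekit script recovers a LoRA from the difference between a fine-tuned and a base checkpoint. The core step is a truncated SVD of each weight delta; a minimal sketch of that idea (the function name and rank split are illustrative, not mergekit's exact code):

import torch

def extract_lora(base_weight: torch.Tensor, tuned_weight: torch.Tensor, rank: int = 16):
    # factor the fine-tuning delta into low-rank up/down matrices via truncated SVD
    delta = (tuned_weight - base_weight).float()
    u, s, vh = torch.linalg.svd(delta, full_matrices=False)
    sqrt_s = s[:rank].sqrt()                  # split the singular values between both factors
    lora_up = u[:, :rank] * sqrt_s            # [out_features, rank]
    lora_down = sqrt_s[:, None] * vh[:rank]   # [rank, in_features]
    return lora_up, lora_down

if __name__ == "__main__":
    base = torch.randn(512, 256)
    tuned = base + 0.01 * torch.randn(512, 4) @ torch.randn(4, 256)  # low-rank perturbation
    up, down = extract_lora(base, tuned, rank=4)
    print(torch.dist(up @ down, tuned - base))  # close to zero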
import torch
import torch.distributed as dist
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.utils import export_to_video
from finetrainers._metadata import ParamId, CPInput, CPOutput
from finetrainers.parallel.ptd import apply_context_parallel
from finetrainers.models.attention_dispatch import attention_provider, attention_dispatch
torch.nn.functional.scaled_dot_product_attention = attention_dispatch
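This last preview swaps finetrainers' attention_dispatch in for torch's built-in SDPA before running WanPipeline under context parallelism. A single-GPU sketch of the same monkeypatch pattern (the finetrainers context-parallel wiring is omitted, and the checkpoint id, prompt, and generation settings are assumptions):

import torch
from diffusers import AutoencoderKLWan, WanPipeline
from diffusers.utils import export_to_video

_original_sdpa = torch.nn.functional.scaled_dot_product_attention

def patched_sdpa(query, key, value, *args, **kwargs):
    # stand-in dispatcher: forwards to the original SDPA; a real dispatcher would select a backend here
    return _original_sdpa(query, key, value, *args, **kwargs)

torch.nn.functional.scaled_dot_product_attention = patched_sdpa

model_id = "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"  # assumed checkpoint id
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanPipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16).to("cuda")

video = pipe(prompt="a cat surfing a wave", num_frames=33, num_inference_steps=30).frames[0]
export_to_video(video, "output.mp4", fps=16)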