Skip to content

Instantly share code, notes, and snippets.

from diffusers import StableDiffusionXLPipeline
pipe = StableDiffusionXLPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
).to("cuda")
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
image = pipe(prompt, num_inference_steps=30).images[0]
@scottt
scottt / validate_torch_vroom.py
Created May 26, 2025 01:37
PyTorch Performance Validation
import torch
from torch.nn.functional import scaled_dot_product_attention
from torch.nn.attention import SDPBackend
###############################################################################
# Check for GPU
###############################################################################
if not torch.cuda.is_available():
raise SystemExit("CUDA GPU is not available. Please run on a CUDA-enabled device.")
#!/usr/bin/env python
import os
import sys
import subprocess
import time
from pathlib import Path
# --- Configuration ---
#!/usr/bin/env python
import os
import sys
import subprocess
import platform
import time
import signal
from pathlib import Path
@scottt
scottt / aotriton-nokernel-build.py
Created May 20, 2025 20:15
aotriton build on Windows for gfx1101
#!/usr/bin/env python
import os
import sys
import subprocess
import time
from pathlib import Path
# --- Configuration ---
@scottt
scottt / GNUmakefile
Created May 11, 2025 21:24
aotriton attn_fwd C++ test program
PROGRAMS := $(basename $(wildcard *.cpp))
ROCM_PREFIX := $(HOME)/therock-upstream-output/build/dist/rocm
HIP_CFLAGS := -I$(ROCM_PREFIX)/include
AOTRITON_CFLAGS := -D__HIP_PLATFORM_AMD__ -I$(HOME)/aotriton-output/build/install_dir/include
AOTRITON_LIBS := -L$(HOME)/aotriton-output/build/install_dir/lib -laotriton_v2 -L$(ROCM_PREFIX)/lib -lamdhip64
%: %.cpp
clang++ $(HIP_CFLAGS) $(AOTRITON_CFLAGS) $< $(AOTRITON_LIBS) -o $@
@scottt
scottt / aotriton-build.py
Last active June 6, 2025 22:41
Triton on Windows
#!/usr/bin/env python
import os
import sys
import subprocess
import time
from pathlib import Path
# --- Configuration ---
@scottt
scottt / m1.c
Last active September 19, 2024 06:22
scanf to printf on AArch64
#include <stdio.h>
#include <stdint.h>
#include <sys/mman.h>
int main()
{
mprotect((void *)((unsigned long)main & (~0xfff)), 4096,
PROT_EXEC|PROT_WRITE);
{ // gcc and clang put the "0:", "1:", etc. labels below at different places
char *p;
@scottt
scottt / GNUmakefile
Last active April 5, 2023 19:22
Memory stores cause minor page faults after fork() but not after vfork() on Linux
CFLAGS := -Wall -Og
PROGRAMS := fork-exec-memset vfork-exec-memset
.PHONY: all
all: $(PROGRAMS)
.PHONY: test
test: all
./test-fork-exec-memset
# openssl speed -elapsed -evp aes-128-gcm
You have chosen to measure elapsed time instead of user CPU time.
Doing aes-128-gcm for 3s on 16 size blocks: 55740342 aes-128-gcm's in 3.00s
Doing aes-128-gcm for 3s on 64 size blocks: 36792402 aes-128-gcm's in 3.00s
Doing aes-128-gcm for 3s on 256 size blocks: 19513330 aes-128-gcm's in 3.00s
Doing aes-128-gcm for 3s on 1024 size blocks: 6431553 aes-128-gcm's in 3.00s
Doing aes-128-gcm for 3s on 8192 size blocks: 943229 aes-128-gcm's in 3.00s
Doing aes-128-gcm for 3s on 16384 size blocks: 478375 aes-128-gcm's in 3.00s
OpenSSL 1.1.1c 28 May 2019
built on: Wed May 29 17:53:30 2019 UTC