cloneofsimo / test.py
Created December 29, 2024 15:44
Is MFU correlated with watt usage in practice?
#!/usr/bin/env python
import torch
import time
import random
import numpy as np
import multiprocessing
from multiprocessing import Process, Manager, Event
import plotly.express as px
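The preview above stops at the imports. A minimal sketch of the experiment the title suggests, assuming a peak-FLOPs figure (989e12, H100 BF16 dense) and power sampling via nvidia-smi; these choices are illustrative, not from the gist body:

import subprocess
import time
import torch

def measure_mfu_and_watts(n=8192, iters=50, peak_flops=989e12):
    # Time a repeated large GEMM and convert achieved FLOP/s to MFU.
    a = torch.randn(n, n, device='cuda', dtype=torch.bfloat16)
    b = torch.randn(n, n, device='cuda', dtype=torch.bfloat16)
    torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(iters):
        a @ b
    torch.cuda.synchronize()
    mfu = (2 * n**3 * iters / (time.time() - t0)) / peak_flops
    # Sample instantaneous board power (watts) from nvidia-smi.
    watts = float(subprocess.check_output(
        ['nvidia-smi', '--query-gpu=power.draw',
         '--format=csv,noheader,nounits']).decode().split()[0])
    return mfu, watts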
cloneofsimo / vis.py
Created December 5, 2024 00:57
twodimropevis
import torch

class TwoDimRotary(torch.nn.Module):
    def __init__(self, dim, base=100, h=128, w=128):
        super().__init__()
        # Standard RoPE inverse-frequency schedule over half the channel dim.
        self.inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.h = h
        self.w = w
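The preview cuts off after __init__. A hedged sketch of how 2D rotary tables might be built from that schedule, applying the 1D frequencies to row and column indices separately; the function name and the concatenation layout are illustrative, not the gist's:

import torch

def two_dim_rope_angles(dim, h, w, base=100):
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
    ys = torch.arange(h).float()[:, None] * inv_freq[None, :]  # (h, dim/2)
    xs = torch.arange(w).float()[:, None] * inv_freq[None, :]  # (w, dim/2)
    # (h, w, dim) grid: first half of channels rotates with y, second with x.
    ang = torch.cat([ys[:, None, :].expand(h, w, -1),
                     xs[None, :, :].expand(h, w, -1)], dim=-1)
    return ang.cos(), ang.sin()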
cloneofsimo / runner.py
Last active November 14, 2024 20:47
Orthogonal weight update
https://x.com/jxbz/status/1857145985480438073
import torch

def polar_factor_newton_schulz(M, max_iter=50):
    # Newton-Schulz iteration for the orthogonal polar factor of M.
    # Frobenius normalization keeps singular values in (0, 1], inside the
    # iteration's basin of convergence (spectral norm < sqrt(3)).
    M_t = M / M.norm(p='fro')
    for _ in range(max_iter):
        M_t = 1.5 * M_t - 0.5 * M_t @ M_t.T @ M_t
    return M_t
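A hedged usage sketch in the spirit of the linked post: replace the raw gradient with its polar factor so each step moves along the nearest orthogonal matrix. The learning rate and in-place update are illustrative, not from the gist:

def orthogonal_update_(weight, grad, lr=1e-2, max_iter=50):
    # Step along the orthogonal polar factor of the gradient.
    with torch.no_grad():
        weight -= lr * polar_factor_newton_schulz(grad, max_iter=max_iter)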
cloneofsimo / syevjBatched.py
Created October 31, 2024 09:43
extended_syevjBatched torch
# batch_eigendecomp.py
import torch
from torch.utils.cpp_extension import load_inline
import argparse
import os
import shutil

def clear_cuda_cache():
    # Remove stale torch_extensions builds so load_inline recompiles cleanly.
    cache_path = os.path.expanduser('~/.cache/torch_extensions')
    if os.path.exists(cache_path):
        shutil.rmtree(cache_path)
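For reference (not part of the gist), PyTorch already ships a batched Hermitian eigensolver; a hand-rolled syevjBatched binding is mainly a route to cuSOLVER's Jacobi solver and its tolerance/sweep knobs:

import torch

A = torch.randn(64, 32, 32, device='cuda')
A = A + A.transpose(-1, -2)              # symmetrize each matrix in the batch
eigvals, eigvecs = torch.linalg.eigh(A)  # batched eigendecomposition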
cloneofsimo / watch_setup.sh
Created October 25, 2024 22:18
latex-watcher
#!/bin/bash
# Install required packages if not present
check_and_install_dependencies() {
    local packages=("inotify-tools" "texlive" "texlive-latex-extra" "biber")
    echo "Checking and installing dependencies..."
    for package in "${packages[@]}"; do
        # dpkg -s exits nonzero when the package is not installed; this is
        # more robust than grepping dpkg -l's column-aligned output.
        if ! dpkg -s "$package" >/dev/null 2>&1; then
            sudo apt-get install -y "$package"
        fi
    done
}
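The watch loop itself is not shown in the preview. A minimal sketch of what the setup presumably drives, using inotifywait from the inotify-tools dependency above; the main.tex target and the build commands are assumptions:

watch_and_build() {
    # Rebuild whenever anything in the source tree changes.
    while inotifywait -e modify,create,move -r .; do
        pdflatex -interaction=nonstopmode main.tex && biber main
    done
}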
cloneofsimo / unit_activation_reinitializer.py
Created October 15, 2024 10:41
Unit-Scale Activation Initialization by Brute Force search
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import numpy as np
import math
def compute_activation_std(model, dataset, device='cpu', batch_size=32, num_workers=0, layer_names=None):
activations = {}
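The preview truncates just after activations = {}. A hedged completion sketch, assuming forward hooks over a single batch and per-layer output standard deviation; the name and details below are illustrative, not the gist's body:

def compute_activation_std_sketch(model, dataset, device='cpu', batch_size=32,
                                  num_workers=0, layer_names=None):
    activations = {}
    def hook(name):
        def fn(module, inputs, output):
            # Record the std of this layer's output activations.
            activations[name] = output.detach().float().std().item()
        return fn
    handles = [m.register_forward_hook(hook(n))
               for n, m in model.named_modules()
               if layer_names is None or n in layer_names]
    loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                         num_workers=num_workers)
    x, _ = next(iter(loader))
    model.to(device)(x.to(device))
    for h in handles:
        h.remove()
    return activations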

import torch
import time

# Enable TF32 matmuls; BF16 reductions default to full precision here and are
# toggled per call inside the benchmark.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False

@torch.no_grad()
def benchmark_gemm(m, k, n, dtype=torch.bfloat16, allow_bf16_reduce=True):
    torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = allow_bf16_reduce
cloneofsimo / infinite_parameterized_fractal.py
Created October 3, 2024 18:03
Parameterized Fractal Triton
import torch
import triton
import triton.language as tl
from triton.language.extra import libdevice
@triton.jit
def fractal_kernel(
zr_ptr, zi_ptr, cr_ptr, ci_ptr, output_ptr,
alpha_ptr, beta_ptr, poly0_ptr, poly1_ptr, poly2_ptr, poly3_ptr, p_ptr, R, max_iter,
H, W,
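The signature is cut off mid-list and the body is not shown. A hedged, self-contained sketch of an escape-time inner loop in Triton; the z <- z^2 + c update stands in for the gist's parameterized polynomial:

import triton
import triton.language as tl

@triton.jit
def escape_time_kernel(zr_ptr, zi_ptr, cr_ptr, ci_ptr, out_ptr,
                       R, max_iter, N, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < N
    zr = tl.load(zr_ptr + offs, mask=mask)
    zi = tl.load(zi_ptr + offs, mask=mask)
    cr = tl.load(cr_ptr + offs, mask=mask)
    ci = tl.load(ci_ptr + offs, mask=mask)
    count = tl.zeros((BLOCK,), dtype=tl.float32)
    for _ in range(max_iter):
        new_zr = zr * zr - zi * zi + cr   # z <- z^2 + c (real part)
        zi = 2 * zr * zi + ci             # imaginary part (uses old zr)
        zr = new_zr
        # Count iterations while |z| stays below the escape radius R.
        count += (zr * zr + zi * zi < R * R).to(tl.float32)
    tl.store(out_ptr + offs, count, mask=mask)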
cloneofsimo / polynomial-sphere-map.md
Last active September 28, 2024 09:35
Does there exist a polynomial map of degree m sending S^n to itself?

Variant of AM-GM for Minimization

When dealing with functions of the form $f(x) = x^a + \frac{1}{x^b}$, a variant of the AM-GM inequality can be used to find the minimum. Specifically, if you have:

$$ f(x) = c_1 \cdot x^a + c_2 \cdot \frac{1}{x^b} $$

The minimum occurs at:

$$ x^{*} = \left( \frac{b \, c_2}{a \, c_1} \right)^{\frac{1}{a+b}} $$
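A quick calculus check agrees with the AM-GM argument: setting

$$ f'(x) = a \, c_1 \, x^{a-1} - \frac{b \, c_2}{x^{b+1}} = 0 $$

gives $x^{a+b} = \frac{b \, c_2}{a \, c_1}$, i.e. the minimizer above (for $a, b, c_1, c_2 > 0$).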