Skip to content

Instantly share code, notes, and snippets.

View hvaara's full-sized avatar
:shipit:

Roy Hvaara hvaara

:shipit:
View GitHub Profile
2024-11-28 05:45:11.621 python[37469:34111015] mps_3d_convolution:1:1:1:1:1:1:0:1:1:1:Contiguous:bf16[1,128,4,15,22]:bf16[512,128,3,3,3]:1:512
2024-11-28 05:45:11.647 python[37469:34111015] mps_3d_convolution:1:1:1:1:1:1:0:1:1:1:Contiguous:bf16[1,512,4,15,22]:bf16[512,512,3,3,3]:1:512
2024-11-28 05:45:11.649 python[37469:34111015] mps_3d_convolution:1:1:1:1:1:1:0:1:1:1:Contiguous:bf16[1,512,4,15,22]:bf16[512,512,3,3,3]:1:512
2024-11-28 05:45:11.652 python[37469:34111015] mps_3d_convolution:1:1:1:1:1:1:0:1:1:1:Contiguous:bf16[1,512,4,15,22]:bf16[512,512,3,3,3]:1:512
2024-11-28 05:45:11.652 python[37469:34111015] mps_3d_convolution:1:1:1:1:1:1:0:1:1:1:Contiguous:bf16[1,512,4,15,22]:bf16[512,512,3,3,3]:1:512
2024-11-28 05:45:11.652 python[37469:34111015] mps_3d_convolution:1:1:1:1:1:1:0:1:1:1:Contiguous:bf16[1,512,4,15,22]:bf16[512,512,3,3,3]:1:512
2024-11-28 05:45:11.653 python[37469:34111015] mps_3d_convolution:1:1:1:1:1:1:0:1:1:1:Contiguous:bf16[1,512,4,15,22]:bf16[512,512,3,3,3]:1:512
2024-11-28 05:45:11.653
@hvaara
hvaara / gist:e3002c3258c689f9b6a2dd56fb15317c
Created September 16, 2024 01:04
Tinygrad fused kernel example
METAL_XCODE=1 DISABLE_COMPILER_CACHE=1 DEBUG=4 python3 -c "from tinygrad import Tensor;
N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N);
c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2);
print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
opened device METAL from pid:15248
opened device NPY from pid:15248
*** CUSTOM 1 custom_random mem 0.00 GB
*** CUSTOM 2 custom_random mem 0.01 GB
TENSOR CORES [(1, 1024, 1)] [(0, 1024, 1024)] WMMA_8_8_8_float_float
r_32_8_2_4_2_2_4_128_8_2_4_4
@hvaara
hvaara / gist:9db8389c15cf9a665a090ba85926a18d
Created September 12, 2024 23:47
Example collect_env.py output
   ~/dev/pytorch/pytorch    main *12 !1  git branch | grep -e '^*'  ✔  8s   pytorchdev   01:45:33 
* main
   ~/dev/pytorch/pytorch    main *12 !1  python torch/utils/collect_env.py  INT ✘  pytorchdev   01:45:22 
Collecting environment information...
PyTorch version: 2.5.0a0+gitdd47f6f
Is debug build: False
CUDA used to build PyTorch: None
ROCM used to build PyTorch: N/A
OS: macOS 14.6.1 (arm64)
python test/test_modules.py -v -k test_memory_format_nn_BatchNorm2d_
test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float32 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float32) ... ok
test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float64 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float64) ... ok
test_memory_format_nn_BatchNorm2d_train_mode_cpu_float32 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_train_mode_cpu_float32) ... ok
test_memory_format_nn_BatchNorm2d_train_mode_cpu_float64 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_train_mode_cpu_float64) ... ok
test_memory_format_nn_BatchNorm2d_eval_mode_mps_float16 (__main__.TestModuleMPS.test_memory_format_nn_BatchNorm2d_eval_mode_mps_float16) ... ok
test_memory_format_nn_BatchNorm2d_eval_mode_mps_float32 (__main__.TestModuleMPS.test_memory_format_nn_BatchNorm2d_eval_mode_mps_float32) ... ok
test_memory_format_nn_BatchNorm2d_train_mode_mps_float16 (__main__.TestModuleMPS.test_m
python test/test_modules.py -v -k test_memory_format_nn_BatchNorm2d_
test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float32 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float32) ... ok
test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float64 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float64) ... ok
test_memory_format_nn_BatchNorm2d_train_mode_cpu_float32 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_train_mode_cpu_float32) ... ok
test_memory_format_nn_BatchNorm2d_train_mode_cpu_float64 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_train_mode_cpu_float64) ... ok
test_memory_format_nn_BatchNorm2d_eval_mode_mps_float16 (__main__.TestModuleMPS.test_memory_format_nn_BatchNorm2d_eval_mode_mps_float16) ... FAIL
test_memory_format_nn_BatchNorm2d_eval_mode_mps_float32 (__main__.TestModuleMPS.test_memory_format_nn_BatchNorm2d_eval_mode_mps_float32) ... FAIL
test_memory_format_nn_BatchNorm2d_train_mode_mps_float16 (__main__.TestModuleMPS.te
test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float32 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float32) ... ok
test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float64 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float64) ... ok
test_memory_format_nn_BatchNorm2d_train_mode_cpu_float32 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_train_mode_cpu_float32) ... ok
test_memory_format_nn_BatchNorm2d_train_mode_cpu_float64 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_train_mode_cpu_float64) ... ok
test_memory_format_nn_BatchNorm2d_eval_mode_mps_float16 (__main__.TestModuleMPS.test_memory_format_nn_BatchNorm2d_eval_mode_mps_float16) ... FAIL
test_memory_format_nn_BatchNorm2d_eval_mode_mps_float32 (__main__.TestModuleMPS.test_memory_format_nn_BatchNorm2d_eval_mode_mps_float32) ... FAIL
test_memory_format_nn_BatchNorm2d_train_mode_mps_float16 (__main__.TestModuleMPS.test_memory_format_nn_BatchNorm2d_train_mode_mps_float16) ... FAIL
test
python test/test_modules.py -v -k test_memory_format_nn_BatchNorm2d_
test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float32 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float32) ... ok
test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float64 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_eval_mode_cpu_float64) ... ok
test_memory_format_nn_BatchNorm2d_train_mode_cpu_float32 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_train_mode_cpu_float32) ... ok
test_memory_format_nn_BatchNorm2d_train_mode_cpu_float64 (__main__.TestModuleCPU.test_memory_format_nn_BatchNorm2d_train_mode_cpu_float64) ... ok
test_memory_format_nn_BatchNorm2d_eval_mode_mps_float16 (__main__.TestModuleMPS.test_memory_format_nn_BatchNorm2d_eval_mode_mps_float16) ... FAIL
test_memory_format_nn_BatchNorm2d_eval_mode_mps_float32 (__main__.TestModuleMPS.test_memory_format_nn_BatchNorm2d_eval_mode_mps_float32) ... FAIL
test_memory_format_nn_BatchNorm2d_train_mode_mps_float16 (__main__.TestModuleMPS.te
git log v2.4.0..HEAD --oneline
d74039f7010 (HEAD -> repro-24-134580, origin/repro-24-134580) Repro case for #134580
ccdbe084a9e Skip memory_format tests
49f0d3f1111 Update common_modules.py
630cc4ea8b2 Update test_nn.py
b702a483965 Use newer `toAccumulateType` signature in Normalization.cpp
$ git diff v2.4.0
diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp
index e9e7c001837..16ada4cead5 100644
@hvaara
hvaara / __init__.pyi
Created August 21, 2024 02:05
mlx core/__init__.pyi
from collections.abc import Callable, Sequence
import enum
import types
from typing import Annotated, overload
import numpy
from numpy.typing import ArrayLike
from . import (
distributed as distributed,
High watermark memory allocation limit: 163.20 GB
Low watermark memory allocation limit: 134.40 GB
Initializing private heap allocator on unified device memory of size 96.00 GB
BlitCopySync: CPU:Float[3, 224, 224] --> MPS(buf#1:1):Float[3, 224, 224] (len=588.00 KB, gpu=9.644 ms, cpu=4.767 ms)
BlitCopySync: CPU:Float[64, 3, 7, 7] --> MPS(buf#2:1):Float[64, 3, 7, 7] (len=36.75 KB, gpu=1.555 ms, cpu=0.043 ms)
BlitCopySync: CPU:Float[64] --> MPS(buf#3:1):Float[64] (len=256 bytes, gpu=1.491 ms, cpu=0.031 ms)
BlitCopySync: CPU:Float[64] --> MPS(buf#4:1):Float[64] (len=256 bytes, gpu=0.586 ms, cpu=0.020 ms)
BlitCopySync: CPU:Float[64] --> MPS(buf#5:1):Float[64] (len=256 bytes, gpu=0.564 ms, cpu=0.020 ms)
BlitCopySync: CPU:Float[64] --> MPS(buf#6:1):Float[64] (len=256 bytes, gpu=0.518 ms, cpu=0.024 ms)
BlitCopySync: CPU:Long[] --> MPS(buf#7:1):Long[] (len=8 bytes, gpu=0.409 ms, cpu=0.232 ms)