Skip to content

Instantly share code, notes, and snippets.

//
// Generated by LLVM NVPTX Back-End
//
.version 8.8
.target sm_100a
.address_size 64
// .globl fused_moe_kernel // -- Begin function fused_moe_kernel
.extern .shared .align 16 .b8 global_smem[];
@malfet
malfet / skip_delta.py
Last active May 29, 2026 14:44
Computes delta in number of skipped tests
"""Per-commit pass/skip/flaky/fail counts across the last N commits.
Usage: python skip_delta.py [<sha>] [-n 5] [--workflow trunk]
[--job-filter 'jammy.*cuda'] [--repo pytorch/pytorch]
Requires: curl_cffi, gh CLI.
"""
import argparse
import gzip
@malfet
malfet / fetch_commit_statuses.py
Created May 7, 2026 15:29
Fetch commit statuses from HUD
"""Fetch the latest N commits' job statuses from hud.pytorch.org."""
import argparse
import gzip
import json
import sys
from curl_cffi import requests
def fetch_hud(owner: str, repo: str, branch: str, count: int) -> dict:
@malfet
malfet / tune_mpp.py
Last active April 30, 2026 15:24
Tune MPP matmul2d tile sizes for MPS F.linear
"""Tune MPP matmul2d tile sizes across dtypes and shapes."""
import time
import torch
import torch.nn.functional as F
torch.set_grad_enabled(False)
WARMUP = 20
REPEAT = 200
BATCH = 10
#!/usr/bin/env python3
"""
Repack torchaudio wheels as cp310-abi3 (stable ABI) wheels.
Downloads the cp310 wheel for each platform from PyPI, verifies all native
extensions use the stable ABI, patches the WHEEL tag (adds PEP 427 Build
number) and METADATA (adds torch>=<version> dependency), and repacks.
RECORD regeneration and repacking are handled by auditwheel's InWheelCtx.
Usage:
"""
Demonstrate the NEON overread in F.interpolate by placing uint8 tensor data
right before an unmapped guard page.
vld3_u8 in the block-of-4 loop reads 24 bytes (8 pixels × 3 channels)
but only needs 12 bytes (4 pixels × 3 channels). If the extra 12 bytes
cross into an unmapped page, we get SIGBUS.
"""
import ctypes
#!/usr/bin/env python3
"""
Fetch PyTorch outside collaborators and infer company affiliation from commit emails.
Requires: `gh` CLI authenticated with appropriate permissions.
Usage: python fetch_collaborator_affiliations.py [--repo pytorch/pytorch] [--max-commits 100]
Caches results in pytorch_collab_emails.json to avoid re-fetching known collaborators.
"""
#!/usr/bin/env python3
"""Print all ops with MPS skips for non-contiguous input."""
import unittest
from torch.testing._internal.common_methods_invocations import op_db
from torch.testing._internal.opinfo.core import DecorateInfo
def main():
ops_with_mps_noncontig_skip = []
#include <stdio.h>
#include <chrono>
__global__ void noop() { }
int main(int argc, const char *argv[]) {
cudaDeviceProp prop;
auto rc = cudaGetDeviceProperties(&prop, 0);
printf("Running on %s sm%d.%d multiProcessorCount = %d maxBlocksPerMultiProcessor = %d maxThreadsPerBlock = %d\n",
prop.name, prop.major, prop.minor, prop.multiProcessorCount, prop.maxBlocksPerMultiProcessor, prop.maxThreadsPerBlock);
a_cpp = """#include <iostream>
namespace foo::bar {
inline namespace baz {
int inc(int x) {
std::cout << "do inc from lib_a" << std::endl;
return x + 1;
}
} // inline namespace baz
void do_a(int x) {