Skip to content

Instantly share code, notes, and snippets.

View shauray8's full-sized avatar

Shauray Singh shauray8

View GitHub Profile
code for sm_120a
.target sm_120a
Function : k_rms_16
.headerflags @"EF_CUDA_ACCELERATORS EF_CUDA_SM120 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM120)"
/*0000*/ LDC R1, c[0x0][0x37c] ; /* 0x0000df00ff017b82 */
/* 0x000fe20000000800 */
/*0010*/ S2R R0, SR_TID.X ; /* 0x0000000000007919 */
/* 0x000e220000002100 */
import cuda.tile as ct
import torch
from math import ceil
def next_pow_of_2(n):
    """Return the smallest power of two that is >= n.

    For any n <= 1 the answer is 1 (the loop body never runs).
    """
    result = 1
    while result < n:
        result <<= 1  # doubling via shift; equivalent to result *= 2
    return result
@ct.kernel(
@shauray8
shauray8 / composition_sum.py
Created September 23, 2025 19:01
teeny tiny gist kernel
import cutlass
import cutlass.cute as cute
@cute.jit
def compose_layouts():
    # Demonstrates CuTe layout composition: builds two (shape, stride)
    # layouts and composes them via cute.composition.
    # NOTE(review): the snippet may be truncated here — the composed
    # layout is bound but nothing after this line is visible.
    # Define two layouts
    layout_a = cute.make_layout((6, 2), stride=(1, 7))
    layout_b = cute.make_layout((3, 2), stride=(2, 3))
    # composition(A, B) applies B's index mapping first, then A's —
    # presumably yielding the functional composition A ∘ B; verify
    # against the CUTLASS CuTe docs.
    composed = cute.composition(layout_a, layout_b)
#include <vector>
#include <chrono>
#include <iostream>
// Function to get current time in seconds
// Return a monotonically increasing timestamp in seconds, suitable for
// measuring elapsed time as the difference of two calls.
//
// Fix: the original used high_resolution_clock, which is not required to
// be steady (it may alias system_clock and jump backward on wall-clock
// adjustments), and whose epoch is unspecified — so its time_since_epoch()
// is not a meaningful "current time". steady_clock is the documented
// correct clock for interval measurement; the returned value is only
// meaningful relative to other get_time() calls in the same process.
double get_time() {
    auto now = std::chrono::steady_clock::now();
    return std::chrono::duration<double>(now.time_since_epoch()).count();
}
# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
@shauray8
shauray8 / bench_tqdm.txt
Created April 9, 2025 20:32
shutil tqdm benchmark
With tqdm took 0.698805 seconds
2062651 function calls (2061588 primitive calls) in 0.752 seconds
Ordered by: internal time
ncalls tottime percall cumtime percall filename:lineno(function)
1000002 0.350 0.000 0.355 0.000 loading_utils.py:41(update)
1000001 0.242 0.000 0.597 0.000 loading_utils.py:32(__iter__)
1 0.102 0.102 0.699 0.699 loading_utils.py:185(with_tqdm_loop)
58 0.005 0.000 0.005 0.000 {built-in method marshal.loads}
Graph break in user code at /workspace/Wan2GP/wan/modules/sage2_core.py:66
Reason: Unsupported: torch.* op returned non-Tensor int call_function <function device_count at 0x795533c18400>
User code traceback:
File "/workspace/Wan2GP/wan/modules/model.py", line 452, in forward
y = self.self_attn( xlist, seq_lens, grid_sizes,freqs)
File "/usr/local/lib/python3.11/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.11/dist-packages/mmgp/offload.py", line 1926, in check_empty_cuda_cache
return previous_method(*args, **kwargs)
File "/workspace/Wan2GP/wan/modules/model.py", line 266, in forward