This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// BUILD COMMAND: | |
// LD_LIBRARY_PATH=/usr/local/cuda-11.6/extras/CUPTI/lib64:$LD_LIBRARY_PATH nvcc -arch=sm_80 -std=c++17 -o cudagraph cudagraph.cu -lcupti | |
#include <cstddef> | |
#include <cuda_runtime_api.h> | |
#include <cstdio> | |
#include <sys/time.h> | |
#include <iostream> | |
#include <cupti.h> | |
#define N 500000 // tuned such that kernel takes a few microseconds |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch._dynamo | |
import torch._inductor.inductor_prims | |
def fn(values, boundaries): | |
return torch.bucketize(values, boundaries) | |
def fn_ind(values, boundaries): | |
return torch.ops.prims._inductor_bucketize(values, boundaries) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# $CUDA_HOME/bin/nvcc binary_search_cuda.cu -std=c++17 -o binary_search_cuda -O3 # -Wl,-rpath $CUDA_HOME/lib64 | |
$CUDA_HOME/bin/nvcc dense_to_jagged.cu -std=c++17 -o dense_to_jagged -O3 # -Wl,-rpath $CUDA_HOME/lib64 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import triton | |
import triton.language as tl | |
@triton.jit | |
def dense_to_jagged_triton( | |
in_ptr, | |
offsets_ptr, | |
inverse_offsets_ptr, | |
out_ptr, |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import time | |
profiler_events = [] | |
is_enabled = False | |
def _start_fn(name, args = None): | |
if is_enabled: | |
profiler_events.append((name, args, time.time())) | |
def _stop_fn(): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import triton | |
import triton.language as tl | |
@triton.jit | |
def dense_to_jagged_triton( | |
inverse_offsets_ptr, offsets_ptr, dense_ptr, out_ptr0, xnumel, XBLOCK: tl.constexpr | |
): | |
# xnumel = 33106688 | |
xoffset = tl.program_id(0) * XBLOCK |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
def fn(x, y): | |
return torch.cat([x + y, y]).sin() | |
a = torch.ones((1024, 256), dtype=torch.float32) | |
b = torch.ones((1024, 256), dtype=torch.float32) * 2 | |
with torch.profiler.profile(schedule=torch.profiler.schedule(wait=2, warmup=2, repeat=1, active=2), record_shapes=True) as prof: | |
for _ in range(8): |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Traceback (most recent call last): | |
File "/data/users/dberard/scripts/oncall/112489.py", line 8, in <module> | |
fn_opt(*inputs) | |
File "/data/users/dberard/pytorch/torch/_dynamo/eval_frame.py", line 411, in _fn | |
return fn(*args, **kwargs) | |
File "/data/users/dberard/pytorch/torch/_dynamo/eval_frame.py", line 559, in catch_errors | |
return callback(frame, cache_entry, hooks, frame_state) | |
File "/data/users/dberard/pytorch/torch/_dynamo/convert_frame.py", line 687, in _convert_frame | |
result = inner_convert(frame, cache_entry, hooks, frame_state) | |
File "/data/users/dberard/pytorch/torch/_dynamo/convert_frame.py", line 148, in _fn |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/data/users/dberard/scripts/oncall/112494.py:6: UserWarning: An output with one or more elements was resized since it had shape [10, 9, 8], which does not match the required output shape [1, 9, 8]. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). (Triggered internally at ../aten/src/ATen/native/Resize.cpp:28.) | |
x = torch.var(correction=4, dim=0, input=x, keepdim=True, out=torch.rand_like(x)) | |
/data/users/dberard/pytorch/torch/_prims_common/wrappers.py:159: UserWarning: An output with one or more elements was resized since it had shape torch.Size([s0, s1, s2]) which does not match the required output shape {str(shape)}. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/data/users/dberard/scripts/oncall/112502.py:7: UserWarning: An output with one or more elements was resized since it had shape [9, 10], which does not match the required output shape [9]. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). (Triggered internally at ../aten/src/ATen/native/Resize.cpp:28.) | |
x = torch.diag(input=x, diagonal=0,out=torch.rand([9, 10], dtype=torch.float32).to('cpu')) | |
build succeded | |
/data/users/dberard/pytorch/torch/_prims_common/wrappers.py:159: UserWarning: An output with one or more elements was resized since it had shape torch.Size([9, 10]) which does not match the required output shape {str(shape)}. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.re |