davidberard98’s gists

davidberard98 / cupti_cudagraph_test.cu

Last active March 22, 2023 23:59

	// BUILD COMMAND:
	// LD_LIBRARY_PATH=/usr/local/cuda-11.6/extras/CUPTI/lib64:$LD_LIBRARY_PATH nvcc -arch=sm_80 -std=c++17 -o cudagraph cudagraph.cu -lcupti
	#include <cstddef>
	#include <cuda_runtime_api.h>
	#include <cstdio>
	#include <sys/time.h>
	#include <iostream>
	#include <cupti.h>

	#define N 500000 // tuned such that kernel takes a few microseconds

davidberard98 / torch_bucketize.py

Created June 30, 2023 04:23

	import torch
	import torch._dynamo
	import torch._inductor.inductor_prims

	def fn(values, boundaries):
	return torch.bucketize(values, boundaries)

	def fn_ind(values, boundaries):
	return torch.ops.prims._inductor_bucketize(values, boundaries)

davidberard98 / build.sh

Last active July 7, 2023 23:55

	# $CUDA_HOME/bin/nvcc binary_search_cuda.cu -std=c++17 -o binary_search_cuda -O3 # -Wl,-rpath $CUDA_HOME/lib64
	$CUDA_HOME/bin/nvcc dense_to_jagged.cu -std=c++17 -o dense_to_jagged -O3 # -Wl,-rpath $CUDA_HOME/lib64

davidberard98 / dense_to_jagged_triton.py

Created July 28, 2023 20:59

davidberard98 / record_function_benchmark.py

Last active August 13, 2023 01:25

	import torch
	import time

	profiler_events = []
	is_enabled = False
	def _start_fn(name, args = None):
	if is_enabled:
	profiler_events.append((name, args, time.time()))

	def _stop_fn():

davidberard98 / jagged_move.py

Created August 29, 2023 17:24

	import torch
	import triton
	import triton.language as tl

	@triton.jit
	def dense_to_jagged_triton(
	inverse_offsets_ptr, offsets_ptr, dense_ptr, out_ptr0, xnumel, XBLOCK: tl.constexpr
	):
	# xnumel = 33106688
	xoffset = tl.program_id(0) * XBLOCK

davidberard98 / use_profiler.py

Created September 14, 2023 00:07

	import torch

	def fn(x, y):
	return torch.cat([x + y, y]).sin()

	a = torch.ones((1024, 256), dtype=torch.float32)
	b = torch.ones((1024, 256), dtype=torch.float32) * 2

	with torch.profiler.profile(schedule=torch.profiler.schedule(wait=2, warmup=2, repeat=1, active=2), record_shapes=True) as prof:
	for _ in range(8):

davidberard98 / 112489_trace.txt

Last active October 31, 2023 21:10

	Traceback (most recent call last):
	File "/data/users/dberard/scripts/oncall/112489.py", line 8, in <module>
	fn_opt(*inputs)
	File "/data/users/dberard/pytorch/torch/_dynamo/eval_frame.py", line 411, in _fn
	return fn(*args, **kwargs)
	File "/data/users/dberard/pytorch/torch/_dynamo/eval_frame.py", line 559, in catch_errors
	return callback(frame, cache_entry, hooks, frame_state)
	File "/data/users/dberard/pytorch/torch/_dynamo/convert_frame.py", line 687, in _convert_frame
	result = inner_convert(frame, cache_entry, hooks, frame_state)
	File "/data/users/dberard/pytorch/torch/_dynamo/convert_frame.py", line 148, in _fn

davidberard98 / 112494_trace.txt

Last active October 31, 2023 21:23

	/data/users/dberard/scripts/oncall/112494.py:6: UserWarning: An output with one or more elements was resized since it had shape [10, 9, 8], which does not match the required output shape [1, 9, 8]. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). (Triggered internally at ../aten/src/ATen/native/Resize.cpp:28.)
	x = torch.var(correction=4, dim=0, input=x, keepdim=True, out=torch.rand_like(x))
	/data/users/dberard/pytorch/torch/_prims_common/wrappers.py:159: UserWarning: An output with one or more elements was resized since it had shape torch.Size([s0, s1, s2]) which does not match the required output shape {str(shape)}. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0).

davidberard98 / 112502_trace.txt

Last active October 31, 2023 21:31

	/data/users/dberard/scripts/oncall/112502.py:7: UserWarning: An output with one or more elements was resized since it had shape [9, 10], which does not match the required output shape [9]. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.resize_(0). (Triggered internally at ../aten/src/ATen/native/Resize.cpp:28.)
	x = torch.diag(input=x, diagonal=0,out=torch.rand([9, 10], dtype=torch.float32).to('cpu'))
	build succeded
	/data/users/dberard/pytorch/torch/_prims_common/wrappers.py:159: UserWarning: An output with one or more elements was resized since it had shape torch.Size([9, 10]) which does not match the required output shape {str(shape)}. This behavior is deprecated, and in a future PyTorch release outputs will not be resized unless they have zero elements. You can explicitly reuse an out tensor t by resizing it, inplace, to zero elements with t.re

David Berard davidberard98