W1008 09:22:11.858000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] ValueError: Incorrect number of arguments passed to kernel
W1008 09:22:11.870000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] Encountered an exception in identify_mutated_tensors, assuming every input is mutated
W1008 09:22:11.870000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] Traceback (most recent call last):
W1008 09:22:11.870000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] File "/home/cdhernandez/local/pytorch/torch/_higher_order_ops/triton_kernel_wrap.py", line 482, in identify_mutated_tensors
W1008 09:22:11.870000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] ttir_module, ordered_tensor_names = generate_ttir(kernel, kwargs)
W1008 09:22:11.870000 1289935 torch/_higher_order_ops/triton_kernel_wrap.py:503] [0/0] File "/home/cdhernandez/local/pytorch/torch/_higher_order_ops/triton_kernel_wrap.py", line 139, in generate_ttir
W1008 09:22
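For context: this warning comes from torch.compile's handling of user-defined Triton kernels. When generate_ttir fails (here because the kernel launch had the wrong number of arguments), identify_mutated_tensors gives up and conservatively treats every input as mutated. Below is a minimal sketch of the setup that exercises this path; the kernel is hypothetical, and the launch as written is correct, with the failing variant noted in the comment.

import torch
import triton
import triton.language as tl

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    offsets = tl.program_id(0) * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)

@torch.compile
def add(x, y):
    out = torch.empty_like(x)
    grid = (triton.cdiv(x.numel(), 1024),)
    # dropping or adding an argument in this launch is what makes
    # generate_ttir raise "Incorrect number of arguments passed to kernel"
    add_kernel[grid](x, y, out, x.numel(), BLOCK_SIZE=1024)
    return out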
#OMP_NUM_THREADS=16 CUDA_VISIBLE_DEVICES=0 ipython3 benchmark_triton.py # select the number of threads based on your machine
#You can change matmul_dtype to GEMM, GEMV, or AUTO
#Note: bfloat16 is only supported in GEMM mode with float32 accumulation
#################################################################################################################################
import torch
import numpy as np
device = 'cuda:0'
compute_dtype = torch.float16
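The preview stops early; below is a rough sketch of the GEMV-vs-GEMM style timing such a script performs. The shapes and the torch.utils.benchmark harness are assumptions, not the gist's actual code.

from torch.utils import benchmark

a = torch.randn(1, 4096, device=device, dtype=compute_dtype)    # GEMV-shaped (single row)
b = torch.randn(4096, 4096, device=device, dtype=compute_dtype)
print(benchmark.Timer(stmt="a @ b", globals={"a": a, "b": b}).blocked_autorange())

a = torch.randn(2048, 4096, device=device, dtype=compute_dtype) # GEMM-shaped
print(benchmark.Timer(stmt="a @ b", globals={"a": a, "b": b}).blocked_autorange())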
# README EVALUATIONS
export CHECKPOINT_PATH=../../../checkpoints # path to the checkpoints folder
export MODEL_REPO=meta-llama/Llama-2-7b-chat-hf
python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth #12.212
python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8dq --compile #12.262
python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int8wo #12.204
python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization fp6 --compile --precision float16 #12.369
python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64-hqq #12.825717540084083
python eval.py --checkpoint_path $CHECKPOINT_PATH/$MODEL_REPO/model.pth --quantization int4wo-64 #12.87233037343588
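For orientation, the int8wo variant above is weight-only quantization. A minimal sketch of the symmetric per-channel scheme behind it (illustrative only, not gpt-fast's implementation):

import torch
import torch.nn.functional as F

def int8wo_quantize(w):
    # symmetric per-output-channel scales: max |w| in each row maps to 127
    scales = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-5) / 127
    w_int8 = torch.clamp(torch.round(w / scales), -127, 127).to(torch.int8)
    return w_int8, scales

def int8wo_linear(x, w_int8, scales, bias=None):
    # weight-only: dequantize the weight on the fly, activations stay in float
    return F.linear(x, w_int8.to(x.dtype) * scales.to(x.dtype), bias)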
@HDCharles
HDCharles / multi_tensor_test.py
Last active August 21, 2024 00:16
MultiTensor for GPTQ
import torch
import torch.nn as nn
from torch.utils._pytree import tree_flatten, tree_unflatten
import gc

class MultiTensor(torch.Tensor):
    @staticmethod
    def __new__(cls, input, **kwargs):
        if isinstance(input, (list, tuple)):
            input = input[0]
        kwargs["dtype"] = kwargs.get("dtype", input.dtype)
@HDCharles
HDCharles / eval_script.py
Created June 4, 2024 20:21
doing lm_eval's work
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from lm_eval.models.huggingface import HFLM
from lm_eval.evaluator import evaluate
from lm_eval.tasks import get_task_dict
path_to_hf_checkpoint = "/home/cdhernandez/local/gpt-fast/checkpoints/meta-llama/Meta-Llama-3-8B"
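The preview cuts off at the checkpoint path; the lm_eval plumbing such a script drives typically looks like the following. The task choice is an example, and lm_eval's API has shifted between versions.

model = AutoModelForCausalLM.from_pretrained(path_to_hf_checkpoint).cuda()
tokenizer = AutoTokenizer.from_pretrained(path_to_hf_checkpoint)

lm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=1)
task_dict = get_task_dict(["wikitext"])
results = evaluate(lm=lm, task_dict=task_dict)
print(results["results"])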
@HDCharles
HDCharles / gist:888bc5973198ca447046b974439dca03
Last active March 28, 2024 20:35
repro for subclass issue
import torch
import torch.nn as nn
from torch.utils._pytree import tree_flatten, tree_unflatten

class MultiTensor(torch.Tensor):
    @staticmethod
    def __new__(cls, input, **kwargs):
        if isinstance(input, (list, tuple)):
            input = input[0]
        kwargs["dtype"] = kwargs.get("dtype", input.dtype)
@HDCharles
HDCharles / linear_triton_kernels.py
Last active March 1, 2024 17:00
script for comparing performance of several linear triton kernels across several shapes
import torch
import torch.nn.functional as F
import triton
import triton.language as tl
from triton import Config
from torch._inductor import config
from torch import _dynamo
aten = torch.ops.aten
def get_configs_io_bound():
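get_configs_io_bound enumerates a grid of tile sizes and pipeline depths for Triton's autotuner to search over. A representative body, modeled on triton.ops.matmul's function of the same name; the gist's exact grid may differ:

def get_configs_io_bound():
    configs = []
    for num_stages in [2, 3, 4, 5]:
        for block_m in [16, 32]:
            for block_k in [32, 64]:
                for block_n in [32, 64, 128]:
                    num_warps = 2 if block_n <= 64 else 4
                    configs.append(Config(
                        {"BLOCK_M": block_m, "BLOCK_N": block_n,
                         "BLOCK_K": block_k, "SPLIT_K": 1},
                        num_stages=num_stages, num_warps=num_warps))
    return configs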
@HDCharles
HDCharles / microbenchmarks.py
Created February 24, 2024 16:46
microbenchmarks
import torch
import torch.nn.functional as F
import triton
import triton.language as tl
from triton.ops.matmul import matmul as triton_matmul
from triton.ops.matmul import _kernel
from triton import Config
from torch._inductor import config
from torch import _dynamo
torch._inductor.config.coordinate_descent_tuning = True
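A minimal loop of the sort this script runs, timing eager matmul against the Triton matmul imported above with triton.testing.do_bench; the shapes are an example:

def bench(m, k, n, dtype=torch.float16):
    a = torch.randn(m, k, device="cuda", dtype=dtype)
    b = torch.randn(k, n, device="cuda", dtype=dtype)
    eager_ms = triton.testing.do_bench(lambda: torch.matmul(a, b))
    triton_ms = triton.testing.do_bench(lambda: triton_matmul(a, b))
    print(f"({m}, {k}, {n}): eager {eager_ms:.3f} ms, triton {triton_ms:.3f} ms")

for shape in [(1, 4096, 4096), (256, 4096, 4096), (2048, 4096, 4096)]:
    bench(*shape)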
@HDCharles
HDCharles / comparison.py
Created January 25, 2024 03:07
compare bitsandbytes with torchao
######################################################################
# Comparing Torchao #
# and BitsandBytes #
######################################################################
# Set up Your Environment
# --------------------------------
#
# First, let's configure your environment. This guide requires CUDA 12.1.
# We ran this tutorial on an A100-PG509-200 power-limited to 330 W. If you are
# using different hardware, you might see different performance numbers.
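# Below is a hedged sketch of the comparison itself: the same fp16 linear
# quantized with bitsandbytes' Linear8bitLt and with torchao's int8
# weight-only path. torchao's entry point has moved around over time;
# quantize_/int8_weight_only is the current spelling and an assumption here,
# not necessarily what this gist used.

import torch
import torch.nn as nn
import bitsandbytes as bnb
from torchao.quantization import quantize_, int8_weight_only

x = torch.randn(16, 4096, device="cuda", dtype=torch.float16)
ref = nn.Linear(4096, 4096, bias=False).half().cuda()

# bitsandbytes: weights are quantized to int8 when the module moves to CUDA
bnb_lin = bnb.nn.Linear8bitLt(4096, 4096, bias=False, has_fp16_weights=False)
bnb_lin.load_state_dict(ref.state_dict())
bnb_lin = bnb_lin.cuda()

# torchao: int8 weight-only quantization swaps the weights in place
ao_model = nn.Sequential(nn.Linear(4096, 4096, bias=False)).half().cuda()
ao_model[0].load_state_dict(ref.state_dict())
quantize_(ao_model, int8_weight_only())

print((bnb_lin(x) - ref(x)).abs().max().item())
print((ao_model(x) - ref(x)).abs().max().item())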
/home/cdhernandez/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/transformers/utils/generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
_torch_pytree._register_pytree_node(
/home/cdhernandez/.conda/envs/pytorch-3.10/lib/python3.10/site-packages/transformers/utils/generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
_torch_pytree._register_pytree_node(
/home/cdhernandez/local/diffusers/src/diffusers/utils/outputs.py:63: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
torch.utils._pytree._register_pytree_node(
Namespace(no_bf16=False, no_sdpa=False, batch_size=1, num_inference_steps=30, enable_fused_projections=True, upcast_vae=False, compile_unet=True, compile_vae=True, compile_mode='max-autotune', change_comp_config=True, do_quan