Install the Cursor CLI from https://api2.cursor.sh/updates/download-latest?os=cli-alpine-x64, then run `./cursor tunnel`.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://jonisalonen.com/2013/deriving-welfords-method-for-computing-variance/ | |
import torch | |
def two_pass_variance(data, ddof=1):
    """Return the variance of *data* computed with the two-pass algorithm.

    The first pass computes the mean; the second pass sums the squared
    deviations from that mean.

    Args:
        data: A sized collection of numbers. It is passed to ``len()`` and
            iterated twice, so a one-shot iterator will not work.
        ddof: Delta degrees of freedom. The divisor is ``len(data) - ddof``.
            The default of 1 gives the sample variance (divisor ``n - 1``);
            pass 0 for the population variance.

    Returns:
        The sum of squared deviations from the mean divided by
        ``len(data) - ddof``.

    Raises:
        ZeroDivisionError: For plain Python numbers, when *data* is empty
            or ``len(data) == ddof``.
    """
    n = len(data)
    mean = sum(data) / n
    # A generator avoids building a temporary list just to sum it.
    squared_deviations = sum((x - mean) ** 2 for x in data)
    return squared_deviations / (n - ddof)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
import torchao | |
from torchao.quantization.autoquant import ( | |
DEFAULT_AUTOQUANT_CLASS_LIST, | |
DEFAULT_INT4_AUTOQUANT_CLASS_LIST, | |
OTHER_AUTOQUANT_CLASS_LIST, | |
) | |
from torchao.quantization.quant_api import ( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from torch._inductor.utils import get_code, get_triton_code | |
def my_model(x):
    """Return the element-wise square of tensor ``x`` using ``torch.square``."""
    return torch.square(x)
compiled_model = torch.compile(my_model) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pycuda.autoinit | |
import pycuda.driver as cuda | |
# Get the first CUDA device (index 0) | |
device = cuda.Device(0) | |
# List of attributes you want to get | |
attributes = [ | |
cuda.device_attribute.MAX_THREADS_PER_BLOCK, | |
cuda.device_attribute.MAX_BLOCK_DIM_X, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from torch.utils.cpp_extension import load_inline | |
finfo = torch.finfo(torch.float8_e4m3fn) | |
print(f"finfo: {finfo}") | |
# finfo(resolution=1, min=-448, max=448, eps=0.125, smallest_normal=0.015625, tiny=0.015625, dtype=float8_e4m3fn) | |
cuda_source = """ | |
C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = std::numeric_limits<c10::Float8_e4m3fn>::max(); | |
void test() { |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# speechmatics.com/company/articles-and-news/timing-operations-in-pytorch | |
import time | |
import torch | |
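# 1000 x 1000 elements * 4 B (assuming the default float32 dtype) = 4,000,000 B / 1,000,000 = 4 MB; the original "400 MB" figure would need a 10000 x 10000 tensor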
a = torch.randn(1000, 1000, device="cuda") | |
torch.softmax(a, dim=1) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import timeit | |
import torch | |
@torch.compile() # 0.103 seconds | |
# @torch.compile(fullgraph=True) # 0.105 seconds | |
# @torch.compile(fullgraph=False) # 0.102 seconds | |
# @torch.compile(options={"triton.cudagraphs": False}, fullgraph=True) # 0.104 seconds | |
# @torch.compile( |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import time | |
import torch | |
import transformers | |
from torch.profiler import ProfilerActivity, profile, record_function | |
from vllm import LLM, SamplingParams | |
os.environ["HOST_IP"] = "10.42.10.16" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# https://github.com/huggingface/transformers/pull/32047
# CUDA Nightly | |
# pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121/ | |
# pip install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu121/ | |
import os | |
from transformers import AutoModelForCausalLM, AutoTokenizer, FbgemmFp8Config |
NewerOlder