curious

youkaichao youkaichao

curious

Ph.D. candidate in Tsinghua University. Visiting UC Berkeley, working on @vllm-project now.

youkaichao / syntax.s

Created April 23, 2018 03:01 — forked from mishurov/syntax.s

AT&T assembly syntax and IA-32 instructions

	# --------
	# Hardware
	# --------

	# Opcode - operational code
	# Assebly mnemonic - abbreviation for an operation

	# Instruction Code Format (IA-32)
	# - Optional instruction prefix
	# - Operational code

youkaichao / dynamic_conv_bn.py

Created August 25, 2023 16:40

Demonstrate very dynamic usage case of conv-bn pairs.

	import torch
	from torch import nn
	import copy

	from torch.fx.experimental.efficient_conv_bn_eval import turn_on_efficient_conv_bn_eval


	class BackboneModel(nn.Module):
	def __init__(self, args, *kwargs) -> None:
	super().__init__(args, *kwargs)

youkaichao / gist:7ed49dcb55b2e66dfd841b1a9b0bfeff

Created September 8, 2023 12:59

	import torch
	from torch import nn
	import copy

	class BackboneModel(nn.Module):
	def __init__(self, args, *kwargs) -> None:
	super().__init__(args, *kwargs)
	self.conv1 = nn.Conv2d(16, 16, 6)
	self.bn1 = nn.BatchNorm2d(16)
	def forward(self, x):

youkaichao / test.py

Last active June 27, 2024 21:32

object broadcast comparison

	import torch
	import torch.distributed as dist
	import os
	import multiprocessing
	import multiprocessing.shared_memory
	import io
	import pickle

	N_warmup = 10 # warmup N_warmup times
	N = 100 # repeat N times

youkaichao / test.py

Created May 9, 2024 05:54

pytorch distributed nccl communicator creation

	# run the code with `torchrun --nproc-per-node 4 test.py`
	import os
	os.environ['NCCL_DEBUG'] = 'TRACE'
	import torch
	import torch.distributed as dist

	# nccl communicators are lazily created
	dist.init_process_group(backend='nccl')
	print("init done")

youkaichao / audit.c

Created May 21, 2024 04:27

enable verbose cudagraph dump for pytorch

youkaichao / test.py

Created May 21, 2024 23:03

vLLM + torch.compile

	from vllm import LLM, SamplingParams
	prompts = [
	"Hello, my name is",
	"The president of the United States is",
	"The capital of France is",
	"The future of AI is",
	]
	sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

	from contextlib import nullcontext

youkaichao / wait.cu

Created June 4, 2024 04:15

wait kernel, gpu waits for cpu signal to continue

	#include <cstdio>
	#include <iostream>
	#include <cuda_runtime.h>

	__global__ void waitKernel(volatile bool *flag) {
	// Busy-wait loop
	while (!*flag) {
	// The use of volatile ensures that the GPU fetches the flag value from memory each time
	// This is necessary because without volatile, the compiler might optimize the memory read
	__threadfence_system(); // Optional for system-wide memory coherence

youkaichao / test.py

Last active August 30, 2024 19:45

torch.compile integration plan

	import torch
	from typing import Optional

	from torch._dynamo.backends.common import aot_autograd

	@torch.library.custom_op("custom::unified_attention", mutates_args=[])
	def unified_attention(x: torch.Tensor, num_prefill_tokens: torch.Tensor, cache: torch.Tensor) -> torch.Tensor:
	if cache.numel() == 0:
	return x * 2
	output = x.clone()

youkaichao / custom_op_inductor.py

Created September 12, 2024 05:38

	import torch
	from typing import Optional, Tuple, Union
	torch.cuda.is_available()

	def report_memory(prefix):
	free, total = torch.cuda.mem_get_info()
	used = total - free
	print(f"{prefix}: Used: {used / 1024 / 1024} MB, Free: {free / 1024 / 1024} MB, Total: {total / 1024 / 1024} MB")

	output_parallel = torch.randn(8192, 4096, dtype=torch.bfloat16, device="cuda") # 64 MB