youkaichao
@youkaichao
youkaichao / syntax.s
Created April 23, 2018 03:01 — forked from mishurov/syntax.s
AT&T assembly syntax and IA-32 instructions
# --------
# Hardware
# --------
# Opcode - operational code
# Assembly mnemonic - abbreviation for an operation
# Instruction Code Format (IA-32)
# - Optional instruction prefix
# - Operational code
@youkaichao
youkaichao / dynamic_conv_bn.py
Created August 25, 2023 16:40
Demonstrate a highly dynamic usage pattern of conv-bn pairs.
import torch
from torch import nn
import copy
from torch.fx.experimental.efficient_conv_bn_eval import turn_on_efficient_conv_bn_eval
class BackboneModel(nn.Module):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
import torch
from torch import nn
import copy
class BackboneModel(nn.Module):
def __init__(self, *args, **kwargs) -> None:
super().__init__(*args, **kwargs)
self.conv1 = nn.Conv2d(16, 16, 6)
self.bn1 = nn.BatchNorm2d(16)
def forward(self, x):
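The preview above cuts off at the forward method. Below is a minimal hedged sketch of how such a conv-bn pair could be completed and exercised in eval mode; the forward body and the call signature of turn_on_efficient_conv_bn_eval are assumptions based on the import shown in the gist, not the gist's actual code.
import torch
from torch import nn
# Import as in the gist; requires a PyTorch build that still ships this experimental module.
from torch.fx.experimental.efficient_conv_bn_eval import turn_on_efficient_conv_bn_eval
class SketchModel(nn.Module):  # hypothetical stand-in for BackboneModel
    def __init__(self) -> None:
        super().__init__()
        self.conv1 = nn.Conv2d(16, 16, 6)
        self.bn1 = nn.BatchNorm2d(16)
    def forward(self, x):
        return self.bn1(self.conv1(x))
model = SketchModel().eval()
# Assumption: the function takes the module whose conv-bn pairs should use the
# memory-efficient eval path, as the import suggests.
turn_on_efficient_conv_bn_eval(model)
out = model(torch.randn(2, 16, 32, 32))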
@youkaichao
youkaichao / test.py
Last active June 27, 2024 21:32
object broadcast comparison
import torch
import torch.distributed as dist
import os
import multiprocessing
import multiprocessing.shared_memory
import io
import pickle
N_warmup = 10 # number of warmup iterations
N = 100 # number of timed repetitions
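The preview ends at the repeat counters. For context, here is a minimal hedged timing sketch for the built-in object broadcast path, reusing N_warmup and N from above; the gist itself compares several mechanisms (the shared-memory and pickle imports point at custom variants), so this harness is an assumption, not the gist's code.
# Hedged sketch: time dist.broadcast_object_list as a baseline.
# Run with e.g.: torchrun --nproc-per-node 2 test.py
import time
dist.init_process_group(backend="gloo")
payload = [{"weights": list(range(1000))}]  # hypothetical object to broadcast
for _ in range(N_warmup):
    dist.broadcast_object_list(payload, src=0)
start = time.time()
for _ in range(N):
    dist.broadcast_object_list(payload, src=0)
if dist.get_rank() == 0:
    print(f"broadcast_object_list: {(time.time() - start) / N * 1e3:.3f} ms per call")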
@youkaichao
youkaichao / test.py
Created May 9, 2024 05:54
pytorch distributed nccl communicator creation
# run the code with `torchrun --nproc-per-node 4 test.py`
import os
os.environ['NCCL_DEBUG'] = 'TRACE'
import torch
import torch.distributed as dist
# nccl communicators are lazily created
dist.init_process_group(backend='nccl')
print("init done")
@youkaichao
youkaichao / audit.c
Created May 21, 2024 04:27
enable verbose cudagraph dump for pytorch
#define _GNU_SOURCE
#include <stdio.h>
#include <link.h>
#include <stdbool.h>
#include <string.h>
#include <stdlib.h>
// Minimal stand-in typedefs for CUDA runtime types, so this audit library can
// be built without including the CUDA headers.
typedef int cudaError_t;
typedef void* cudaGraph_t;
@youkaichao
youkaichao / test.py
Created May 21, 2024 23:03
vLLM + torch.compile
from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
from contextlib import nullcontext
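The preview stops before the model is constructed. For context, a minimal hedged sketch of the plain vLLM generation loop follows; the model name is a placeholder, and the torch.compile wiring that gives the gist its title (hinted at by the nullcontext import) is not reconstructed here.
# Hedged sketch of standard vLLM usage; not the gist's torch.compile setup.
llm = LLM(model="facebook/opt-125m")  # placeholder model name
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")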
@youkaichao
youkaichao / wait.cu
Created June 4, 2024 04:15
wait kernel, gpu waits for cpu signal to continue
#include <cstdio>
#include <iostream>
#include <cuda_runtime.h>
__global__ void waitKernel(volatile bool *flag) {
// Busy-wait loop
while (!*flag) {
// The use of volatile ensures that the GPU fetches the flag value from memory each time
// This is necessary because without volatile, the compiler might optimize the memory read out of the loop
__threadfence_system(); // Optional for system-wide memory coherence
@youkaichao
youkaichao / test.py
Last active August 30, 2024 19:45
torch.compile integration plan
import torch
from typing import Optional
from torch._dynamo.backends.common import aot_autograd
@torch.library.custom_op("custom::unified_attention", mutates_args=[])
def unified_attention(x: torch.Tensor, num_prefill_tokens: torch.Tensor, cache: torch.Tensor) -> torch.Tensor:
if cache.numel() == 0:
return x * 2
output = x.clone()
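The preview cuts off inside the op body. For the custom op to be traceable by torch.compile it also needs a fake (meta) implementation; the hedged sketch below shows the usual registration pattern for ops declared with torch.library.custom_op, since the gist's own fake implementation and backend wiring are not visible in the preview.
# Hedged sketch: register a fake implementation so torch.compile can trace
# unified_attention without executing it, then compile a caller.
@unified_attention.register_fake
def _(x, num_prefill_tokens, cache):
    return torch.empty_like(x)
def model(x, num_prefill_tokens, cache):
    return unified_attention(x, num_prefill_tokens, cache)
compiled = torch.compile(model)
x = torch.randn(8, 16)
out = compiled(x, torch.tensor([4]), torch.empty(0))  # empty cache hits the x * 2 branch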
import torch
from typing import Optional, Tuple, Union
torch.cuda.is_available()
def report_memory(prefix):
free, total = torch.cuda.mem_get_info()
used = total - free
print(f"{prefix}: Used: {used / 1024 / 1024} MB, Free: {free / 1024 / 1024} MB, Total: {total / 1024 / 1024} MB")
output_parallel = torch.randn(8192, 4096, dtype=torch.bfloat16, device="cuda") # 64 MB
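A short assumed usage sketch for report_memory follows: call it before and after an allocation to watch free device memory drop (the tensor above is roughly 64 MB of bfloat16, matching the comment).
# Assumed usage sketch, not part of the original snippet.
report_memory("after output_parallel")
extra = torch.randn(8192, 4096, dtype=torch.bfloat16, device="cuda")  # another ~64 MB
report_memory("after extra allocation")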