youkaichao's GitHub Gists
@youkaichao
youkaichao / test1.cu
Created April 24, 2025 11:05
test1.cu
#include <iostream>
#include <cuda_runtime.h>
#include <cuda.h>

// Define the kernel with illegal memory access
__global__ void illegalWildPointerKernel(int* data, int size) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    __nanosleep(1000000000ULL); // Sleep for 1 second
    int* wild_pointer = (int*)0x100;
    if (idx == 0) {
        // assumed completion (preview cuts off here): write through the wild
        // pointer so the kernel faults with an illegal memory access
        *wild_pointer = 42;
    }
}
@youkaichao
youkaichao / test.py
Created February 6, 2025 04:34
gloo vs. nccl
import torch
import torch.distributed as dist
use_nccl = False
dist.init_process_group(backend="nccl" if use_nccl else "gloo")
rank = dist.get_rank()
torch.cuda.set_device(rank % 8)
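The preview ends here. A hedged sketch of how the gloo-vs-nccl comparison might continue, timing all_reduce under whichever backend was chosen; the tensor size, iteration counts, and the choice of all_reduce are assumptions, not the gist's actual code:

import time

# assumed continuation: time all_reduce under the selected backend
data = torch.ones(1024 * 1024, device="cuda")  # 1M floats on this rank's GPU
for _ in range(5):  # warmup, so one-time init cost is excluded
    dist.all_reduce(data)
torch.cuda.synchronize()

start = time.time()
for _ in range(100):
    dist.all_reduce(data)
torch.cuda.synchronize()
print(f"rank {rank}: {(time.time() - start) / 100 * 1e3:.3f} ms per all_reduce")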
@youkaichao
youkaichao / test_pytorch.py
Created January 3, 2025 03:05
compare shm broadcast and PyTorch broadcast_object_list
import torch.distributed as dist
import torch
import time
dist.init_process_group(backend="nccl")
rank = dist.get_rank()
torch.cuda.set_device(rank)
N_warmup = 10
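The preview stops after the warmup constant. A hedged sketch of the PyTorch half of the comparison, timing dist.broadcast_object_list; the payload, iteration count, and print format are assumptions, and the shared-memory counterpart is not shown in the preview:

# assumed continuation: time dist.broadcast_object_list
N_iters = 100
obj = {"payload": list(range(1024))}  # an arbitrary picklable object

for _ in range(N_warmup):
    container = [obj if rank == 0 else None]
    dist.broadcast_object_list(container, src=0)

start = time.time()
for _ in range(N_iters):
    container = [obj if rank == 0 else None]
    dist.broadcast_object_list(container, src=0)
print(f"rank {rank}: {(time.time() - start) / N_iters * 1e6:.1f} us per broadcast")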
@youkaichao
youkaichao / embedding.py
Created November 6, 2024 20:28
inplace embedding
import torch
import torch.nn as nn
# an Embedding module containing 10 tensors of size 3
embedding = nn.Embedding(10, 3)
embedding.weight.requires_grad_(False)
# a batch of 4 indices
input = torch.LongTensor([1, 2, 4, 5])
output = embedding(input)
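The preview ends after an ordinary forward pass. Given the title, a hedged sketch of an in-place variant: gathering the same rows into a preallocated buffer with torch.index_select(..., out=...) instead of allocating a new output on every lookup; the buffer shape and the equality check are assumptions:

# assumed continuation: reuse a preallocated buffer across lookups
out = torch.empty(4, 3)
torch.index_select(embedding.weight, 0, input, out=out)
assert torch.equal(out, output)  # same values, no fresh allocation per call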
@youkaichao
youkaichao / ipc.py
Created November 5, 2024 00:06
cuda ipc
import os
from typing import List
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torch.distributed as dist
dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()
torch.cuda.set_device(local_rank)
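The preview stops after device setup. A hedged sketch of one way the gist might continue: exporting a CUDA tensor from rank 0 as an IPC handle and reopening it in the other ranks via torch.multiprocessing's reduction machinery. The tensor contents, the use of reduce_tensor, and broadcasting the handle over gloo are all assumptions; reopening requires that every rank sits on the same machine with access to rank 0's GPU.

from torch.multiprocessing.reductions import reduce_tensor

if rank == 0:
    t = torch.arange(8, dtype=torch.float32, device="cuda")
    handle = reduce_tensor(t)  # (rebuild_fn, args); args embed the CUDA IPC handle
else:
    handle = None
container = [handle]
dist.broadcast_object_list(container, src=0)  # gloo ships the pickled handle
if rank != 0:
    rebuild_fn, args = container[0]
    shared = rebuild_fn(*args)  # maps rank 0's allocation into this process
    print(f"rank {rank} sees {shared.cpu().tolist()}")
dist.barrier()  # keep rank 0's tensor alive until everyone has used it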
@youkaichao
youkaichao / overhead.py
Created October 31, 2024 22:51
direct custom op
import os
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from torch import nn
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    out.copy_(q)
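The preview ends at the function body. Given the title, a hedged sketch of registering silly_attention as a custom op and comparing a direct Python call against the torch.ops dispatch path; the op name, tensor shapes, and timing loop are assumptions:

import time

# assumed continuation: register the mutating function as a custom op
torch.library.custom_op("silly::attention", silly_attention, mutates_args=["out"])

q = k = v = torch.randn(32, 128)
out = torch.empty_like(q)

for fn, name in [(silly_attention, "direct python call"),
                 (torch.ops.silly.attention, "torch.ops dispatch")]:
    for _ in range(10):  # warmup
        fn(q, k, v, out)
    start = time.time()
    for _ in range(1000):
        fn(q, k, v, out)
    print(f"{name}: {(time.time() - start) / 1000 * 1e6:.2f} us per call")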
@youkaichao
youkaichao / overhead.py
Created October 31, 2024 21:27
custom op overhead (no mutation)
import os
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from torch import nn
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    out = q.clone()
    out += k
    # assumed completion (preview cuts off here): fold in v and return
    out += v
    return out
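Since this variant returns a fresh tensor instead of mutating an argument, a hedged sketch of how it might be registered; the op name and the sanity check are assumptions, and mutates_args is empty because nothing is modified in place:

# assumed continuation: register the functional (non-mutating) variant
torch.library.custom_op("silly::attention_fn", silly_attention, mutates_args=())

q = k = v = torch.randn(32, 128)
assert torch.equal(silly_attention(q, k, v),
                   torch.ops.silly.attention_fn(q, k, v))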
@youkaichao
youkaichao / overhead.py
Created October 31, 2024 21:15
custom op overhead
import os
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from torch import nn
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    out.copy_(q)
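The preview here matches the gist above; a hedged sketch of the lower-level registration path such an overhead experiment might also exercise, using torch.library.Library with a hand-written schema. The namespace, schema string, and CPU-only registration are assumptions:

# assumed continuation: register via the lower-level torch.library.Library API
my_lib = torch.library.Library("silly", "FRAGMENT")
my_lib.define("attention(Tensor q, Tensor k, Tensor v, Tensor(a!) out) -> ()")
my_lib.impl("attention", silly_attention, "CPU")

q = k = v = torch.randn(32, 128)
out = torch.empty_like(q)
torch.ops.silly.attention(q, k, v, out)  # dispatches to silly_attention
assert torch.equal(out, q)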
@youkaichao
youkaichao / client.py
Created October 27, 2024 03:43
client.py
import openai
import asyncio

async def get_choice_completion(prompt, choices):
    # Initialize an asynchronous OpenAI client
    async with openai.AsyncClient(base_url="http://127.0.0.1:8000/v1", api_key="abc") as client:
        choice_probs = {}
        # Calculate logprobs for each prompt + choice sequence
        for choice in choices:
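The preview cuts off at the top of the loop. A hedged sketch of how the body might score each choice: ask the server to echo prompt + choice with logprobs attached and no new tokens, then sum the returned logprobs. The model name, the echo/max_tokens=0 trick, and the summation are assumptions:

            # assumed completion of the loop body
            resp = await client.completions.create(
                model="meta-llama/Meta-Llama-3-8B",  # assumed model name
                prompt=prompt + choice,
                max_tokens=0,   # generate nothing ...
                echo=True,      # ... but echo the prompt tokens back
                logprobs=1,     # with their logprobs attached
            )
            lps = resp.choices[0].logprobs.token_logprobs
            # prompt terms are shared across choices, so summing the whole
            # echoed sequence still ranks the choices correctly
            choice_probs[choice] = sum(lp for lp in lps if lp is not None)
        return choice_probs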
@youkaichao
youkaichao / data.txt
Created September 30, 2024 04:28
profiling
unified benchmark script
$ python benchmarks/benchmark_serving.py --model meta-llama/Meta-Llama-3-8B --dataset-name random --random-input-len 256 --random-output-len 256 --num-prompts 100
vLLM default
$ vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1
============ Serving Benchmark Result ============
Successful requests:                     100
Benchmark duration (s):                  198.86