This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import openai | |
import asyncio | |
async def get_choice_completion(prompt, choices): | |
# Initialize an asynchronous OpenAI client | |
async with openai.AsyncClient(base_url="http://127.0.0.1:8000/v1", api_key="abc") as client: | |
choice_probs = {} | |
# Calculate logprobs for each prompt + choice sequence | |
for choice in choices: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
unified benchmark script | |
$ python benchmarks/benchmark_serving.py --model meta-llama/Meta-Llama-3-8B --dataset-name random --random-input-len 256 --random-output-len 256 --num-prompts 100 | |
vLLM default | |
$ vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1 | |
============ Serving Benchmark Result ============ | |
Successful requests: 100 | |
Benchmark duration (s): 198.86 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from typing import Optional, Tuple, Union | |
torch.cuda.is_available() | |
def report_memory(prefix): | |
free, total = torch.cuda.mem_get_info() | |
used = total - free | |
print(f"{prefix}: Used: {used / 1024 / 1024} MB, Free: {free / 1024 / 1024} MB, Total: {total / 1024 / 1024} MB") | |
output_parallel = torch.randn(8192, 4096, dtype=torch.bfloat16, device="cuda") # 64 MB |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from typing import Optional | |
from torch._dynamo.backends.common import aot_autograd | |
@torch.library.custom_op("custom::unified_attention", mutates_args=[]) | |
def unified_attention(x: torch.Tensor, num_prefill_tokens: torch.Tensor, cache: torch.Tensor) -> torch.Tensor: | |
if cache.numel() == 0: | |
return x * 2 | |
output = x.clone() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cstdio> | |
#include <iostream> | |
#include <cuda_runtime.h> | |
__global__ void waitKernel(volatile bool *flag) { | |
// Busy-wait loop | |
while (!*flag) { | |
// The use of volatile ensures that the GPU fetches the flag value from memory each time | |
// This is necessary because without volatile, the compiler might optimize the memory read | |
__threadfence_system(); // Optional for system-wide memory coherence |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from vllm import LLM, SamplingParams | |
prompts = [ | |
"Hello, my name is", | |
"The president of the United States is", | |
"The capital of France is", | |
"The future of AI is", | |
] | |
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) | |
from contextlib import nullcontext |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#define _GNU_SOURCE | |
#include <stdio.h> | |
#include <link.h> | |
#include <stdbool.h> | |
#include <string.h> | |
#include <stdlib.h> | |
typedef int cudaError_t; | |
typedef void* cudaGraph_t; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# run the code with `torchrun --nproc-per-node 4 test.py` | |
import os | |
os.environ['NCCL_DEBUG'] = 'TRACE' | |
import torch | |
import torch.distributed as dist | |
# nccl communicators are lazily created | |
dist.init_process_group(backend='nccl') | |
print("init done") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
import torch.distributed as dist | |
import os | |
import multiprocessing | |
import multiprocessing.shared_memory | |
import io | |
import pickle | |
N_warmup = 10 # warmup N_warmup times | |
N = 100 # repeat N times |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
from torch import nn | |
import copy | |
class BackboneModel(nn.Module): | |
def __init__(self, *args, **kwargs) -> None: | |
super().__init__(*args, **kwargs) | |
self.conv1 = nn.Conv2d(16, 16, 6) | |
self.bn1 = nn.BatchNorm2d(16) | |
def forward(self, x): |