import torch
import torch.nn as nn

# an Embedding module containing 10 tensors of size 3
embedding = nn.Embedding(10, 3)
embedding.weight.requires_grad_(False)

# a batch of 4 indices
input = torch.LongTensor([1, 2, 4, 5])
output = embedding(input)
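A quick check of what the lookup returns (my continuation, not part of the gist): each index selects one row of the weight table, and the result carries no grad since the table was frozen.

print(output.shape)          # torch.Size([4, 3]): one 3-dim vector per index
print(output.requires_grad)  # False: the weight table is frozen
assert torch.equal(output[0], embedding.weight[1])  # index 1 -> row 1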
import os
from typing import List

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torch.distributed as dist

dist.init_process_group(backend="gloo")
rank = local_rank = dist.get_rank()
world_size = dist.get_world_size()
torch.cuda.set_device(local_rank)
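A minimal smoke test and launch line (my addition; the gist is truncated here). Since gloo collectives operate on CPU tensors, the sanity check all-reduces a CPU tensor, one process per GPU:

# launched as, e.g.:  torchrun --nproc-per-node=2 script.py
t = torch.ones(1) * rank
dist.all_reduce(t)  # default op is SUM across ranks
print(f"rank {rank}/{world_size}: reduced value = {t.item()}")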
import os
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch import nn


def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    out.copy_(q)
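Because this stub writes into a preallocated out buffer instead of returning a tensor, one way to make it visible to torch.compile is to register it as a custom op that declares the mutation. The registration below is my sketch (the "silly::attention" namespace is assumed, not from the gist):

@torch.library.custom_op("silly::attention", mutates_args=["out"])
def silly_attention_op(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                       out: torch.Tensor) -> None:
    # delegate to the plain function; mutates_args tells the compiler
    # that `out` is written in place
    silly_attention(q, k, v, out)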
import os
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch import nn


def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
    out = q.clone()
    out += k
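    # (preview truncated here; assumed completion, accumulating the remaining
    #  input and returning the result)
    out += v
    return out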
import openai
import asyncio


async def get_choice_completion(prompt, choices):
    # Initialize an asynchronous OpenAI client
    async with openai.AsyncClient(base_url="http://127.0.0.1:8000/v1", api_key="abc") as client:
        choice_probs = {}
        # Calculate logprobs for each prompt + choice sequence
        for choice in choices:
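            # (preview truncated here; the loop body below is my sketch, assuming
            #  a vLLM server whose legacy /v1/completions endpoint supports
            #  echo + logprobs; the model name and scoring scheme are assumptions)
            completion = await client.completions.create(
                model="meta-llama/Meta-Llama-3-8B",
                prompt=prompt + choice,
                max_tokens=0,  # generate nothing, just score the prompt
                echo=True,     # return logprobs for the prompt tokens
                logprobs=0,
            )
            token_logprobs = completion.choices[0].logprobs.token_logprobs
            # sum token logprobs as the sequence score (first entry is None)
            choice_probs[choice] = sum(lp for lp in token_logprobs if lp is not None)
        return choice_probs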
Unified benchmark script:
$ python benchmarks/benchmark_serving.py --model meta-llama/Meta-Llama-3-8B --dataset-name random --random-input-len 256 --random-output-len 256 --num-prompts 100

vLLM default:
$ vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1

============ Serving Benchmark Result ============
Successful requests:                     100
Benchmark duration (s):                  198.86
import torch
from typing import Optional, Tuple, Union

torch.cuda.is_available()  # check that CUDA is usable before querying memory


def report_memory(prefix):
    # mem_get_info reports free/total device memory at the CUDA driver level
    free, total = torch.cuda.mem_get_info()
    used = total - free
    print(f"{prefix}: Used: {used / 1024 / 1024} MB, Free: {free / 1024 / 1024} MB, Total: {total / 1024 / 1024} MB")


output_parallel = torch.randn(8192, 4096, dtype=torch.bfloat16, device="cuda")  # 64 MB
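A continuation exercising the helper (my addition, not in the gist). Because mem_get_info sees driver-level reservations, the 64 MB stays "used" after del until the caching allocator returns its blocks:

report_memory("after allocating output_parallel")  # used rises by ~64 MB
del output_parallel
report_memory("after del")          # unchanged: blocks stay in PyTorch's cache
torch.cuda.empty_cache()
report_memory("after empty_cache")  # now the 64 MB goes back to the driver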
import torch
from typing import Optional
from torch._dynamo.backends.common import aot_autograd


@torch.library.custom_op("custom::unified_attention", mutates_args=[])
def unified_attention(x: torch.Tensor, num_prefill_tokens: torch.Tensor, cache: torch.Tensor) -> torch.Tensor:
    if cache.numel() == 0:
        return x * 2
    output = x.clone()
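For the op to trace under torch.compile, it also needs a fake (meta) implementation. The registration below is my sketch, assuming the real op always returns a tensor with x's shape and dtype:

@unified_attention.register_fake
def _(x: torch.Tensor, num_prefill_tokens: torch.Tensor, cache: torch.Tensor) -> torch.Tensor:
    # shape/dtype-only stand-in used while tracing; no real computation
    return torch.empty_like(x)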
#include <cstdio>
#include <iostream>
#include <cuda_runtime.h>

__global__ void waitKernel(volatile bool *flag) {
    // Busy-wait loop: volatile forces the GPU to re-read the flag from memory
    // on every iteration; without it the compiler could hoist the load out of
    // the loop and spin forever.
    while (!*flag) {
        __threadfence_system(); // optional: enforce system-wide memory coherence
    }
}
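A host-side sketch of how such a kernel is typically driven (my addition, assuming mapped pinned memory is available): the CPU launches the spin kernel, then flips the flag to release the GPU.

int main() {
    volatile bool *flag;
    // host-pinned, GPU-mapped (zero-copy) allocation for the flag
    cudaHostAlloc((void **)&flag, sizeof(bool), cudaHostAllocMapped);
    *flag = false;

    // device-side alias of the host allocation
    bool *devFlag;
    cudaHostGetDevicePointer((void **)&devFlag, (void *)flag, 0);

    waitKernel<<<1, 1>>>(devFlag);  // GPU now spins on the flag

    // ... CPU work happens here while the kernel waits ...

    *flag = true;             // release the GPU
    cudaDeviceSynchronize();  // kernel can now exit
    std::cout << "kernel finished" << std::endl;
    cudaFreeHost((void *)flag);
    return 0;
}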