# --------
# Hardware
# --------
# Opcode - operation code
# Assembly mnemonic - abbreviation for an operation
# Instruction Code Format (IA-32)
# - Optional instruction prefix
# - Operation code (opcode)
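A concrete encoding helps pin the terms down; the example lines below are an illustrative addition, not part of the original notes.

# Example (illustrative): the mnemonic `mov eax, 1` assembles to the one-byte
# opcode B8 (MOV r32, imm32, with the register encoded in the opcode itself),
# followed by the 32-bit little-endian immediate 01 00 00 00.
# An optional prefix such as 66 (operand-size override) would go before the opcode.
# After the opcode, the IA-32 format continues with the ModR/M byte, an optional
# SIB byte, an optional displacement, and an optional immediate.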
import torch
from torch import nn
import copy

from torch.fx.experimental.efficient_conv_bn_eval import turn_on_efficient_conv_bn_eval


class BackboneModel(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
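        # (The preview ends inside __init__; the lines below are a hypothetical
        # continuation, mirroring the sibling gist that pairs a Conv2d with a
        # BatchNorm2d.)
        self.conv1 = nn.Conv2d(16, 16, 6)
        self.bn1 = nn.BatchNorm2d(16)

    def forward(self, x):
        return self.bn1(self.conv1(x))


# Sketch of applying the transform; passing the module directly to
# turn_on_efficient_conv_bn_eval is an assumption, not something the preview shows.
model = BackboneModel().eval()
turn_on_efficient_conv_bn_eval(model)

with torch.no_grad():
    out = model(torch.randn(2, 16, 32, 32))
print(out.shape)

A deep copy of the untransformed model (hence the otherwise unused copy import) would allow checking that the transform leaves eval-mode outputs unchanged.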
import torch
from torch import nn
import copy


class BackboneModel(nn.Module):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.conv1 = nn.Conv2d(16, 16, 6)
        self.bn1 = nn.BatchNorm2d(16)

    def forward(self, x):
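        # (Hypothetical continuation; the preview ends at the forward signature.
        # Presumably the forward pass applies the conv and then the batch norm.)
        return self.bn1(self.conv1(x))


# Sketch: compare the eval-mode model against a copy whose conv/bn pair has been
# folded with the stock PyTorch helper. This is a stand-in for whatever the
# original gist does with its copy import; the helper choice is an assumption.
from torch.nn.utils.fusion import fuse_conv_bn_eval

model = BackboneModel().eval()
fused = copy.deepcopy(model)
fused.conv1 = fuse_conv_bn_eval(fused.conv1, fused.bn1)
fused.bn1 = nn.Identity()

x = torch.randn(2, 16, 32, 32)
with torch.no_grad():
    torch.testing.assert_close(model(x), fused(x), rtol=1e-4, atol=1e-4)

Folding moves the batch-norm statistics into the convolution weights, so the loosened tolerances allow for the small floating-point differences that folding introduces.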
import torch
import torch.distributed as dist
import os
import multiprocessing
import multiprocessing.shared_memory
import io
import pickle

N_warmup = 10  # warmup N_warmup times
N = 100  # repeat N times
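The preview stops at the constants. Judging from the imports, the gist benchmarks moving pickled payloads between processes, possibly comparing multiprocessing shared memory against torch.distributed; none of that is visible, so the sketch below only shows the warmup-then-measure pattern that N_warmup and N imply, with a placeholder operation.

import time


def run_once(payload: bytes) -> None:
    # Placeholder for whatever is being measured, e.g. writing the pickled payload
    # into a shared-memory block and reading it back.
    shm = multiprocessing.shared_memory.SharedMemory(create=True, size=len(payload))
    try:
        shm.buf[: len(payload)] = payload
        _ = pickle.loads(bytes(shm.buf[: len(payload)]))
    finally:
        shm.close()
        shm.unlink()


payload = pickle.dumps(torch.randn(1024, 1024))

for _ in range(N_warmup):  # warmup N_warmup times
    run_once(payload)

start = time.perf_counter()
for _ in range(N):  # repeat N times
    run_once(payload)
print(f"avg per iteration: {(time.perf_counter() - start) / N * 1e3:.3f} ms")

Warming up first keeps one-time costs, such as allocator growth and lazily loaded modules, out of the measured loop.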
# run the code with `torchrun --nproc-per-node 4 test.py`
import os
os.environ['NCCL_DEBUG'] = 'TRACE'

import torch
import torch.distributed as dist

# nccl communicators are lazily created
dist.init_process_group(backend='nccl')
print("init done")
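Because NCCL communicators are created lazily, presumably the gist goes on to issue a first collective so the TRACE logs show when the communicator is actually built. A minimal continuation along those lines (the tensor and the choice of all_reduce are arbitrary):

local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

# The first collective is what actually creates the NCCL communicator; with
# NCCL_DEBUG=TRACE the communicator-setup messages appear here, not at
# init_process_group time.
data = torch.ones(1, device="cuda")
dist.all_reduce(data)
torch.cuda.synchronize()
print("first collective done")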
#define _GNU_SOURCE
#include <stdio.h>
#include <link.h>
#include <stdbool.h>
#include <string.h>
#include <stdlib.h>

typedef int cudaError_t;
typedef void* cudaGraph_t;
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

from contextlib import nullcontext
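The preview cuts off right after the nullcontext import, which suggests generation is optionally wrapped in a profiling context. The rest of the usual vLLM flow might look like the sketch below; the model name is a placeholder and the profiler-behind-a-flag structure is an assumption.

import torch

profile = False  # flip to True to capture a trace

# Placeholder model; the original gist's model is not visible in the preview.
llm = LLM(model="facebook/opt-125m")

ctx = (
    torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA])
    if profile
    else nullcontext()
)
with ctx:
    outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated: {output.outputs[0].text!r}")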
#include <cstdio>
#include <iostream>
#include <cuda_runtime.h>

__global__ void waitKernel(volatile bool *flag) {
    // Busy-wait loop
    while (!*flag) {
        // The use of volatile ensures that the GPU fetches the flag value from memory each time
        // This is necessary because without volatile, the compiler might optimize the memory read
        __threadfence_system(); // Optional for system-wide memory coherence
import torch
from typing import Optional
from torch._dynamo.backends.common import aot_autograd


@torch.library.custom_op("custom::unified_attention", mutates_args=[])
def unified_attention(x: torch.Tensor, num_prefill_tokens: torch.Tensor, cache: torch.Tensor) -> torch.Tensor:
    if cache.numel() == 0:
        return x * 2
    output = x.clone()
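    # (Hypothetical continuation; the preview stops at the clone. What the real op
    # does with num_prefill_tokens and cache is not visible, so this body is only a
    # placeholder that keeps the signature and return type intact.)
    n = int(num_prefill_tokens.item())
    output[:n] = x[:n] + cache[:n]
    return output


# A fake (meta) implementation lets the op be traced by torch.compile without
# running the real kernel on fake tensors.
@unified_attention.register_fake
def _(x, num_prefill_tokens, cache):
    return torch.empty_like(x)


# The aot_autograd import suggests the gist compiles a caller of the custom op
# with an AOT-autograd-based backend; a minimal version with an identity compiler:
def fn(x, num_prefill_tokens, cache):
    return unified_attention(x, num_prefill_tokens, cache)


backend = aot_autograd(fw_compiler=lambda gm, example_inputs: gm)
compiled = torch.compile(fn, backend=backend)

x = torch.randn(8, 16)
cache = torch.randn(8, 16)
out = compiled(x, torch.tensor(4), cache)

The identity fw_compiler simply returns the traced graph; that is enough to confirm the custom op can be traced through its fake implementation, and a real compiler would be substituted there.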
import torch
from typing import Optional, Tuple, Union

torch.cuda.is_available()


def report_memory(prefix):
    free, total = torch.cuda.mem_get_info()
    used = total - free
    print(f"{prefix}: Used: {used / 1024 / 1024} MB, Free: {free / 1024 / 1024} MB, Total: {total / 1024 / 1024} MB")


output_parallel = torch.randn(8192, 4096, dtype=torch.bfloat16, device="cuda")  # 64 MB
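The preview ends at the first allocation; presumably report_memory is then called around further allocations to watch device memory move. A short continuation in that spirit (the second tensor is an arbitrary addition for contrast):

report_memory("after allocating output_parallel")

# Note: mem_get_info reports driver-level usage, so the numbers include the CUDA
# context and whatever the caching allocator has reserved, not just live tensors.
another = torch.randn(8192, 4096, dtype=torch.bfloat16, device="cuda")  # another 64 MB
report_memory("after allocating a second 64 MB tensor")

del another
torch.cuda.empty_cache()
report_memory("after del + empty_cache")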