Skip to content

Instantly share code, notes, and snippets.

View tlrmchlsmth's full-sized avatar
🍲
cooking

Tyler Michael Smith tlrmchlsmth

🍲
cooking
  • Red Hat
View GitHub Profile
def decompress_2_4(metadata, values):
"""
Decompress 2:4 sparse tensor
Args:
metadata: Tensor of shape (N, K/8) uint8.
Each byte stores indexes for 2 blocks
values: Tensor of shape (N, K/2) bf16, storing non-zero values
Returns:
Dense tensor of shape (N, K) in bf16
"""
import numpy as np
import msgpack
import pickle
import time
import json
from msgspec import msgpack as msgspec_msgpack
import struct
from io import BytesIO
def benchmark_methods(arr, num_iterations=1000):
import os
import time
import asyncio
import argparse
import statistics
from typing import List
def benchmark_time_sleep(sleep_duration: float, iterations: int) -> List[float]:
durations = []
for _ in range(iterations):
import pdb
import traceback
import inspect
def function_in_stack(function_name):
stack = traceback.extract_stack()
for frame in stack:
if frame.name == function_name:
return True
@tlrmchlsmth
tlrmchlsmth / vllm_benchmark_rr_sweep.sh
Created July 15, 2024 15:43
Simple sweep over request rates, for running e2e benchmarks for vLLM
#!/bin/bash
# Default values
model="meta-llama/Meta-Llama-3-8B"
port=8192
num_prompts=500
# TODO: Add other datasets
dataset_path="ShareGPT_V3_unfiltered_cleaned_split.json"
dataset_url="https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/$dataset_path"