Skip to content

Instantly share code, notes, and snippets.

[{"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_01", "GPU": "1xStandard_ND96asr_v4 x 2", "# of req.": 200, "Tput (req/s)": 0.9284057358744006, "Output Tput (tok/s)": 198.24247678126076, "Total Tput (tok/s)": 396.266778214591, "Mean TTFT (ms)": 110.37337160010793, "Median TTFT (ms)": 96.9816950000677, "P99 TTFT (ms)": 230.3005734290491, "Mean TPOT (ms)": 43.72182021034344, "Median TPOT (ms)": 43.54532462942404, "P99 TPOT (ms)": 50.513716590712384, "Mean ITL (ms)": 43.631314270832306, "Median ITL (ms)": 42.27557599915599, "P99 ITL (ms)": 87.99811164881247}, {"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp2_sharegpt_qps_04", "GPU": "1xStandard_ND96asr_v4 x 2", "# of req.": 200, "Tput (req/s)": 2.521471685463534, "Output Tput (tok/s)": 539.0528242768216, "Total Tput (tok/s)": 1076.8701274277662, "Mean TTFT (ms)": 139.8380736899344, "Median TTFT (ms)": 125.15622350110789, "P99 TTFT (ms)": 332.96458055017825, "Mean TPOT (ms)": 61.62705314762229, "Median TPOT (ms)": 63.4
git clone https://github.com/surajssd/llm-k8s
cd llm-k8s
git checkout 4b4dd8e8521346aa3473175eb0c45b4c7e6e6883
source .env
export GPU_NODE_COUNT=2
export VM_SIZE="Standard_HB120-16rs_v3"
./scripts/deploy-aks.sh deploy_aks
./scripts/deploy-aks.sh download_aks_credentials
[{"Test name": "serving_microsoft-phi-4_tp1_pp1_sharegpt_qps_01", "GPU": "1xStandard_NC24ads_A100_v4 x 1", "# of req.": 200, "Tput (req/s)": 0.9874006606348092, "Output Tput (tok/s)": 96.84919379836526, "Total Tput (tok/s)": 309.02184775557305, "Mean TTFT (ms)": 67.13169773499885, "Median TTFT (ms)": 53.45841300004395, "P99 TTFT (ms)": 154.88893441993696, "Mean TPOT (ms)": 22.48243263621561, "Median TPOT (ms)": 22.103191907692437, "P99 TPOT (ms)": 29.212842526595843, "Mean ITL (ms)": 22.355201681361745, "Median ITL (ms)": 21.27322000001186, "P99 ITL (ms)": 41.42363359993397}, {"Test name": "serving_microsoft-phi-4_tp1_pp1_sharegpt_qps_04", "GPU": "1xStandard_NC24ads_A100_v4 x 1", "# of req.": 200, "Tput (req/s)": 3.2772372061643584, "Output Tput (tok/s)": 325.2002479676893, "Total Tput (tok/s)": 1029.4129788282867, "Mean TTFT (ms)": 78.54876733999617, "Median TTFT (ms)": 64.22880649995477, "P99 TTFT (ms)": 201.15669853015976, "Mean TPOT (ms)": 28.32710065808013, "Median TPOT (ms)": 27.660848333349957, "P99 TP
[{"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_01", "GPU": "1xStandard_NC96ads_A100_v4 x 1", "# of req.": 200, "Tput (req/s)": 0.9279779228604551, "Output Tput (tok/s)": 198.50839736869426, "Total Tput (tok/s)": 396.441448425215, "Mean TTFT (ms)": 154.77130150999983, "Median TTFT (ms)": 128.38760200008892, "P99 TTFT (ms)": 376.5480166300789, "Mean TPOT (ms)": 44.93937090850136, "Median TPOT (ms)": 44.63469464226745, "P99 TPOT (ms)": 58.03939859885578, "Mean ITL (ms)": 44.85155391470774, "Median ITL (ms)": 43.878026000129466, "P99 ITL (ms)": 131.9412263799859}, {"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp4_pp1_sharegpt_qps_04", "GPU": "1xStandard_NC96ads_A100_v4 x 1", "# of req.": 200, "Tput (req/s)": 2.495698100981504, "Output Tput (tok/s)": 532.9313724835904, "Total Tput (tok/s)": 1065.2512989324402, "Mean TTFT (ms)": 246.33752696000101, "Median TTFT (ms)": 219.59910649991343, "P99 TTFT (ms)": 606.8011953101309, "Mean TPOT (ms)": 73.95579696666762, "Median TPOT
[{"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp2_pp2_sharegpt_qps_01", "GPU": "1xStandard_NC48ads_A100_v4 x 2", "# of req.": 200, "Tput (req/s)": 0.8723546086118796, "Output Tput (tok/s)": 186.62718319338248, "Total Tput (tok/s)": 372.6960594372533, "Mean TTFT (ms)": 183.92802053997002, "Median TTFT (ms)": 149.6454604985047, "P99 TTFT (ms)": 433.51827928232507, "Mean TPOT (ms)": 68.63183632389541, "Median TPOT (ms)": 68.07398973492188, "P99 TPOT (ms)": 91.97054489733009, "Mean ITL (ms)": 68.4992397737343, "Median ITL (ms)": 63.92631100243307, "P99 ITL (ms)": 224.9206623390637}, {"Test name": "serving_meta-llama-Llama-3.3-70B-Instruct_tp2_pp2_sharegpt_qps_04", "GPU": "1xStandard_NC48ads_A100_v4 x 2", "# of req.": 200, "Tput (req/s)": 2.0098153410901136, "Output Tput (tok/s)": 427.7389499675034, "Total Tput (tok/s)": 856.4225131453192, "Mean TTFT (ms)": 250.33063353503167, "Median TTFT (ms)": 210.2748959987366, "P99 TTFT (ms)": 595.3828498008077, "Mean TPOT (ms)": 104.60007571868582, "Median TPOT (
Running over qps list 1
~/vllm/benchmarks /vllm-workspace
Running test case serving_llama70B_tp2_pp2_sharegpt with qps 1
Client command: python3 benchmark_serving.py --save-result --base-url http://llama-3-3-70b-instruct-leader.default:8000 --result-dir /root/results/ --result-filename serving_llama70B_tp2_pp2_sharegpt_qps_1.json --request-rate 1 --model=meta-llama/Llama-3.3-70B-Instruct --backend=vllm --dataset-name=sharegpt --dataset-path=/root/ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts=200
Namespace(backend='vllm', base_url='http://llama-3-3-70b-instruct-leader.default:8000', host='127.0.0.1', port=8000, endpoint='/v1/completions', dataset=None, dataset_name='sharegpt', dataset_path='/root/ShareGPT_V3_unfiltered_cleaned_split.json', max_concurrency=None, model='meta-llama/Llama-3.3-70B-Instruct', tokenizer=None, best_of=1, use_beam_search=False, num_prompts=200, logprobs=None, request_rate=1.0, burstiness=1.0, seed
@surajssd
surajssd / list-vm-sizes.go
Last active July 11, 2023 13:41
Using Azure's user managed identity for the Kubernetes workloads
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See License.txt in the project root for license information.
package main
import (
"context"
"fmt"
"log"
"os"
==========================================================================
0. Building a qcow image, instructions at this link
==========================================================================
https://github.com/AMDESE/AMDSEV
==========================================================================
1. Building the host and guest kernels, instructions at this link, please
build the 5.19-rc6 kernel
==========================================================================
https://github.com/AMDESE/AMDSEV/tree/sev-snp-devel
@surajssd
surajssd / ipxe-ubuntu-20
Created June 1, 2022 08:03
Boot Ubuntu on Packet Using iPXE
#!ipxe
dhcp net0
set base-url http://archive.ubuntu.com/ubuntu/dists/focal/main/installer-amd64/current/legacy-images/netboot/ubuntu-installer/amd64/
kernel ${base-url}/linux console=ttyS1,115200n8
initrd ${base-url}/initrd.gz
boot
apiVersion: v1
kind: Namespace
metadata:
labels:
cluster.x-k8s.io/provider: control-plane-kubeadm
control-plane: controller-manager
name: capi-kubeadm-control-plane-system
---
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition