fedora@ip-172-31-37-101:~$ git clone https://github.com/neuralmagic/gateway-api-inference-extension.git
cd gateway-api-inference-extension
Cloning into 'gateway-api-inference-extension'...
remote: Enumerating objects: 5757, done.
remote: Counting objects: 100% (1395/1395), done.
remote: Compressing objects: 100% (318/318), done.
remote: Total 5757 (delta 1188), reused 1078 (delta 1077), pack-reused 4362 (from 3)
Receiving objects: 100% (5757/5757), 7.04 MiB | 38.35 MiB/s, done.
Resolving deltas: 100% (3112/3112), done.
Events: <none>
ubuntu@ip-172-31-46-4:~/v2/vllm-d-deployer$ kubectl describe inferencepools.inference.networking.x-k8s.io llama-3-2-3b-instruct -n llm-d^C
ubuntu@ip-172-31-46-4:~/v2/vllm-d-deployer$ k logs llama-3.2-3b-instruct-epp-65c87574f5-wtxrc
{"level":"info","ts":"2025-05-03T17:09:29Z","logger":"setup","caller":"epp/main.go:135","msg":"Flags processed","flags":{"DestinationEndpointHintMetadataNamespace":"envoy.lb","certPath":"","destinationEndpointHintKey":"x-gateway-destination-endpoint","grpcHealthPort":9003,"grpcPort":9002,"kubeconfig":"","kvCacheUsagePercentageMetric":"vllm:gpu_cache_usage_perc","loraInfoMetric":"vllm:lora_requests_info","metricsPort":9090,"poolName":"llama-3.2-3b-instruct","poolNamespace":"llm-d","refreshMetricsInterval":50000000,"refreshPrometheusMetricsInterval":5000000000,"secureServing":true,"totalQueuedRequestsMetric":"vllm:num_requests_waiting","v":4,"zap-devel":true,"zap-encoder":{},"zap-log-level":{},"zap-stacktrace-level":{},"zap-time-encoding":{}}}
$ k logs llama-3.2-3b-instruct-decode-6dcb767b75-4c8c8 -c vllm
INFO 05-03 16:00:29 [__init__.py:239] Automatically detected platform cuda.
INFO 05-03 16:00:32 [api_server.py:1042] vLLM API server version 0.1.dev1+g9b70e2b
INFO 05-03 16:00:32 [api_server.py:1043] args: Namespace(host=None, port=8200, uvicorn_log_level='info', disable_uvicorn_access_log=False, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, enable_ssl_refresh=False, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='meta-llama/Llama-3.2-3B-Instruct', task='auto', tokenizer=None, tokenizer_mode='auto', trust_remote_code=Fals
sudo apt-get update
sudo apt-get -y install jq gh
sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq
sudo chmod +x /usr/local/bin/yq
sudo apt-get update
sudo apt-get -y install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
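Downloading the GPG key is only the first half of the standard Docker apt setup on Ubuntu. Assuming the rest follows Docker's official install docs, the remaining steps would look roughly like this:

```
# Make the key readable, register the Docker apt repo, then install the engine
sudo chmod a+r /etc/apt/keyrings/docker.asc

echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
  $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

sudo apt-get update
sudo apt-get -y install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
```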
Cloned the repo and made sure the nightly image was up to date, then launched the container:
docker run -it --ipc=host --network=host --group-add render \
--privileged --security-opt seccomp=unconfined \
--cap-add=CAP_SYS_ADMIN --cap-add=SYS_PTRACE \
--device=/dev/kfd --device=/dev/dri --device=/dev/mem \
-e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data/model_cache \
-e MODEL=$MODEL \
```
docker run --rm -it --ipc=host --network=host --group-add render \
--privileged --security-opt seccomp=unconfined \
--cap-add=CAP_SYS_ADMIN --cap-add=SYS_PTRACE \
--device=/dev/kfd --device=/dev/dri --device=/dev/mem \
-e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data/model_cache \
```
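Both invocations above are cut off before the image name and the command to run inside it. Purely for illustration (the image placeholder and serve invocation below are assumptions, not taken from the run above), a complete command would end along these lines:

```
  -e MODEL=$MODEL \
  -v /data/model_cache:/data/model_cache \
  <nightly-rocm-vllm-image> \
  vllm serve "$MODEL" --port 8000
```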
$ ./install.sh
+ set -e
+ set -o pipefail
++ command -v git
+ '[' -z /usr/bin/git ']'
++ command -v kubectl
+ '[' -z '/usr/local/bin/kubectl]'
./install.sh: line 10: [: missing `]'
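The ``missing `]'`` error comes from the kubectl check: the trace shows the closing `]` glued onto the path (`'/usr/local/bin/kubectl]'`), so `[` never sees its closing bracket. A minimal sketch of what line 10 of install.sh presumably intends (message text assumed), with the required space before `]`:

```
# Fail early if kubectl is not on PATH; note the space before the closing ].
if [ -z "$(command -v kubectl)" ]; then
  echo "kubectl is required but was not found in PATH" >&2
  exit 1
fi
```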
$ python ./benchmark-e2e.py --port 8000 --model "meta-llama/Llama-3.2-1B" --cuda-device 0
Using port: 8000
Removing /home/ubuntu/vllm/benchmark-e2e/benchmark-compare
Removing /home/ubuntu/vllm/benchmark-e2e/venv-vllm
Removing /home/ubuntu/vllm/benchmark-e2e/venv-vllm-src
Removing /home/ubuntu/vllm/benchmark-e2e/venv-sgl
▶ git clone https://github.com/neuralmagic/benchmark-compare.git /home/ubuntu/vllm/benchmark-e2e/benchmark-compare
Cloning into '/home/ubuntu/vllm/benchmark-e2e/benchmark-compare'...
remote: Enumerating objects: 78, done.
$ go build
$ ./benchmark-go --port 8000 --model meta-llama/Llama-3.2-1B --cuda-device 0
[main] 2025/04/23 04:20:18 Using port: 8000
[main] 2025/04/23 04:20:18 Removing /home/ubuntu/vllm/benchmark-go/benchmark-compare
[main] 2025/04/23 04:20:18 Removing /home/ubuntu/vllm/benchmark-go/venv-vllm
[main] 2025/04/23 04:20:19 Removing /home/ubuntu/vllm/benchmark-go/venv-vllm-src
[main] 2025/04/23 04:20:19 Removing /home/ubuntu/vllm/benchmark-go/venv-sgl
[main] 2025/04/23 04:20:19 ▶ git clone https://github.com/neuralmagic/benchmark-compare.git /home/ubuntu/vllm/benchmark-go/benchmark-compare
Cloning into '/home/ubuntu/vllm/benchmark-go/benchmark-compare'...
podman run --rm -it --network host -e MODEL=meta-llama/Llama-3.2-1B -e FRAMEWORK=vllm -e HF_TOKEN="${HF_TOKEN}" -e PORT=8000 -e HOST=172.31.37.101 -v "$(pwd)":/host:Z -w /opt/benchmark quay.io/bsalisbu/vllm-benchmark:latest
===== vllm - RUNNING meta-llama/Llama-3.2-1B FOR 120 PROMPTS WITH 1 QPS =====
INFO 04-23 01:38:59 [__init__.py:243] No platform detected, vLLM is running on UnspecifiedPlatform
Namespace(backend='vllm', base_url=None, host='127.0.0.1', port=8000, endpoint='/v1/completions', dataset_name='random', dataset_path=None, max_concurrency=None, model='meta-llama/Llama-3.2-1B', tokenizer=None, use_beam_search=False, num_prompts=120, logprobs=None, request_rate=1.0, burstiness=1.0, seed=1, trust_remote_code=False, disable_tqdm=False, profile=False, save_result=True, save_detailed=False, metadata=['framework=vllm'], result_dir=None, result_filename='results.json', ignore_eos=True, percentile_metrics='ttft,tpot,itl', metric_percentiles='99', goodput=None, sonn