fedora@ip-172-31-37-101:~$ git clone https://github.com/neuralmagic/gateway-api-inference-extension.git
cd gateway-api-inference-extension
Cloning into 'gateway-api-inference-extension'...
remote: Enumerating objects: 5757, done.
remote: Counting objects: 100% (1395/1395), done.
remote: Compressing objects: 100% (318/318), done.
remote: Total 5757 (delta 1188), reused 1078 (delta 1077), pack-reused 4362 (from 3)
Receiving objects: 100% (5757/5757), 7.04 MiB | 38.35 MiB/s, done.
Resolving deltas: 100% (3112/3112), done.
Events: <none>
ubuntu@ip-172-31-46-4:~/v2/vllm-d-deployer$ kubectl describe inferencepools.inference.networking.x-k8s.io llama-3-2-3b-instruct -n llm-d^C
ubuntu@ip-172-31-46-4:~/v2/vllm-d-deployer$ k logs llama-3.2-3b-instruct-epp-65c87574f5-wtxrc
{"level":"info","ts":"2025-05-03T17:09:29Z","logger":"setup","caller":"epp/main.go:135","msg":"Flags processed","flags":{"DestinationEndpointHintMetadataNamespace":"envoy.lb","certPath":"","destinationEndpointHintKey":"x-gateway-destination-endpoint","grpcHealthPort":9003,"grpcPort":9002,"kubeconfig":"","kvCacheUsagePercentageMetric":"vllm:gpu_cache_usage_perc","loraInfoMetric":"vllm:lora_requests_info","metricsPort":9090,"poolName":"llama-3.2-3b-instruct","poolNamespace":"llm-d","refreshMetricsInterval":50000000,"refreshPrometheusMetricsInterval":5000000000,"secureServing":true,"totalQueuedRequestsMetric":"vllm:num_requests_waiting","v":4,"zap-devel":true,"zap-encoder":{},"zap-log-level":{},"zap-stacktrace-level":{},"zap-time-encoding":{}}}
$ k logs llama-3.2-3b-instruct-decode-6dcb767b75-4c8c8 -c vllm
INFO 05-03 16:00:29 [__init__.py:239] Automatically detected platform cuda.
INFO 05-03 16:00:32 [api_server.py:1042] vLLM API server version 0.1.dev1+g9b70e2b
INFO 05-03 16:00:32 [api_server.py:1043] args: Namespace(host=None, port=8200, uvicorn_log_level='info', disable_uvicorn_access_log=False, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, enable_ssl_refresh=False, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='meta-llama/Llama-3.2-3B-Instruct', task='auto', tokenizer=None, tokenizer_mode='auto', trust_remote_code=Fals
sudo apt-get update
sudo apt-get -y install jq gh
sudo wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq
sudo chmod +x /usr/local/bin/yq
sudo apt-get update
sudo apt-get -y install ca-certificates curl
sudo install -m 0755 -d /etc/apt/keyrings
sudo curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc
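Downloading the GPG key is only the first half of the standard Docker apt setup on Ubuntu. Assuming the rest follows Docker's official install docs, the remaining steps would look roughly like this:

```
# Make the key readable, register the Docker apt repo, then install the engine
sudo chmod a+r /etc/apt/keyrings/docker.asc

echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
  $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

sudo apt-get update
sudo apt-get -y install docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
```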
Cloned the repo and made sure the nightly image was up to date, then launched the container:
docker run -it --ipc=host --network=host --group-add render \
--privileged --security-opt seccomp=unconfined \
--cap-add=CAP_SYS_ADMIN --cap-add=SYS_PTRACE \
--device=/dev/kfd --device=/dev/dri --device=/dev/mem \
-e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data/model_cache \
-e MODEL=$MODEL \
```
docker run --rm -it --ipc=host --network=host --group-add render \
--privileged --security-opt seccomp=unconfined \
--cap-add=CAP_SYS_ADMIN --cap-add=SYS_PTRACE \
--device=/dev/kfd --device=/dev/dri --device=/dev/mem \
-e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data/model_cache \
```
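Both invocations above are cut off before the image name and the command to run inside it. Purely for illustration (the image placeholder and serve invocation below are assumptions, not taken from the run above), a complete command would end along these lines:

```
  -e MODEL=$MODEL \
  -v /data/model_cache:/data/model_cache \
  <nightly-rocm-vllm-image> \
  vllm serve "$MODEL" --port 8000
```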
$ ./install.sh
+ set -e
+ set -o pipefail
++ command -v git
+ '[' -z /usr/bin/git ']'
++ command -v kubectl
+ '[' -z '/usr/local/bin/kubectl]'
./install.sh: line 10: [: missing `]'
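The ``missing `]'`` error comes from the kubectl check: the trace shows the closing `]` glued onto the path (`'/usr/local/bin/kubectl]'`), so `[` never sees its closing bracket. A minimal sketch of what line 10 of install.sh presumably intends (message text assumed), with the required space before `]`:

```
# Fail early if kubectl is not on PATH; note the space before the closing ].
if [ -z "$(command -v kubectl)" ]; then
  echo "kubectl is required but was not found in PATH" >&2
  exit 1
fi
```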
$ python ./benchmark-e2e.py --port 8000 --model "meta-llama/Llama-3.2-1B" --cuda-device 0
Using port: 8000
Removing /home/ubuntu/vllm/benchmark-e2e/benchmark-compare
Removing /home/ubuntu/vllm/benchmark-e2e/venv-vllm
Removing /home/ubuntu/vllm/benchmark-e2e/venv-vllm-src
Removing /home/ubuntu/vllm/benchmark-e2e/venv-sgl
▶ git clone https://github.com/neuralmagic/benchmark-compare.git /home/ubuntu/vllm/benchmark-e2e/benchmark-compare
Cloning into '/home/ubuntu/vllm/benchmark-e2e/benchmark-compare'...
remote: Enumerating objects: 78, done.
$ go build
$ ./benchmark-go --port 8000 --model meta-llama/Llama-3.2-1B --cuda-device 0
[main] 2025/04/23 04:20:18 Using port: 8000
[main] 2025/04/23 04:20:18 Removing /home/ubuntu/vllm/benchmark-go/benchmark-compare
[main] 2025/04/23 04:20:18 Removing /home/ubuntu/vllm/benchmark-go/venv-vllm
[main] 2025/04/23 04:20:19 Removing /home/ubuntu/vllm/benchmark-go/venv-vllm-src
[main] 2025/04/23 04:20:19 Removing /home/ubuntu/vllm/benchmark-go/venv-sgl
[main] 2025/04/23 04:20:19 ▶ git clone https://github.com/neuralmagic/benchmark-compare.git /home/ubuntu/vllm/benchmark-go/benchmark-compare
Cloning into '/home/ubuntu/vllm/benchmark-go/benchmark-compare'...
podman run --rm -it --network host -e MODEL=meta-llama/Llama-3.2-1B -e FRAMEWORK=vllm -e HF_TOKEN="${HF_TOKEN}" -e PORT=8000 -e HOST=172.31.37.101 -v "$(pwd)":/host:Z -w /opt/benchmark quay.io/bsalisbu/vllm-benchmark:latest
===== vllm - RUNNING meta-llama/Llama-3.2-1B FOR 120 PROMPTS WITH 1 QPS =====
INFO 04-23 01:38:59 [__init__.py:243] No platform detected, vLLM is running on UnspecifiedPlatform
Namespace(backend='vllm', base_url=None, host='127.0.0.1', port=8000, endpoint='/v1/completions', dataset_name='random', dataset_path=None, max_concurrency=None, model='meta-llama/Llama-3.2-1B', tokenizer=None, use_beam_search=False, num_prompts=120, logprobs=None, request_rate=1.0, burstiness=1.0, seed=1, trust_remote_code=False, disable_tqdm=False, profile=False, save_result=True, save_detailed=False, metadata=['framework=vllm'], result_dir=None, result_filename='results.json', ignore_eos=True, percentile_metrics='ttft,tpot,itl', metric_percentiles='99', goodput=None, sonn