git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
# build with the ROCm/HIP backend; hipconfig supplies the compiler and HIP install paths
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1101 -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
&& cmake --build build --config Release -- -j 21
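# quick sanity check (assumes the ROCm CLI tools rocminfo/rocm-smi are on PATH):
# the runtime should see the discrete card and report its gfx target
rocminfo | grep -i "gfx"
rocm-smi --showproductname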
# this is important: spoof the GPU as gfx1100 so the ROCm runtime uses its RDNA3 kernels on the gfx1101 card
export HSA_OVERRIDE_GFX_VERSION=11.0.0
export AMDGPU_TARGETS=gfx1101
export ROCM_PATH=/opt/rocm
export HIP_VISIBLE_DEVICES=0
export ROCR_VISIBLE_DEVICES=0
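# optional check (assumes rocminfo is on PATH): with the override exported the
# HSA runtime should now report the card as gfx1100 rather than gfx1101
rocminfo | grep -i "gfx"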
# run qwen3: 28 of Qwen3-14B's layers fit in VRAM, hence -ngl 28
./build/bin/llama-cli \
--model /mnt/persist/models/llama/Qwen3-14B-Q5_K_M.gguf \
--jinja --color \
-ngl 28 -fa -sm row \
--temp 0.6 --top-k 20 --top-p 0.95 --min-p 0 --presence-penalty 1.5 \
-c 40960 -n 32768 \
--no-context-shift
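# while the model is loaded, watch VRAM usage from another shell to tune -ngl up or down
# (rocm-smi ships with ROCm)
watch -n 1 'rocm-smi --showmeminfo vram'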
# run gemma3 with a one-shot prompt (non-interactive, -no-cnv)
./build/bin/llama-cli \
--model /mnt/persist/models/gemma/gemma-3-12b-it-qat-int4-Q4_K_M.gguf \
--threads 22 \
--ctx-size 16384 \
--n-gpu-layers 24 \
--seed 3407 \
--prio 2 \
--temp 1.0 \
--repeat-penalty 1.0 \
--min-p 0.01 \
--top-k 64 \
--top-p 0.95 \
-no-cnv \
--prompt "<start_of_turn>user\nCreate a Flappy Bird game in Python. You must include these things:\n1. You must use pygame.\n2. The background color should be randomly chosen and is a light shade. Start with a light blue color.\n3. Pressing SPACE multiple times will accelerate the bird.\n4. The bird's shape should be randomly chosen as a square, circle or triangle. The color should be randomly chosen as a dark color.\n5. Place on the bottom some land colored as dark brown or yellow chosen randomly.\n6. Make a score shown on the top right side. Increment if you pass pipes and don't hit them.\n7. Make randomly spaced pipes with enough space. Color them randomly as dark green or light brown or a dark gray shade.\n8. When you lose, show the best score. Make the text inside the screen. Pressing q or Esc will quit the game. Restarting is pressing SPACE again.\nThe final game should be inside a markdown section in Python. Check your code for errors and fix them before the final markdown section.<end_of_turn>\n<start_of_turn>model\n"
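# optional: redirect the run above to a file (append "> /tmp/flappy.out" to the command)
# and pull the generated Python block out of the markdown answer; file names here are arbitrary
awk '/^```python/{p=1;next} /^```/{p=0} p' /tmp/flappy.out > flappy_bird.py
python3 flappy_bird.py   # needs pygame: pip install pygame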
# run gemma3 server
./build/bin/llama-server \
--model /mnt/persist/models/gemma/gemma-3-12b-it-qat-int4-Q4_K_M.gguf \
--threads 22 \
--ctx-size 128000 \
--n-gpu-layers 24 \
--seed 3407 \
--prio 2 \
--temp 1.0 \
--repeat-penalty 1.0 \
--min-p 0.01 \
--top-k 64 \
--top-p 0.95 \
--port 4269 --host 0.0.0.0
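# the server exposes llama.cpp's OpenAI-compatible HTTP API; quick smoke test
# from another shell (port matches --port above)
curl http://localhost:4269/health
curl http://localhost:4269/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{"messages": [{"role": "user", "content": "Say hello in one sentence."}], "max_tokens": 64}'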