Skip to content

Instantly share code, notes, and snippets.

@evadne
Created February 11, 2025 02:58
Show Gist options
  • Save evadne/4791cefd8f6a01d51e0fb52c60a52f27 to your computer and use it in GitHub Desktop.
LLaMA.cpp Server on Incus
# Incus instance configuration: CPU-only llama.cpp inference worker.
# cloud-init builds llama.cpp from source, installs the binaries into /app,
# and runs llama-server as a systemd service pinned to one NUMA domain.
architecture: x86_64
config:
  cloud-init.user-data: |
    #cloud-config
    package_upgrade: true
    packages:
      - apt-transport-https
      - build-essential
      - ca-certificates
      - cmake
      - git
      - gnupg
      - libopenblas-dev
      - libssl-dev
      - lsb-release
      - numactl
      - libomp-dev
      - pkg-config
      - software-properties-common
      - wget
    write_files:
      - path: /setup/setup-application.sh
        owner: root:root
        permissions: '0700'
        content: |
          #!/usr/bin/env bash
          set -euxo pipefail
          # Ensure the install target exists; without this the first `cp`
          # below would create a regular file named /app instead.
          mkdir -p /app
          git clone \
            --depth=1 \
            --single-branch \
            --branch=master \
            https://github.com/ggerganov/llama.cpp \
            /run/llama.cpp
          cd /run/llama.cpp
          cmake -S . -B build \
            -DCMAKE_BUILD_TYPE=Release \
            -DLLAMA_CURL=OFF \
            -DLLAMA_BUILD_EXAMPLES=ON \
            -DGGML_NATIVE=OFF \
            -DGGML_BACKEND_DL=ON \
            -DGGML_CCACHE=OFF \
            -DGGML_BLAS=ON \
            -DGGML_BLAS_VENDOR=OpenBLAS
          cmake --build build --config Release -j $(nproc)
          # Install the dynamically loaded ggml backends next to the binaries
          # (GGML_BACKEND_DL=ON discovers *.so relative to the executable).
          find build -name "*.so" -exec cp {} /app \;
          cp build/bin/llama-cli /app
          cp build/bin/llama-server /app
          cd /app
          # The build tree is deleted, so the service must run from /app.
          rm -rf /run/llama.cpp
          # Pick up the unit file written by cloud-init before enabling it.
          systemctl daemon-reload
          systemctl enable llama-server
          systemctl start llama-server
      - path: /etc/systemd/system/llama-server.service
        owner: root:root
        permissions: '0644'
        content: |
          [Unit]
          Description=llama.cpp server
          After=network-online.target
          [Service]
          Type=exec
          Restart=always
          WorkingDirectory=/app
          User=root
          Group=root
          # Binary lives at /app/llama-server (the build tree under
          # /run/llama.cpp is removed by the setup script).
          ExecStart=numactl --cpunodebind=12-23 --interleave=12-23 -- /app/llama-server --host 0.0.0.0 --port 80 --threads-http 8 --cache-type-k q4_0 -t 48 -c 16384 --temp 0.6 -s 42 --no-kv-offload -m /media/models/unsloth/DeepSeek-R1-GGUF/DeepSeek-R1-UD-IQ2_XXS/DeepSeek-R1-UD-IQ2_XXS-00001-of-00004.gguf
          AmbientCapabilities=CAP_NET_BIND_SERVICE
          [Install]
          WantedBy=multi-user.target
    runcmd:
      - /setup/setup-application.sh
  image.architecture: amd64
  image.description: Debian bookworm amd64 (20250210_05:24)
  image.os: Debian
  image.release: bookworm
  image.serial: "20250210_05:24"
  image.type: squashfs
  image.variant: cloud
  limits.cpu: 96-191,288-383
  limits.cpu.priority: "0"
  security.privileged: "true"
  security.syscalls.intercept.mount: "true"
  security.syscalls.intercept.mount.allowed: hugetlbfs
  # volatile.* keys below are managed by Incus at runtime — do not edit.
  volatile.base_image: 1103adaa8aa5a0f10bb5fd3268bafea99e9b7662e88e739b024dad509934dd5d
  volatile.cloud-init.instance-id: e34cb050-295a-4e26-871c-47d6b4f7b298
  volatile.cpu.nodes: "6"
  volatile.eth0.host_name: veth7aa64d0d
  volatile.eth0.hwaddr: 00:16:3e:d4:91:4c
  volatile.idmap.base: "0"
  volatile.idmap.current: '[]'
  volatile.last_state.power: RUNNING
  volatile.last_state.ready: "false"
  volatile.network-primary.host_name: enp65s0f0v1
  volatile.network-primary.hwaddr: 00:16:3e:b2:36:e6
  volatile.network-primary.last_state.created: "false"
  volatile.network-primary.last_state.hwaddr: 4a:c1:1c:f5:d5:e2
  volatile.network-primary.last_state.mtu: "1500"
  volatile.network-primary.last_state.vf.hwaddr: "00:00:00:00:00:00"
  volatile.network-primary.last_state.vf.id: "1"
  volatile.network-primary.last_state.vf.parent: enp65s0f0np0
  volatile.network-primary.last_state.vf.spoofcheck: "false"
  volatile.network-primary.last_state.vf.vlan: "0"
  volatile.network-primary.name: eth1
  volatile.network-sanctuary.host_name: enp65s0f1v1
  volatile.network-sanctuary.hwaddr: 00:16:3e:0e:9a:e7
  volatile.network-sanctuary.last_state.created: "false"
  volatile.network-sanctuary.last_state.hwaddr: 16:43:4e:ea:13:50
  volatile.network-sanctuary.last_state.mtu: "1500"
  volatile.network-sanctuary.last_state.vf.hwaddr: "00:00:00:00:00:00"
  volatile.network-sanctuary.last_state.vf.id: "1"
  volatile.network-sanctuary.last_state.vf.parent: enp65s0f1np1
  volatile.network-sanctuary.last_state.vf.spoofcheck: "false"
  volatile.network-sanctuary.last_state.vf.vlan: "0"
  volatile.network-sanctuary.name: eth2
  volatile.uuid: e8cc8ac6-2b39-45d3-8c59-e8cea59d48e7
  volatile.uuid.generation: e8cc8ac6-2b39-45d3-8c59-e8cea59d48e7
devices:
  # Read-only-in-practice model volume mounted from the storage pool.
  disk-models:
    path: /media/models
    pool: hercules
    source: hypnos-models
    type: disk
  root:
    path: /
    pool: hercules
    size: 32GiB
    type: disk
ephemeral: false
profiles:
  - default
  - infrastructure-network-cpu0
stateful: false
description: LLaMA.cpp Inference Worker
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment