Modified nvidia-device-plugin.yml that works for k3s (at least for me)
# tweaked from https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.12.2/nvidia-device-plugin.yml
# credit to https://github.com/hansaya for the comment at https://github.com/k3s-io/k3s/issues/4391#issuecomment-1194627754
# note: you need to label your GPU nodes with gpu=yes for this to work (I had some non-GPU nodes,
# and without the label the discovery pods kept crashlooping on those nodes because the system
# never worked out that they had no GPUs); see the example below
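#
# for example (the node name "my-gpu-node" is a placeholder):
#
#   kubectl label node my-gpu-node gpu=yes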
#
# prerequisites
#
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
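# note: this assumes the "nvidia" runtime is registered with k3s' embedded containerd
# (recent k3s releases detect the NVIDIA container toolkit on the node and add it
# automatically); a quick sanity check on a GPU node:
#
#   grep nvidia /var/lib/rancher/k3s/agent/etc/containerd/config.toml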
---
# From https://github.com/NVIDIA/gpu-feature-discovery/blob/main/deployments/static/nfd.yaml
# Modified to set `runtimeClassName: nvidia`, as k3s does not replace the default container runtime
apiVersion: v1
kind: Namespace
metadata:
  name: node-feature-discovery
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nfd-master
  namespace: node-feature-discovery
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nfd-master
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - patch
  - update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nfd-master
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nfd-master
subjects:
- kind: ServiceAccount
  name: nfd-master
  namespace: node-feature-discovery
---
#
# feature discovery
#
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: nfd
  name: nfd
  namespace: node-feature-discovery
spec:
  selector:
    matchLabels:
      app: nfd
  template:
    metadata:
      labels:
        app: nfd
    spec:
      serviceAccount: nfd-master
      runtimeClassName: nvidia
      containers:
      - env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        image: quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0
        name: nfd-master
        command:
        - "nfd-master"
        args:
        - "--extra-label-ns=nvidia.com"
      - env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        image: quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0
        name: nfd-worker
        command:
        - "nfd-worker"
        args:
        - "--sleep-interval=60s"
        - "--options={\"sources\": {\"pci\": { \"deviceLabelFields\": [\"vendor\"] }}}"
        volumeMounts:
        - name: host-boot
          mountPath: "/host-boot"
          readOnly: true
        - name: host-os-release
          mountPath: "/host-etc/os-release"
          readOnly: true
        - name: host-sys
          mountPath: "/host-sys"
        - name: source-d
          mountPath: "/etc/kubernetes/node-feature-discovery/source.d/"
        - name: features-d
          mountPath: "/etc/kubernetes/node-feature-discovery/features.d/"
      nodeSelector:
        gpu: "yes"
      volumes:
      - name: host-boot
        hostPath:
          path: "/boot"
      - name: host-os-release
        hostPath:
          path: "/etc/os-release"
      - name: host-sys
        hostPath:
          path: "/sys"
      - name: source-d
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/source.d/"
      - name: features-d
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/features.d/"
---
# From https://github.com/NVIDIA/gpu-feature-discovery/blob/main/deployments/static/gpu-feature-discovery-daemonset.yaml
# Modified to set `runtimeClassName: nvidia`, as k3s does not replace the default container runtime
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpu-feature-discovery
  namespace: node-feature-discovery
  labels:
    app.kubernetes.io/name: gpu-feature-discovery
    app.kubernetes.io/version: 0.6.1
    app.kubernetes.io/part-of: nvidia-gpu
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: gpu-feature-discovery
      app.kubernetes.io/part-of: nvidia-gpu
  template:
    metadata:
      labels:
        app.kubernetes.io/name: gpu-feature-discovery
        app.kubernetes.io/version: 0.6.1
        app.kubernetes.io/part-of: nvidia-gpu
    spec:
      runtimeClassName: nvidia
      containers:
      - image: nvcr.io/nvidia/gpu-feature-discovery:v0.6.1
        name: gpu-feature-discovery
        volumeMounts:
        - name: output-dir
          mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
        - name: dmi-product-name
          mountPath: "/sys/class/dmi/id/product_name"
        securityContext:
          privileged: true
        env:
        - name: MIG_STRATEGY
          value: none
      nodeSelector:
        gpu: "yes"
        feature.node.kubernetes.io/pci-10de.present: "true" # NVIDIA vendor ID
      volumes:
      - name: output-dir
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/features.d"
      - name: dmi-product-name
        hostPath:
          path: "/sys/class/dmi/id/product_name"
---
#
# plugin
#
# From https://github.com/NVIDIA/k8s-device-plugin/blob/master/nvidia-device-plugin.yml
# Modified to set `runtimeClassName: nvidia`, as k3s does not replace the default container runtime
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      runtimeClassName: "nvidia"
      containers:
      - image: nvcr.io/nvidia/k8s-device-plugin:v0.12.2
        name: nvidia-device-plugin-ctr
        env:
        - name: FAIL_ON_INIT_ERROR
          value: "false"
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: ["ALL"]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      nodeSelector:
        gpu: "yes"
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
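#
# verification (optional)
#
# a minimal smoke test, left commented out so it isn't created by default; the CUDA
# image tag is an assumption, any image that ships nvidia-smi should do:
#
# apiVersion: v1
# kind: Pod
# metadata:
#   name: gpu-smoke-test
# spec:
#   runtimeClassName: nvidia
#   restartPolicy: Never
#   containers:
#   - name: nvidia-smi
#     image: nvidia/cuda:11.6.2-base-ubuntu20.04
#     command: ["nvidia-smi"]
#     resources:
#       limits:
#         nvidia.com/gpu: 1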