Modified nvidia-device-plugin.yml that works for k3s (at least for me)
# tweaked from https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.12.2/nvidia-device-plugin.yml
# credit to https://github.com/hansaya for the comment at https://github.com/k3s-io/k3s/issues/4391#issuecomment-1194627754
# note: you need to label your GPU nodes with gpu=yes for this to work (I had some non-GPU nodes and the discovery pods kept crashlooping
# on those nodes without the system working out they had no GPUs)
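# For example, to add that label (the node name below is a placeholder; use your own):
#   kubectl label node <your-gpu-node> gpu=yes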
#
# prerequisites
#
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: nvidia
handler: nvidia
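# k3s should register an "nvidia" containerd runtime for this handler automatically when the
# NVIDIA Container Toolkit is installed before the k3s agent starts; on a default install you
# can confirm it with (path assumes k3s's standard data directory):
#   grep nvidia /var/lib/rancher/k3s/agent/etc/containerd/config.toml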
---
# From https://github.com/NVIDIA/gpu-feature-discovery/blob/main/deployments/static/nfd.yaml
# Modified to set `runtimeClassName: nvidia`, as k3s does not replace the default container runtime
apiVersion: v1
kind: Namespace
metadata:
  name: node-feature-discovery
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nfd-master
  namespace: node-feature-discovery
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: nfd-master
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  verbs:
  - get
  - patch
  - update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: nfd-master
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: nfd-master
subjects:
- kind: ServiceAccount
  name: nfd-master
  namespace: node-feature-discovery
---
#
# feature discovery
#
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: nfd
  name: nfd
  namespace: node-feature-discovery
spec:
  selector:
    matchLabels:
      app: nfd
  template:
    metadata:
      labels:
        app: nfd
    spec:
      serviceAccount: nfd-master
      runtimeClassName: nvidia
      containers:
      - env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        image: quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0
        name: nfd-master
        command:
        - "nfd-master"
        args:
        - "--extra-label-ns=nvidia.com"
      - env:
        - name: NODE_NAME
          valueFrom:
            fieldRef:
              fieldPath: spec.nodeName
        image: quay.io/kubernetes_incubator/node-feature-discovery:v0.6.0
        name: nfd-worker
        command:
        - "nfd-worker"
        args:
        - "--sleep-interval=60s"
        - "--options={\"sources\": {\"pci\": { \"deviceLabelFields\": [\"vendor\"] }}}"
        volumeMounts:
        - name: host-boot
          mountPath: "/host-boot"
          readOnly: true
        - name: host-os-release
          mountPath: "/host-etc/os-release"
          readOnly: true
        - name: host-sys
          mountPath: "/host-sys"
        - name: source-d
          mountPath: "/etc/kubernetes/node-feature-discovery/source.d/"
        - name: features-d
          mountPath: "/etc/kubernetes/node-feature-discovery/features.d/"
      nodeSelector:
        gpu: "yes"
      volumes:
      - name: host-boot
        hostPath:
          path: "/boot"
      - name: host-os-release
        hostPath:
          path: "/etc/os-release"
      - name: host-sys
        hostPath:
          path: "/sys"
      - name: source-d
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/source.d/"
      - name: features-d
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/features.d/"
---
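# Once the nfd-worker pods are running, GPU nodes should pick up PCI labels such as
# feature.node.kubernetes.io/pci-10de.present=true (10de is NVIDIA's PCI vendor ID), which the
# gpu-feature-discovery DaemonSet below selects on. To check (node name is a placeholder):
#   kubectl get node <your-gpu-node> --show-labels | tr ',' '\n' | grep pci-10de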
# From https://github.com/NVIDIA/gpu-feature-discovery/blob/main/deployments/static/gpu-feature-discovery-daemonset.yaml
# Modified to set `runtimeClassName: nvidia`, as k3s does not replace the default container runtime
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: gpu-feature-discovery
  namespace: node-feature-discovery
  labels:
    app.kubernetes.io/name: gpu-feature-discovery
    app.kubernetes.io/version: 0.6.1
    app.kubernetes.io/part-of: nvidia-gpu
spec:
  selector:
    matchLabels:
      app.kubernetes.io/name: gpu-feature-discovery
      app.kubernetes.io/part-of: nvidia-gpu
  template:
    metadata:
      labels:
        app.kubernetes.io/name: gpu-feature-discovery
        app.kubernetes.io/version: 0.6.1
        app.kubernetes.io/part-of: nvidia-gpu
    spec:
      runtimeClassName: nvidia
      containers:
      - image: nvcr.io/nvidia/gpu-feature-discovery:v0.6.1
        name: gpu-feature-discovery
        volumeMounts:
        - name: output-dir
          mountPath: "/etc/kubernetes/node-feature-discovery/features.d"
        - name: dmi-product-name
          mountPath: "/sys/class/dmi/id/product_name"
        securityContext:
          privileged: true
        env:
        - name: MIG_STRATEGY
          value: none
      nodeSelector:
        gpu: "yes"
        feature.node.kubernetes.io/pci-10de.present: "true" # NVIDIA vendor ID
      volumes:
      - name: output-dir
        hostPath:
          path: "/etc/kubernetes/node-feature-discovery/features.d"
      - name: dmi-product-name
        hostPath:
          path: "/sys/class/dmi/id/product_name"
---
#
# plugin
#
# From https://github.com/NVIDIA/k8s-device-plugin/blob/master/nvidia-device-plugin.yml
# Modified to set `runtimeClassName: nvidia`, as k3s does not replace the default container runtime
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-device-plugin-daemonset
  namespace: kube-system
spec:
  selector:
    matchLabels:
      name: nvidia-device-plugin-ds
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-device-plugin-ds
    spec:
      tolerations:
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule
      # Mark this pod as a critical add-on; when enabled, the critical add-on
      # scheduler reserves resources for critical add-on pods so that they can
      # be rescheduled after a failure.
      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
      priorityClassName: "system-node-critical"
      runtimeClassName: "nvidia"
      containers:
      - image: nvcr.io/nvidia/k8s-device-plugin:v0.12.2
        name: nvidia-device-plugin-ctr
        env:
        - name: FAIL_ON_INIT_ERROR
          value: "false"
        securityContext:
          allowPrivilegeEscalation: false
          capabilities:
            drop: [ "ALL" ]
        volumeMounts:
        - name: device-plugin
          mountPath: /var/lib/kubelet/device-plugins
      nodeSelector:
        gpu: "yes"
      volumes:
      - name: device-plugin
        hostPath:
          path: /var/lib/kubelet/device-plugins
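---
# Optional smoke test (not part of the upstream manifests): a one-shot pod that requests a GPU
# through the plugin and runs nvidia-smi. The image tag is just an example; swap in any CUDA
# base image you have access to, and delete this document if you don't want the pod created on
# every apply.
apiVersion: v1
kind: Pod
metadata:
  name: gpu-smoke-test
  namespace: default
spec:
  restartPolicy: Never
  runtimeClassName: nvidia
  nodeSelector:
    gpu: "yes"
  containers:
  - name: nvidia-smi
    image: nvidia/cuda:11.7.1-base-ubuntu20.04
    command: ["nvidia-smi"]
    resources:
      limits:
        nvidia.com/gpu: 1
# If everything is wired up, `kubectl logs gpu-smoke-test` should print the nvidia-smi table.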