Skip to content

Instantly share code, notes, and snippets.

@MMMarcy
Created August 11, 2024 14:12
Show Gist options
  • Save MMMarcy/8f8bc3f660e01eb46cf28c654d373a62 to your computer and use it in GitHub Desktop.
Save MMMarcy/8f8bc3f660e01eb46cf28c654d373a62 to your computer and use it in GitHub Desktop.
Configure the NVIDIA device plugin in Kubernetes using Pulumi
"""Installs and configures the nvidia device plugin."""
from textwrap import dedent
import pulumi
from pulumi_kubernetes.apps.v1 import (
DaemonSet,
DaemonSetSpecArgs,
DaemonSetUpdateStrategyArgs,
)
from pulumi_kubernetes.core.v1 import (
CapabilitiesArgs,
ConfigMap,
ConfigMapVolumeSourceArgs,
ContainerArgs,
EnvVarArgs,
HostPathVolumeSourceArgs,
KeyToPathArgs,
PodSpecArgs,
PodTemplateSpecArgs,
SecurityContextArgs,
TolerationArgs,
VolumeArgs,
VolumeMountArgs,
)
from pulumi_kubernetes.meta.v1 import LabelSelectorArgs, ObjectMetaArgs
def _device_plugin_config(replicas: int) -> str:
    """Render the device-plugin configuration as a JSON document.

    Args:
        replicas: Value written to the ``replicas`` field of the
            ``nvidia.com/gpu`` entry under ``sharing.timeSlicing.resources``.

    Returns:
        The configuration serialized as indented JSON, newline-terminated.
    """
    # Local import so this block does not depend on the file's import header.
    import json

    config = {
        "version": "v1",
        "flags": {
            "migStrategy": "mixed",
            "failOnInitError": False,
            "mpsRoot": "",
            "nvidiaDriverRoot": "/",
            "nvidiaDevRoot": "/",
            "gdsEnabled": False,
            "mofedEnabled": False,
            "useNodeFeatureAPI": None,
            "deviceDiscoveryStrategy": "auto",
            "plugin": {
                "passDeviceSpecs": False,
                "deviceListStrategy": ["envvar"],
                "deviceIDStrategy": "uuid",
                "cdiAnnotationPrefix": "cdi.k8s.io/",
                "nvidiaCTKPath": "/usr/bin/nvidia-ctk",
                "containerDriverRoot": "/driver-root",
            },
        },
        "resources": {
            "gpus": [{"pattern": "*", "name": "nvidia.com/gpu"}],
        },
        "sharing": {
            "timeSlicing": {
                "resources": [{"name": "nvidia.com/gpu", "replicas": replicas}],
            },
        },
    }
    # Serializing a dict guarantees well-formed JSON, unlike a hand-edited
    # string literal; insertion order of the keys is preserved.
    return json.dumps(config, indent=2) + "\n"


def provide_nvidia_demonset(*, replicas: int = 2) -> DaemonSet:
    """Create the NVIDIA device-plugin ConfigMap and DaemonSet in ``kube-system``.

    The ConfigMap carries the plugin's ``config.json`` (including the
    time-slicing settings); the DaemonSet mounts that file at ``/config`` and
    points the plugin at it through the ``CONFIG_FILE`` environment variable.
    Both resource names are also published as the ``nvidia-device-plugin``
    stack output.

    The historical ``demonset`` spelling of the function name is kept so
    existing callers continue to work.

    Args:
        replicas: Time-slicing replica count configured for the
            ``nvidia.com/gpu`` resource. Defaults to 2.

    Returns:
        The created ``DaemonSet`` resource.

    Raises:
        ValueError: If ``replicas`` is smaller than 1.
    """
    # Local import so this block does not depend on the file's import header.
    import hashlib

    if replicas < 1:
        raise ValueError(f"replicas must be a positive integer, got {replicas!r}")

    config_json = _device_plugin_config(replicas)
    # The config file is mounted via ``sub_path``; Kubernetes does not refresh
    # such mounts when the ConfigMap changes. Stamping the pod template with a
    # digest of the config makes any config change alter the template, which
    # triggers the DaemonSet's rolling update.
    config_checksum = hashlib.sha256(config_json.encode("utf-8")).hexdigest()

    cm = ConfigMap(
        "nvidia-device-plugin-config-map",
        metadata=ObjectMetaArgs(
            name="nvidia-device-plugin-config-map",
            namespace="kube-system",
        ),
        data={"config.json": config_json},
    )

    pod_labels = {"name": "nvidia-device-plugin-ds"}

    plugin_container = ContainerArgs(
        name="nvidia-device-plugin-ctr",
        image="nvcr.io/nvidia/k8s-device-plugin:v0.16.2",
        env=[
            EnvVarArgs(name="FAIL_ON_INIT_ERROR", value="false"),
            EnvVarArgs(name="CONFIG_FILE", value="/config"),
        ],
        security_context=SecurityContextArgs(
            allow_privilege_escalation=False,
            # Upper-case "ALL" is the spelling the Kubernetes Pod Security
            # Standards check for.
            capabilities=CapabilitiesArgs(drop=["ALL"]),
        ),
        volume_mounts=[
            VolumeMountArgs(
                name="device-plugin",
                mount_path="/var/lib/kubelet/device-plugins",
            ),
            # Mount only the single config.json key as the file /config.
            VolumeMountArgs(
                name="configuration",
                mount_path="/config",
                sub_path="config.json",
            ),
        ],
    )

    pod_volumes = [
        VolumeArgs(
            name="device-plugin",
            host_path=HostPathVolumeSourceArgs(
                path="/var/lib/kubelet/device-plugins"
            ),
        ),
        VolumeArgs(
            name="configuration",
            config_map=ConfigMapVolumeSourceArgs(
                # Referencing the ConfigMap's output (rather than repeating the
                # literal name) makes the DaemonSet depend on the ConfigMap.
                name=cm.metadata["name"],
                items=[KeyToPathArgs(key="config.json", path="config.json")],
            ),
        ),
    ]

    ds = DaemonSet(
        "nvidia-device-plugin",
        metadata=ObjectMetaArgs(
            name="nvidia-device-plugin-daemonset",
            namespace="kube-system",
        ),
        spec=DaemonSetSpecArgs(
            selector=LabelSelectorArgs(match_labels=pod_labels),
            update_strategy=DaemonSetUpdateStrategyArgs(type="RollingUpdate"),
            template=PodTemplateSpecArgs(
                metadata=ObjectMetaArgs(
                    labels=pod_labels,
                    annotations={"checksum/config": config_checksum},
                ),
                spec=PodSpecArgs(
                    tolerations=[
                        TolerationArgs(
                            key="nvidia.com/gpu",
                            operator="Exists",
                            effect="NoSchedule",
                        )
                    ],
                    priority_class_name="system-node-critical",
                    containers=[plugin_container],
                    volumes=pod_volumes,
                ),
            ),
        ),
    )

    pulumi.export(
        "nvidia-device-plugin",
        {
            "config_map": cm.metadata["name"],
            "daemon_set": ds.metadata["name"],
        },
    )
    return ds
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment