Skip to content

Instantly share code, notes, and snippets.

@cfstras
Created June 8, 2026 14:57
Show Gist options
  • Select an option

  • Save cfstras/98e6896fb599869c5ce5f3647a017d2c to your computer and use it in GitHub Desktop.

Select an option

Save cfstras/98e6896fb599869c5ce5f3647a017d2c to your computer and use it in GitHub Desktop.
# Identity used by the mig-config-enforcer DaemonSet pods; permissions are
# attached through the ClusterRoleBinding of the same name.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: mig-config-enforcer
  namespace: gpu-operator
# The enforcer script calls kubectl from inside the pod, so the API token
# has to be mounted.
automountServiceAccountToken: true
---
# Permissions for the kubectl calls issued by mig-config-enforcer.sh
# (get node, drain, get pods, exec, delete pod, uncordon).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: mig-config-enforcer
rules:
  # Read the node's capacity; cordon/uncordon it via drain and uncordon.
  - apiGroups: [""]
    resources: [nodes]
    verbs: [get, patch]
  # Look up the mig-manager pod, list pods for drain, delete the
  # device-plugin pod.
  # NOTE(review): no call in the script obviously needs `watch` - confirm.
  - apiGroups: [""]
    resources: [pods]
    verbs: [get, watch, list, delete]
  # `kubectl exec` into the mig-manager pod.
  - apiGroups: [""]
    resources: [pods/exec]
    verbs: [create]
  # Evictions issued by `kubectl drain`.
  - apiGroups: [""]
    resources: [pods/eviction]
    verbs: [create]
  # Presumably used by `kubectl drain` to resolve pod owners
  # (--ignore-daemonsets) - confirm against the kubectl version in use.
  - apiGroups: [apps]
    resources: [daemonsets, replicasets]
    verbs: [get, list]
---
# Grants the mig-config-enforcer ClusterRole to the ServiceAccount of the
# same name in the gpu-operator namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: mig-config-enforcer
subjects:
  - kind: ServiceAccount
    name: mig-config-enforcer
    namespace: gpu-operator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: mig-config-enforcer
---
# Runs mig-config-enforcer.sh (from the ConfigMap of the same name) on every
# node carrying the nvidia.com/gpu.deploy.mig-manager=true label.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: mig-config-enforcer
  namespace: gpu-operator
spec:
  selector:
    matchLabels:
      app: mig-config-enforcer
  template:
    metadata:
      labels:
        app: mig-config-enforcer
    spec:
      serviceAccountName: mig-config-enforcer
      restartPolicy: Always
      nodeSelector:
        # Label values are strings. An unquoted true is parsed as a YAML
        # boolean and the manifest is rejected, so it must be quoted.
        nvidia.com/gpu.deploy.mig-manager: "true"
      tolerations:
        # A key-less Exists toleration matches every taint.
        - operator: Exists
      containers:
        - name: enforcer
          image: alpine:3
          imagePullPolicy: IfNotPresent
          command:
            - /bin/sh
            - "-eu"
            - "-c"
          args:
            # Tools are installed at container start, then the shell is
            # replaced by the enforcer script.
            - >-
              apk add --no-cache kubectl jq bash
              && exec bash /scripts/mig-config-enforcer.sh
          env:
            # Name of the node this pod runs on; the script acts on it.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: spec.nodeName
          resources:
            limits:
              memory: 64Mi
            requests:
              cpu: 10m
              memory: 64Mi
          volumeMounts:
            - name: mig-config-enforcer-scripts
              mountPath: /scripts
              mountPropagation: None
      volumes:
        - name: mig-config-enforcer-scripts
          configMap:
            name: mig-config-enforcer
            # 493 decimal == 0755 octal (rwxr-xr-x).
            defaultMode: 493
            optional: false
---
# Holds the enforcer script that the mig-config-enforcer DaemonSet mounts
# at /scripts.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mig-config-enforcer
  namespace: gpu-operator
data:
  mig-config-enforcer.sh: |
    #!/bin/bash
    # Every 30s: check whether node $NODE_NAME advertises any
    # nvidia.com/mig-* capacity. If it does not, drain the node, apply the
    # b200-mixed MIG config through the mig-manager pod, restart the device
    # plugin pod on the node and uncordon the node again.
    set -euo pipefail

    # Evaluates to true when at least one nvidia.com/mig-* capacity entry is
    # greater than zero. The per-resource results are collected into an
    # array and reduced with `any`, so nodes exposing several MIG profiles
    # still yield a single boolean (a bare stream of "true"/"false" lines
    # would never compare equal to "true").
    HAS_MIG_FILTER='
      [ (.status.capacity // {})
        | to_entries[]
        | select(.key | startswith("nvidia.com/mig-"))
        | .value | tonumber > 0
      ] | any'

    while true; do
      CAPACITY=$(kubectl get node "$NODE_NAME" -o json 2>/dev/null || echo "")
      if [[ -z "$CAPACITY" ]]; then
        # A failed API call says nothing about the MIG state; never drain
        # the node because of it.
        echo "$(date): Could not read node $NODE_NAME, retrying in 30s..."
      elif jq -e "$HAS_MIG_FILTER" <<<"$CAPACITY" >/dev/null; then
        echo "$(date): MIG resources present on $NODE_NAME"
        jq '.status.capacity | to_entries[] | select(.key | startswith("nvidia.com/"))' <<<"$CAPACITY"
      else
        echo "$(date): No MIG resources on $NODE_NAME, applying b200-mixed config..."
        echo "Draining the node first, to get rid of GPU workloads..."
        kubectl drain "$NODE_NAME" --ignore-daemonsets --delete-emptydir-data --force --timeout=60s \
          || echo "Node drain failed, maybe no workloads or some workloads are ignoring the drain? Trying anyway..."
        echo "Applying MIG config via exec in mig pod manually..."
        # `|| true` keeps pipefail/errexit from killing the loop when grep
        # finds no mig-manager pod.
        POD=$(kubectl get pods -n gpu-operator \
          --field-selector "spec.nodeName=$NODE_NAME" \
          -o jsonpath='{.items[*].metadata.name}' 2>/dev/null \
          | tr ' ' '\n' | grep mig-manager | head -1 || true)
        if [[ -z "$POD" ]]; then
          # The node is only uncordoned in the branch below, i.e. on a
          # later iteration once the mig-manager pod is found.
          echo "$(date): mig-manager pod not found on $NODE_NAME, retrying in 30s..."
        else
          kubectl exec -n gpu-operator "$POD" -- \
            nvidia-mig-parted -d apply -f /mig-parted-config/config.yaml -c b200-mixed \
            && echo "$(date): MIG config applied" \
            || echo "$(date): Apply failed, retrying in 30s (did the node drain work?)..."
          echo "Restarting the device-plugin-daemonset on the node to pick up the new config..."
          kubectl delete pod -n gpu-operator -l app=nvidia-device-plugin-daemonset \
            --field-selector "spec.nodeName=$NODE_NAME" \
            || echo "Failed to restart device plugin, maybe no pods found? Retrying in 30s..."
          echo "Finally: uncordoning the node to allow workloads to be scheduled again..."
          kubectl uncordon "$NODE_NAME" || echo "Failed to uncordon node"
        fi
      fi
      sleep 30
    done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment