StevenACoffman · March 27, 2025 02:16
diff --git a/a node-problem-detector Readme.md b/a node-problem-detector Readme.md
diff --git a/draino.yaml b/draino.yaml
 ---
 # ServiceAccount that the draino Deployment runs as (it is the value of the
 # Deployment's serviceAccountName).
 apiVersion: v1
 kind: ServiceAccount
 metadata:
   name: draino
   namespace: kube-system
   labels:
     component: draino
 ---
 # Cluster-wide permissions for draino: write events, read and update nodes,
 # patch node status, read pods, create pod evictions, and read DaemonSets.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRole
 metadata:
   labels: {component: draino}
   name: draino
 rules:
 - apiGroups: ['']
   resources: [events]
   verbs: [create, patch, update]
 - apiGroups: ['']
   resources: [nodes]
   verbs: [get, watch, list, update]
 - apiGroups: ['']
   resources: [nodes/status]
   verbs: [patch]
 - apiGroups: ['']
   resources: [pods]
   verbs: [get, watch, list]
 - apiGroups: ['']
   resources: [pods/eviction]
   verbs: [create]
 # DaemonSets are served from the `apps` API group on current clusters; the
 # legacy `extensions` group is kept so that older clusters keep working.
 - apiGroups: [apps, extensions]
   resources: [daemonsets]
   verbs: [get, watch, list]
 ---
 # Grants the draino ClusterRole to the draino ServiceAccount in kube-system.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
   name: draino
   labels:
     component: draino
 roleRef:
   apiGroup: rbac.authorization.k8s.io
   kind: ClusterRole
   name: draino
 subjects:
 - kind: ServiceAccount
   name: draino
   namespace: kube-system
 ---
 # Runs a single draino pod in kube-system under the draino ServiceAccount.
 apiVersion: apps/v1
 kind: Deployment
 metadata:
   name: draino
   namespace: kube-system
   labels:
     component: draino
 spec:
   # Draino has no locking/master election yet, so only one instance should
   # run at a time. It does not begin draining nodes immediately, which makes
   # a brief overlap of multiple drainos usually safe.
   replicas: 1
   selector:
     matchLabels:
       component: draino
   template:
     metadata:
       name: draino
       namespace: kube-system
       labels:
         component: draino
     spec:
       serviceAccountName: draino
       containers:
       - name: draino
         image: planetlabs/draino:dbadb44
         # Adjust these labels and conditions to match your own deployment.
         command:
         - /draino
         - --debug
         - --evict-daemonset-pods
         - --evict-emptydir-pods
         - --evict-unreplicated-pods
         - KernelDeadlock
         - OutOfDisk
         # - ReadonlyFilesystem
         # - MemoryPressure
         # - DiskPressure
         # - PIDPressure
         livenessProbe:
           initialDelaySeconds: 30
           httpGet:
             path: /healthz
             port: 10002
diff --git a/node-problem-detector-config.yaml b/node-problem-detector-config.yaml
 # Monitor definitions for node-problem-detector. The DaemonSet mounts the two
 # keys below as files under /config and passes them via --system-log-monitors.
 apiVersion: v1
 kind: ConfigMap
 metadata:
   name: node-problem-detector-config
   namespace: kube-system
 data:
   # Kernel log monitor: reads /dev/kmsg with the kmsg plugin. "permanent"
   # rules name the node condition (KernelDeadlock or ReadonlyFilesystem) they
   # are tied to; "temporary" rules carry only a reason.
   kernel-monitor.json: |
     {
         "plugin": "kmsg",
         "logPath": "/dev/kmsg",
         "lookback": "5m",
         "bufferSize": 10,
         "source": "kernel-monitor",
         "conditions": [
             {
                 "type": "KernelDeadlock",
                 "reason": "KernelHasNoDeadlock",
                 "message": "kernel has no deadlock"
             },
             {
                 "type": "ReadonlyFilesystem",
                 "reason": "FilesystemIsReadOnly",
                 "message": "Filesystem is read-only"
             }
         ],
         "rules": [
             {
                 "type": "temporary",
                 "reason": "OOMKilling",
                 "pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*"
             },
             {
                 "type": "temporary",
                 "reason": "TaskHung",
                 "pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
             },
             {
                 "type": "temporary",
                 "reason": "UnregisterNetDevice",
                 "pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
             },
             {
                 "type": "temporary",
                 "reason": "KernelOops",
                 "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
             },
             {
                 "type": "temporary",
                 "reason": "KernelOops",
                 "pattern": "divide error: 0000 \\[#\\d+\\] SMP"
             },
             {
                 "type": "permanent",
                 "condition": "KernelDeadlock",
                 "reason": "AUFSUmountHung",
                 "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
             },
             {
                 "type": "permanent",
                 "condition": "KernelDeadlock",
                 "reason": "DockerHung",
                 "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
             },
             {
                 "type": "permanent",
                 "condition": "ReadonlyFilesystem",
                 "reason": "FilesystemIsReadOnly",
                 "pattern": "Remounting filesystem read-only"
             }
         ]
     }
   # Docker log monitor: reads the journal at /var/log/journal with the
   # journald plugin, using "dockerd" as the plugin source.
   docker-monitor.json: |
     {
         "plugin": "journald",
         "pluginConfig": {
             "source": "dockerd"
         },
         "logPath": "/var/log/journal",
         "lookback": "5m",
         "bufferSize": 10,
         "source": "docker-monitor",
         "conditions": [],
         "rules": [
             {
                 "type": "temporary",
                 "reason": "CorruptDockerImage",
                 "pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*"
             }
         ]
     }
diff --git a/node-problem-detector.yaml b/node-problem-detector.yaml
 # ServiceAccount that the node-problem-detector DaemonSet runs as.
 apiVersion: v1
 kind: ServiceAccount
 metadata:
   namespace: kube-system
   name: node-problem-detector
 ---
 # Grants the system:node-problem-detector ClusterRole to the
 # node-problem-detector ServiceAccount in kube-system.
 apiVersion: rbac.authorization.k8s.io/v1
 kind: ClusterRoleBinding
 metadata:
   name: npd-binding
 subjects:
 - kind: ServiceAccount
   namespace: kube-system
   name: node-problem-detector
 roleRef:
   kind: ClusterRole
   name: system:node-problem-detector
   apiGroup: rbac.authorization.k8s.io
 ---
 # Runs node-problem-detector as a DaemonSet in kube-system, with the host's
 # logs, /dev/kmsg and the monitor ConfigMap mounted read-only.
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
   namespace: kube-system
   name: node-problem-detector
 spec:
   selector:
     matchLabels:
       app: node-problem-detector
   template:
     metadata:
       labels:
         app: node-problem-detector
     spec:
       serviceAccountName: node-problem-detector
       containers:
       - name: node-problem-detector
         image: k8s.gcr.io/node-problem-detector:v0.6.2
         imagePullPolicy: Always
         command:
         - /node-problem-detector
         - --logtostderr
         - --system-log-monitors=/config/kernel-monitor.json,/config/docker-monitor.json
         env:
         # Exposes the name of the node this pod is scheduled on.
         - name: NODE_NAME
           valueFrom:
             fieldRef:
               fieldPath: spec.nodeName
         # NOTE(review): presumably needed to read /dev/kmsg - confirm.
         securityContext:
           privileged: true
         resources:
           requests:
             cpu: 10m
             memory: 80Mi
           limits:
             cpu: 10m
             memory: 80Mi
         volumeMounts:
         - name: log
           mountPath: /var/log
           readOnly: true
         - name: kmsg
           mountPath: /dev/kmsg
           readOnly: true
         # Share the host's /etc/localtime so that node problem detector
         # runs in the same timezone as the host.
         - name: localtime
           mountPath: /etc/localtime
           readOnly: true
         - name: config
           mountPath: /config
           readOnly: true
       volumes:
       - name: log
         # Point `log` at your system log directory.
         hostPath:
           path: /var/log/
       - name: kmsg
         hostPath:
           path: /dev/kmsg
       - name: localtime
         hostPath:
           path: /etc/localtime
       # Monitor definitions, projected from the ConfigMap key by key.
       - name: config
         configMap:
           name: node-problem-detector-config
           items:
           - key: kernel-monitor.json
             path: kernel-monitor.json
           - key: docker-monitor.json
             path: docker-monitor.json
NodeCondition	Duration	Source	Draino Appropriate
KernelDeadlock	permanent	node-problem-detector	✅
ReadonlyFilesystem	permanent	node-problem-detector	✅
OutOfDisk	permanent?	❓	✅
MemoryPressure	Temporary?	❓	❌ ❓
DiskPressure	Temporary?	❓	❌ ❓
PIDPressure	Temporary?	❓	❌ ❓
Ready	Temporary 😅	N/A	❌
	---
	# ServiceAccount that the draino Deployment runs as.
	# Nesting restored: labels, name and namespace belong under metadata.
	apiVersion: v1
	kind: ServiceAccount
	metadata:
	  labels: {component: draino}
	  name: draino
	  namespace: kube-system
	---
	# Cluster-wide permissions for draino: write events, read and update nodes,
	# patch node status, read pods, create pod evictions, and read DaemonSets.
	# Nesting restored: each rule's resources/verbs belong to its list item.
	apiVersion: rbac.authorization.k8s.io/v1
	kind: ClusterRole
	metadata:
	  labels: {component: draino}
	  name: draino
	rules:
	- apiGroups: ['']
	  resources: [events]
	  verbs: [create, patch, update]
	- apiGroups: ['']
	  resources: [nodes]
	  verbs: [get, watch, list, update]
	- apiGroups: ['']
	  resources: [nodes/status]
	  verbs: [patch]
	- apiGroups: ['']
	  resources: [pods]
	  verbs: [get, watch, list]
	- apiGroups: ['']
	  resources: [pods/eviction]
	  verbs: [create]
	# DaemonSets are served from the `apps` API group on current clusters; the
	# legacy `extensions` group is kept so that older clusters keep working.
	- apiGroups: [apps, extensions]
	  resources: [daemonsets]
	  verbs: [get, watch, list]
	---
	# Grants the draino ClusterRole to the draino ServiceAccount in kube-system.
	# Nesting restored: labels and name belong under metadata.
	apiVersion: rbac.authorization.k8s.io/v1
	kind: ClusterRoleBinding
	metadata:
	  labels: {component: draino}
	  name: draino
	roleRef: {apiGroup: rbac.authorization.k8s.io, kind: ClusterRole, name: draino}
	subjects:
	- {kind: ServiceAccount, name: draino, namespace: kube-system}
	---
	# Runs a single draino pod in kube-system under the draino ServiceAccount.
	# Nesting restored so that spec, the pod template and the container fields
	# sit at their proper levels.
	apiVersion: apps/v1
	kind: Deployment
	metadata:
	  labels: {component: draino}
	  name: draino
	  namespace: kube-system
	spec:
	  # Draino does not currently support locking/master election, so you should
	  # only run one draino at a time. Draino won't start draining nodes immediately
	  # so it's usually safe for multiple drainos to exist for a brief period of
	  # time.
	  replicas: 1
	  selector:
	    matchLabels: {component: draino}
	  template:
	    metadata:
	      labels: {component: draino}
	      name: draino
	      namespace: kube-system
	    spec:
	      containers:
	      - name: draino
	        image: planetlabs/draino:dbadb44
	        # You'll want to change these labels and conditions to suit your deployment.
	        command:
	        - /draino
	        - --debug
	        - --evict-daemonset-pods
	        - --evict-emptydir-pods
	        - --evict-unreplicated-pods
	        - KernelDeadlock
	        - OutOfDisk
	        # - ReadonlyFilesystem
	        # - MemoryPressure
	        # - DiskPressure
	        # - PIDPressure
	        livenessProbe:
	          httpGet: {path: /healthz, port: 10002}
	          initialDelaySeconds: 30
	      serviceAccountName: draino
	# Monitor definitions for node-problem-detector, stored as two JSON files.
	# Nesting restored, and the literal block-scalar indicators repaired
	# (they had been escaped as `\|`, which is not valid YAML).
	apiVersion: v1
	data:
	  # Kernel log monitor: reads /dev/kmsg with the kmsg plugin.
	  kernel-monitor.json: |
	    {
	        "plugin": "kmsg",
	        "logPath": "/dev/kmsg",
	        "lookback": "5m",
	        "bufferSize": 10,
	        "source": "kernel-monitor",
	        "conditions": [
	            {
	                "type": "KernelDeadlock",
	                "reason": "KernelHasNoDeadlock",
	                "message": "kernel has no deadlock"
	            },
	            {
	                "type": "ReadonlyFilesystem",
	                "reason": "FilesystemIsReadOnly",
	                "message": "Filesystem is read-only"
	            }
	        ],
	        "rules": [
	            {
	                "type": "temporary",
	                "reason": "OOMKilling",
	                "pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*"
	            },
	            {
	                "type": "temporary",
	                "reason": "TaskHung",
	                "pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
	            },
	            {
	                "type": "temporary",
	                "reason": "UnregisterNetDevice",
	                "pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
	            },
	            {
	                "type": "temporary",
	                "reason": "KernelOops",
	                "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
	            },
	            {
	                "type": "temporary",
	                "reason": "KernelOops",
	                "pattern": "divide error: 0000 \\[#\\d+\\] SMP"
	            },
	            {
	                "type": "permanent",
	                "condition": "KernelDeadlock",
	                "reason": "AUFSUmountHung",
	                "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
	            },
	            {
	                "type": "permanent",
	                "condition": "KernelDeadlock",
	                "reason": "DockerHung",
	                "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
	            },
	            {
	                "type": "permanent",
	                "condition": "ReadonlyFilesystem",
	                "reason": "FilesystemIsReadOnly",
	                "pattern": "Remounting filesystem read-only"
	            }
	        ]
	    }
	  # Docker log monitor: reads the journal at /var/log/journal with the
	  # journald plugin, using "dockerd" as the plugin source.
	  docker-monitor.json: |
	    {
	        "plugin": "journald",
	        "pluginConfig": {
	            "source": "dockerd"
	        },
	        "logPath": "/var/log/journal",
	        "lookback": "5m",
	        "bufferSize": 10,
	        "source": "docker-monitor",
	        "conditions": [],
	        "rules": [
	            {
	                "type": "temporary",
	                "reason": "CorruptDockerImage",
	                "pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*"
	            }
	        ]
	    }
	kind: ConfigMap
	metadata:
	  name: node-problem-detector-config
	  namespace: kube-system