Skip to content

Instantly share code, notes, and snippets.

@alexeldeib
Last active October 10, 2019 00:09
Show Gist options
  • Save alexeldeib/e46689e659298d67cfe0794a4ee21312 to your computer and use it in GitHub Desktop.
Save alexeldeib/e46689e659298d67cfe0794a4ee21312 to your computer and use it in GitHub Desktop.
Node Problem Detector one-shot config
# Namespace that the namespaced node-problem-detector (NPD) resources in this
# manifest (ServiceAccount, ConfigMap, DaemonSet) are created in.
apiVersion: v1
kind: Namespace
metadata:
  labels:
    app: npd
  name: npd
---
# Service account referenced by the NPD DaemonSet pod spec and by the
# ClusterRoleBinding subject of the same name.
apiVersion: v1
kind: ServiceAccount
metadata:
  labels:
    app: npd
  name: node-problem-detector
  namespace: npd
---
# Cluster-wide permissions granted to the node-problem-detector service
# account: read nodes, patch node status, and write events.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  labels:
    app: npd
  name: node-problem-detector
rules:
  # Read-only access to Node objects (core API group).
  - apiGroups:
      - ""
    resources:
      - nodes
    verbs:
      - get
  # Patch access to the node status subresource only.
  # NOTE(review): presumably how the detector publishes the conditions
  # declared in its monitor configs - confirm against the NPD docs.
  - apiGroups:
      - ""
    resources:
      - nodes/status
    verbs:
      - patch
  # Create and amend Event objects.
  - apiGroups:
      - ""
    resources:
      - events
    verbs:
      - create
      - patch
      - update
---
# Grants the node-problem-detector ClusterRole to the service account of the
# same name in the npd namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  labels:
    app: npd
  name: node-problem-detector
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: node-problem-detector
subjects:
  - kind: ServiceAccount
    name: node-problem-detector
    namespace: npd
---
# Monitor configurations for node-problem-detector. Each key is one JSON file;
# the DaemonSet in this manifest projects them into /config and passes them
# via --system-log-monitors.
apiVersion: v1
kind: ConfigMap
metadata:
  labels:
    app: npd
  name: node-problem-detector-config
  namespace: npd
data:
  # Kernel log monitor reading /dev/kmsg. Entries under "conditions" are the
  # default values reported while no "permanent" rule for that condition has
  # matched, so their reason/message must describe the healthy state.
  kernel-monitor.json: |
    {
      "plugin": "kmsg",
      "logPath": "/dev/kmsg",
      "lookback": "5m",
      "bufferSize": 10,
      "source": "kernel-monitor",
      "conditions": [
        {
          "type": "KernelDeadlock",
          "reason": "KernelHasNoDeadlock",
          "message": "kernel has no deadlock"
        },
        {
          "type": "ReadonlyFilesystem",
          "reason": "FilesystemIsNotReadOnly",
          "message": "Filesystem is not read-only"
        }
      ],
      "rules": [
        {
          "type": "temporary",
          "reason": "OOMKilling",
          "pattern": "Kill process \\d+ (.+) score \\d+ or sacrifice child\\nKilled process \\d+ (.+) total-vm:\\d+kB, anon-rss:\\d+kB, file-rss:\\d+kB.*"
        },
        {
          "type": "temporary",
          "reason": "TaskHung",
          "pattern": "task \\S+:\\w+ blocked for more than \\w+ seconds\\."
        },
        {
          "type": "temporary",
          "reason": "UnregisterNetDevice",
          "pattern": "unregister_netdevice: waiting for \\w+ to become free. Usage count = \\d+"
        },
        {
          "type": "temporary",
          "reason": "KernelOops",
          "pattern": "BUG: unable to handle kernel NULL pointer dereference at .*"
        },
        {
          "type": "temporary",
          "reason": "KernelOops",
          "pattern": "divide error: 0000 \\[#\\d+\\] SMP"
        },
        {
          "type": "permanent",
          "condition": "KernelDeadlock",
          "reason": "AUFSUmountHung",
          "pattern": "task umount\\.aufs:\\w+ blocked for more than \\w+ seconds\\."
        },
        {
          "type": "permanent",
          "condition": "KernelDeadlock",
          "reason": "DockerHung",
          "pattern": "task docker:\\w+ blocked for more than \\w+ seconds\\."
        },
        {
          "type": "permanent",
          "condition": "ReadonlyFilesystem",
          "reason": "FilesystemIsReadOnly",
          "pattern": "Remounting filesystem read-only"
        }
      ]
    }
  # Docker daemon monitor reading the journald "dockerd" source; it defines
  # only a temporary (event) rule and no node conditions.
  docker-monitor.json: |
    {
      "plugin": "journald",
      "pluginConfig": {
        "source": "dockerd"
      },
      "logPath": "/var/log/journal",
      "lookback": "5m",
      "bufferSize": 10,
      "source": "docker-monitor",
      "conditions": [],
      "rules": [
        {
          "type": "temporary",
          "reason": "CorruptDockerImage",
          "pattern": "Error trying v2 registry: failed to register layer: rename /var/lib/docker/image/(.+) /var/lib/docker/image/(.+): directory not empty.*"
        }
      ]
    }
  # systemd monitor reading the journald "systemd" source; it defines temporary
  # rules for kubelet, Docker and containerd start messages.
  systemd.json: |
    {
      "plugin": "journald",
      "pluginConfig": {
        "source": "systemd"
      },
      "logPath": "/var/log/journal",
      "lookback": "",
      "bufferSize": 10,
      "source": "systemd-monitor",
      "conditions": [],
      "rules": [
        {
          "type": "temporary",
          "reason": "KubeletStart",
          "pattern": "Started Kubernetes kubelet."
        },
        {
          "type": "temporary",
          "reason": "DockerStart",
          "pattern": "Starting Docker Application Container Engine..."
        },
        {
          "type": "temporary",
          "reason": "ContainerdStart",
          "pattern": "Starting containerd container runtime..."
        }
      ]
    }
---
# Runs one node-problem-detector pod per schedulable node, wired to the
# monitor configs from the node-problem-detector-config ConfigMap.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app: npd
  name: node-problem-detector
  namespace: npd
spec:
  selector:
    matchLabels:
      app: npd
  template:
    metadata:
      labels:
        app: npd
    spec:
      containers:
        - name: node-problem-detector
          image: k8s.gcr.io/node-problem-detector:v0.6.3
          imagePullPolicy: Always
          command:
            - /node-problem-detector
            - --logtostderr
            # Paths match the files projected by the "config" volume below.
            - --system-log-monitors=/config/kernel-monitor.json,/config/docker-monitor.json,/config/systemd.json
          env:
            # Expose the name of the node this pod is scheduled on.
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          resources:
            limits:
              cpu: 10m
              memory: 80Mi
            requests:
              cpu: 10m
              memory: 80Mi
          securityContext:
            # NOTE(review): presumably needed to read the host's /dev/kmsg;
            # confirm before tightening.
            privileged: true
          volumeMounts:
            # Host logs; the journald monitors read /var/log/journal.
            - mountPath: /var/log
              name: log
              readOnly: true
            # Kernel message device read by kernel-monitor.json.
            - mountPath: /dev/kmsg
              name: kmsg
              readOnly: true
            - mountPath: /etc/localtime
              name: localtime
              readOnly: true
            - mountPath: /config
              name: config
              readOnly: true
      # serviceAccountName is the supported field; serviceAccount is only a
      # deprecated alias for it.
      serviceAccountName: node-problem-detector
      tolerations:
        # Also schedule onto nodes tainted as masters.
        - effect: NoSchedule
          key: node-role.kubernetes.io/master
          operator: Exists
      volumes:
        - name: log
          hostPath:
            path: /var/log/
        - name: kmsg
          hostPath:
            path: /dev/kmsg
        - name: localtime
          hostPath:
            path: /etc/localtime
        - name: config
          configMap:
            name: node-problem-detector-config
            items:
              - key: kernel-monitor.json
                path: kernel-monitor.json
              - key: docker-monitor.json
                path: docker-monitor.json
              - key: systemd.json
                path: systemd.json
@alexeldeib
Copy link
Author

Applying the manifest above results in the following node conditions (as shown by `kubectl describe node`):

Conditions:
  Type                 Status  LastHeartbeatTime                 LastTransitionTime                Reason                       Message
  ----                 ------  -----------------                 ------------------                ------                       -------
  KernelDeadlock       False   Wed, 03 Jul 2019 20:58:44 -0700   Wed, 03 Jul 2019 20:56:43 -0700   KernelHasNoDeadlock          kernel has no deadlock
  ReadonlyFilesystem   False   Wed, 03 Jul 2019 20:58:44 -0700   Wed, 03 Jul 2019 20:56:43 -0700   FilesystemIsReadOnly         Filesystem is read-only
  NetworkUnavailable   False   Wed, 03 Jul 2019 20:00:20 -0700   Wed, 03 Jul 2019 20:00:20 -0700   RouteCreated                 RouteController created a route
  MemoryPressure       False   Wed, 03 Jul 2019 20:59:20 -0700   Wed, 03 Jul 2019 19:59:56 -0700   KubeletHasSufficientMemory   kubelet has sufficient memory available
  DiskPressure         False   Wed, 03 Jul 2019 20:59:20 -0700   Wed, 03 Jul 2019 19:59:56 -0700   KubeletHasNoDiskPressure     kubelet has no disk pressure
  PIDPressure          False   Wed, 03 Jul 2019 20:59:20 -0700   Wed, 03 Jul 2019 19:59:56 -0700   KubeletHasSufficientPID      kubelet has sufficient PID available
  Ready                True    Wed, 03 Jul 2019 20:59:20 -0700   Wed, 03 Jul 2019 20:00:26 -0700   KubeletReady                 kubelet is posting ready status. AppArmor enabled

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment