Commands accompanying the Rancher webinar "Troubleshooting Kubernetes"
Check etcd members
docker exec etcd etcdctl member list
Check endpoint health
docker exec etcd etcdctl endpoint health --endpoints=$(docker exec etcd /bin/sh -c "etcdctl member list | cut -d, -f5 | sed -e 's/ //g' | paste -sd ','")
Check endpoint status
docker exec etcd etcdctl endpoint status --endpoints=$(docker exec etcd /bin/sh -c "etcdctl member list | cut -d, -f5 | sed -e 's/ //g' | paste -sd ','") --write-out table
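The endpoint list can be stored in a shell variable to avoid repeating the discovery in every command (a convenience sketch; ETCD_ENDPOINTS is just a local name, not part of the webinar commands):
ETCD_ENDPOINTS=$(docker exec etcd /bin/sh -c "etcdctl member list | cut -d, -f5 | sed -e 's/ //g' | paste -sd ','")
docker exec etcd etcdctl endpoint status --endpoints=$ETCD_ENDPOINTS --write-out table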
Fill up etcd
docker exec -ti etcd sh
dd if=/dev/zero of=testfile.out bs=1500 count=1024
while true; do cat testfile.out | etcdctl put key || break; done
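To watch the database grow while the loop runs, check the DB SIZE column from a second terminal on the host (assumes watch is installed):
watch "docker exec etcd etcdctl endpoint status --write-out table"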
Check alarm status
docker exec etcd etcdctl alarm list
Compact
rev=$(docker exec etcd etcdctl endpoint status --write-out json | egrep -o '"revision":[0-9]*' | egrep -o '[0-9]*')
docker exec etcd etcdctl compaction "$rev"
Defrag
docker exec etcd etcdctl defrag --endpoints=$(docker exec etcd /bin/sh -c "etcdctl member list | cut -d, -f5 | sed -e 's/ //g' | paste -sd ','")
Disarm
docker exec etcd etcdctl alarm disarm
docker exec etcd etcdctl alarm list
(no output: all alarms have been disarmed)
etcd debug logging
curl -XPUT -d '{"Level":"DEBUG"}' --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) https://localhost:2379/config/local/log
Restore info-level logging
curl -XPUT -d '{"Level":"INFO"}' --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) https://localhost:2379/config/local/log
etcd metrics: Get all metrics for wal_fsync_duration_seconds
curl -s --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) $(docker exec etcd printenv ETCDCTL_ENDPOINTS)/metrics | grep wal_fsync_duration_seconds
etcd metrics: Loop and calculate percentage of wal_fsync_duration_seconds within 2ms (requires bc)
while true; do echo "scale=2;$(curl -s --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) $(docker exec etcd printenv ETCDCTL_ENDPOINTS)/metrics | grep 'etcd_disk_wal_fsync_duration_seconds_bucket{le="0.002"}' | awk '{ print $2+0 }') / $(curl -s --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) $(docker exec etcd printenv ETCDCTL_ENDPOINTS)/metrics | grep wal_fsync_duration_seconds_count | awk '{ print $2+0 }')" | bc; done
Leader changes
curl -s --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) https://localhost:2379/metrics | grep ^etcd_server_leader_changes_seen_total
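To spot leader flapping over time, the counter can be polled in a loop (a simple sketch, not part of the original webinar commands):
while true; do curl -s --cacert $(docker exec etcd printenv ETCDCTL_CACERT) --cert $(docker exec etcd printenv ETCDCTL_CERT) --key $(docker exec etcd printenv ETCDCTL_KEY) https://localhost:2379/metrics | grep ^etcd_server_leader_changes_seen_total; sleep 5; done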
kube-apiserver to etcd-servers connectivity check
for etcdserver in $(docker inspect kube-apiserver --format='{{range .Args}}{{.}}{{"\n"}}{{end}}' | grep etcd-servers | awk -F= '{ print $2 }' | tr ',' '\n'); do SSLDIR=$(docker inspect kube-apiserver --format '{{ range .Mounts }}{{ if eq .Destination "/etc/kubernetes" }}{{ .Source }}{{ end }}{{ end }}'); echo "Validating connection to ${etcdserver}/health"; curl -w '\nConnect:%{time_connect}\nStart Transfer: %{time_starttransfer}\nTotal: %{time_total}\nResponse code: %{http_code}\n' --cacert $SSLDIR/ssl/kube-ca.pem --cert $SSLDIR/ssl/kube-apiserver.pem --key $SSLDIR/ssl/kube-apiserver-key.pem "${etcdserver}/health"; done
kube-apiserver responsiveness
for cip in $(kubectl get nodes -l "node-role.kubernetes.io/controlplane=true" -o jsonpath='{range.items[*].status.addresses[?(@.type=="InternalIP")]}{.address}{"\n"}{end}'); do kubectl --kubeconfig kube_config_cluster.yml --server https://${cip}:6443 get nodes -v6 2>&1 | grep round_trippers; done
Find current leader (kube-controller-manager)
kubectl -n kube-system get endpoints kube-controller-manager -o jsonpath='{.metadata.annotations.control-plane\.alpha\.kubernetes\.io/leader}'
{"holderIdentity":"seb-doctl-ubuntu-5_96fb83ba-6023-11e9-a7a7-429a019f0230","leaseDurationSeconds":15,"acquireTime":"2019-04-16T08:42:57Z","renewTime":"2019-04-16T10:36:25Z","leaderTransitions":1}
Find current leader (kube-scheduler)
kubectl -n kube-system get endpoints kube-scheduler -o jsonpath='{.metadata.annotations.control-plane\.alpha\.kubernetes\.io/leader}'
Show kubelet stats
curl -sLk --cacert /etc/kubernetes/ssl/kube-ca.pem --cert /etc/kubernetes/ssl/kube-node.pem --key /etc/kubernetes/ssl/kube-node-key.pem https://127.0.0.1:10250/stats
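For a quick liveness check of the kubelet itself, the healthz endpoint can be queried over plain HTTP (assumes the default --healthz-port of 10248):
curl -s http://127.0.0.1:10248/healthz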
Liveness check
apiVersion: v1
kind: Pod
metadata:
  labels:
    test: liveness
  name: liveness-exec
spec:
  containers:
  - name: liveness
    image: k8s.gcr.io/busybox
    args:
    - /bin/sh
    - -c
    - touch /tmp/healthy; sleep 30; rm -rf /tmp/healthy; sleep 600
    livenessProbe:
      exec:
        command:
        - cat
        - /tmp/healthy
      initialDelaySeconds: 5
      periodSeconds: 5
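Assuming the manifest above is saved as liveness-exec.yaml (the filename is arbitrary), create the Pod and then follow the probe failures with the event commands below:
kubectl apply -f liveness-exec.yaml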
Describe $resource
kubectl describe pod
Get events with filter
kubectl get events --field-selector involvedObject.kind=Pod -w
Check Pending pods
kubectl get pods --all-namespaces -o go-template='{{range .items}}{{if eq .status.phase "Pending"}}{{.spec.nodeName}}{{" "}}{{.metadata.name}}{{" "}}{{.metadata.namespace}}{{" "}}{{range .status.conditions}}{{.message}}{{";"}}{{end}}{{"\n"}}{{end}}{{end}}'
Node difference check
kubectl get nodes -o custom-columns=NAME:.metadata.name,OS:.status.nodeInfo.osImage,KERNEL:.status.nodeInfo.kernelVersion,RUNTIME:.status.nodeInfo.containerRuntimeVersion,KUBELET:.status.nodeInfo.kubeletVersion,KUBEPROXY:.status.nodeInfo.kubeProxyVersion
Show taints
kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints
Show labels
kubectl get nodes --show-labels
Show node conditions
kubectl get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{$node.metadata.name}}{{": "}}{{.type}}{{":"}}{{.status}}{{"\n"}}{{end}}{{end}}'
Show node conditions that could cause issues
kubectl get nodes -o go-template='{{range .items}}{{$node := .}}{{range .status.conditions}}{{if ne .type "Ready"}}{{if eq .status "True"}}{{$node.metadata.name}}{{": "}}{{.type}}{{":"}}{{.status}}{{"\n"}}{{end}}{{else}}{{if ne .status "True"}}{{$node.metadata.name}}{{": "}}{{.type}}{{": "}}{{.status}}{{"\n"}}{{end}}{{end}}{{end}}{{end}}'
Check if internal cluster name resolves
kubectl run -it --rm --restart=Never busybox --image=busybox:1.28 -- nslookup kubernetes.default
Check if external name resolves
kubectl run -it --rm --restart=Never busybox --image=busybox:1.28 -- nslookup www.google.com
Check upstream DNS nameserver(s)
kubectl -n kube-system get pods -l k8s-app=kube-dns --no-headers -o custom-columns=NAME:.metadata.name,HOSTIP:.status.hostIP | while read pod host; do echo "Pod ${pod} on host ${host}"; kubectl -n kube-system exec $pod -c kubedns -- cat /etc/resolv.conf; done
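To query one of the upstream nameservers found above directly, nslookup takes the server as a second argument; <nameserver-ip> is a placeholder for an IP taken from the resolv.conf output:
kubectl run -it --rm --restart=Never busybox --image=busybox:1.28 -- nslookup www.google.com <nameserver-ip>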
Check responsiveness of Ingress Controller
kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=POD:.metadata.name,NODE:.spec.nodeName,IP:.status.podIP --no-headers | while read ingresspod nodename podip; do echo "=> Testing from ${ingresspod} on ${nodename} (${podip})"; curl -o /dev/null --connect-timeout 5 -s -w 'Connect: %{time_connect}\nStart Transfer: %{time_starttransfer}\nTotal: %{time_total}\nResponse code: %{http_code}\n' -k http://${podip}/healthz; done
Add packet loss of 40% to one node running Ingress controller
tc qdisc add dev eth0 root netem loss 40% && sleep 120 && tc qdisc del dev eth0 root netem loss 40%
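Verify the netem qdisc is active (and removed again afterwards):
tc qdisc show dev eth0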
Check responsiveness Ingress -> Pods
kubectl run nginx --image=nginx --port=80 --expose
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: nginx-ingress
  annotations:
    nginx.ingress.kubernetes.io/rewrite-target: /
spec:
  rules:
  - host: foo.bar.com
    http:
      paths:
      - path: /
        backend:
          serviceName: nginx
          servicePort: 80
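Assuming the Ingress manifest above is saved as nginx-ingress.yaml (filename arbitrary):
kubectl apply -f nginx-ingress.yaml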
kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=POD:.metadata.name,NODE:.spec.nodeName,IP:.status.podIP --no-headers | while read ingresspod nodename podip; do echo "=> Testing from ${ingresspod} on ${nodename} (${podip})"; kubectl -n default get ing -o custom-columns=NAMESPACE:.metadata.namespace,HOST:.spec.rules[].host,SERVICE:.spec.rules[].http.paths[].backend.serviceName --no-headers | while read namespace host service; do echo "==> Found host $host with service $service in $namespace"; kubectl -n $namespace get ep $service -o go-template='{{range .subsets}}{{range .addresses}}{{ .ip}}{{" "}}{{ .nodeName}}{{"\n"}}{{end}}{{end}}' | while read ep epnodename; do echo "==> Connecting to ${ep} on ${epnodename}"; kubectl -n ingress-nginx exec $ingresspod -- curl -o /dev/null --connect-timeout 5 -s -w 'Connect:%{time_connect}\nStart Transfer: %{time_starttransfer}\nTotal: %{time_total}\nResponse code: %{http_code}\n' --resolve $host:80:$ep http://${host}:80; RC=$?; if [ $RC -ne 0 ]; then echo "FAIL: ${nodename} cannot connect to ${epnodename}"; else echo OK; fi; done; done; done
Check static NGINX config
for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do kubectl -n ingress-nginx exec $pod -- cat /etc/nginx/nginx.conf; done
Use checksum to find differences (md5 on macOS; on Linux use md5sum)
for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do echo $pod; kubectl -n ingress-nginx exec $pod -- cat /etc/nginx/nginx.conf | md5; done
Exclude instance specific and randomized lines
for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do echo $pod; kubectl -n ingress-nginx exec $pod -- cat /etc/nginx/nginx.conf | grep -v nameservers | grep -v resolver | grep -v "PEM sha" | md5; done
Check dynamic NGINX config
for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do echo $pod; kubectl -n ingress-nginx exec $pod -- curl -s http://127.0.0.1:18080/configuration/backends; done
Use checksum to view differences
for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do echo $pod; kubectl -n ingress-nginx exec $pod -- curl -s http://127.0.0.1:18080/configuration/backends | md5; done
Pretty print using jq
for pod in $(kubectl -n ingress-nginx get pods -l app=ingress-nginx -o custom-columns=NAME:.metadata.name --no-headers); do echo $pod; kubectl -n ingress-nginx exec $pod -- curl -s http://127.0.0.1:18080/configuration/backends | jq .; done