Skip to content

Instantly share code, notes, and snippets.

@knisbet
Last active November 26, 2019 21:07
Show Gist options
  • Save knisbet/0e94afe4faf75005a0baf23af750391e to your computer and use it in GitHub Desktop.
Save knisbet/0e94afe4faf75005a0baf23af750391e to your computer and use it in GitHub Desktop.
Gravity network debug
#!/bin/bash
# Copyright 2019 Gravitational, Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# set -x
# Timestamp and host name are embedded in every output file name.
now=$(date +"%Y%m%d_%H%M%S")
hostname=$(hostname)

# All artifacts share one path prefix and differ only in their suffix.
_prefix="/var/lib/gravity/planet/share/gravity-network-debug_${now}.${hostname}"
_file="${_prefix}.txt"
_tcpdump_flannel="${_prefix}.flannel.pcap"
_tcpdump_vxlan="${_prefix}.vxlan.pcap"

# Tell the operator where the results will be written, then a blank line.
printf 'Saving report to %s\n' "${_file}"
printf 'Saving flannel capture to %s\n' "${_tcpdump_flannel}"
printf 'Saving vxlan capture to %s\n' "${_tcpdump_vxlan}"
printf '\n'
#
# Collect host information
#
echo "Collecting system information..."

# This is the first write to the report, so the group uses a truncating
# redirect; stdout and stderr of every command inside go to the report.
# Each command is preceded by a "--- <command>" header.
{
  printf '\n\n\n--- hostname\n'
  hostname
  printf '\n\n\n--- uname -a\n'
  uname -a
  printf '\n\n\n--- sysctl -a\n'
  sysctl -a
} &> "${_file}"
#
# Collect current network configuration
#
echo "Collecting flannel / network configuration..."

# Every command below is preceded by a "--- <command>" header, and the whole
# group (stdout and stderr) is appended to the report.
{
  printf '\n\n\n--- cat /run/flannel/subnet.env\n'
  cat /run/flannel/subnet.env

  printf '\n\n\n--- etcdctl get /coreos.com/network/config\n'
  etcdctl get /coreos.com/network/config

  # Dump every key under the flannel subnets prefix as "<key>:<value>".
  # The header text is kept unchanged so reports stay comparable, but the keys
  # are read in a loop instead of being spliced into an `sh -c` string by
  # xargs (which also mixed the mutually exclusive -n and -I options). Because
  # the pipeline sits inside the redirected group, errors from `etcdctl ls`
  # and `grep` reach the report too, not just those of the last stage.
  echo -e "\n\n\n--- etcdctl ls --recursive -p /coreos.com/network/subnets | grep -v '/$' | xargs -n 1 -I% sh -c 'echo -n %:; etcdctl get %;'"
  etcdctl ls --recursive -p /coreos.com/network/subnets \
    | grep -v '/$' \
    | while IFS= read -r lease_key; do
        # xargs skipped blank lines; do the same.
        [[ -n "${lease_key}" ]] || continue
        printf '%s:' "${lease_key}"
        etcdctl get "${lease_key}"
      done

  # Detailed (-d) dumps of the iproute2 objects, in the original order.
  ip_queries=(
    "route"
    "link show"
    "addr"
    "neighbor"
    "rule"
    "ntable"
    "maddress"
    "xfrm state show"
    "xfrm policy show"
    "tcp_metrics"
    "netconf"
  )
  for ip_query in "${ip_queries[@]}"; do
    # Split the query into separate arguments for ip.
    read -r -a ip_args <<<"${ip_query}"
    printf '\n\n\n--- ip -d %s\n' "${ip_query}"
    ip -d "${ip_args[@]}"
  done

  printf '\n\n\n--- bridge fdb show\n'
  bridge fdb show

  printf '\n\n\n--- iptables-save\n'
  iptables-save
} &>> "${_file}"
#
# Collect kubernetes info
#
# Note: will return error on worker nodes
kube_queries=(
  "get all -o wide --all-namespaces"
  "get endpoints -o yaml --all-namespaces"
  "get nodes -o yaml --all-namespaces"
  "-n kube-system get po -lk8s-app=kube-dns -o wide"
)
# For each query: write a "--- kubectl <query>" header, then run it. Output
# and errors of the whole loop are appended to the report.
for kube_query in "${kube_queries[@]}"; do
  read -r -a kube_args <<<"${kube_query}"
  printf '\n\n\n--- kubectl %s\n' "${kube_query}"
  kubectl "${kube_args[@]}"
done &>> "${_file}"
#
# Test network
#
echo "Testing network..."

# Setup packet capture for the test.
#
# The vxlan detail line of `ip -d link show flannel.1` carries a
# "dev <name>" token; that name is the device the port-8472 capture runs on.
# \S+ (rather than \w+) keeps names containing '.' or '-' intact, e.g.
# "eth0.100". Errors from ip are recorded in the report.
vxlan_dev=$(ip -d link show flannel.1 2>> "${_file}" | grep vxlan | grep -oP '(?<=dev\s)\S+')
echo "vxlan_dev: ${vxlan_dev}" &>> "${_file}"

# The captures run as transient systemd units so they can be stopped by name
# once the tests are done. The unit names are chosen here (--unit) instead of
# being scraped from systemd-run's "Running as unit ..." message, whose
# wording is not a stable interface. systemd-run's own output is appended to
# the report.
tcpdump_flannel="gravity-network-debug-flannel-${now}.service"
systemd-run --unit="${tcpdump_flannel}" \
  tcpdump -vni flannel.1 -w "${_tcpdump_flannel}" port 53 or icmp &>> "${_file}"

if [[ -n "${vxlan_dev}" ]]; then
  tcpdump_vxlan="gravity-network-debug-vxlan-${now}.service"
  systemd-run --unit="${tcpdump_vxlan}" \
    tcpdump -vni "${vxlan_dev}" -w "${_tcpdump_vxlan}" port 8472 &>> "${_file}"
else
  # Without a device name tcpdump would take "-w" as the interface.
  tcpdump_vxlan=""
  echo "vxlan capture skipped: no underlying device found for flannel.1" &>> "${_file}"
fi
printf '\n\n\n--- === Test network (host ns) ===\n' &>> "${_file}"

# ping flannel.1 and docker0 on each host
# Each etcd subnet key has the form <prefix>/<a.b.c.d>-<len>; keep only the
# first three octets plus a trailing dot ("a.b.c.") so the last octet can be
# appended below.
mapfile -t flannel_subnets < <(
  etcdctl ls --recursive -p /coreos.com/network/subnets \
    | grep -v '/$' \
    | awk -F'/' '{
        split($5, lease, "-")
        split(lease[1], octet, ".")
        print octet[1] "." octet[2] "." octet[3] "."
      }'
)
#echo ${flannel_subnets[*]}

# Ping the .0 and .1 address of every subnet, logging header and result.
for net_prefix in "${flannel_subnets[@]}"; do
  for last_octet in 0 1; do
    {
      printf '\n\n\n--- ping -c3 %s%s\n' "${net_prefix}" "${last_octet}"
      ping -c3 "${net_prefix}${last_octet}"
    } &>> "${_file}"
  done
done
printf '\n\n\n--- === Test kube-dns (host ns) ===\n' &>> "${_file}"

# ping / dig the kube-dns pod on each host
# The pod IPs are taken from the DNAT rules iptables-save prints for the
# kube-system/kube-dns:dns service (udp variant). The address is located by
# looking for the token that follows --to-destination instead of assuming it
# is always the 14th field, so an extra match option in the rule does not
# silently yield a wrong value.
mapfile -t kubedns_ips < <(
  iptables-save | awk '
    /kube-system\/kube-dns:dns/ && /udp/ {
      for (i = 1; i < NF; i++) {
        if ($i == "--to-destination") {
          split($(i + 1), dest, ":")
          print dest[1]
        }
      }
    }'
)

for dns_ip in "${kubedns_ips[@]}"; do
  {
    printf '\n\n\n--- ping -c3 %s\n' "${dns_ip}"
    ping -c3 "${dns_ip}"
    printf '\n\n\n--- dig @%s . NS\n' "${dns_ip}"
    # stderr is captured too, like every other command in the report.
    dig "@${dns_ip}" . NS
  } &>> "${_file}"
done
# ping flannel.1 / docker0 / dns-pod from kube-dns network ns
printf '\n\n\n--- === Test network (dns ns) ===\n' &>> "${_file}"

# nsenter -t takes exactly one pid. pgrep may report several matching
# processes, or none on a node without a kube-dns pod, so keep only the first
# and skip the tests when there is nothing to enter.
nannypid=$(pgrep -f dnsmasq-nanny | head -n 1)
echo "dnsmasq-nanny pid: ${nannypid}" &>> "${_file}"

if [[ -n "${nannypid}" ]]; then
  # flannel_subnets holds "a.b.c." prefixes collected by the host-ns test.
  for subnet in "${flannel_subnets[@]}"; do
    for last_octet in 0 1; do
      {
        printf '\n\n\n--- nsenter -n -t %s ping -c3 %s%s\n' "${nannypid}" "${subnet}" "${last_octet}"
        nsenter -n -t "${nannypid}" ping -c3 "${subnet}${last_octet}"
      } &>> "${_file}"
    done
  done

  printf '\n\n\n--- === Test kube-dns (dns ns) ===\n' &>> "${_file}"
  # kubedns_ips holds the kube-dns pod IPs collected by the host-ns test.
  for ip in "${kubedns_ips[@]}"; do
    {
      printf '\n\n\n--- nsenter -n -t %s ping -c3 %s\n' "${nannypid}" "${ip}"
      nsenter -n -t "${nannypid}" ping -c3 "${ip}"
      printf '\n\n\n--- nsenter -n -t %s dig @%s . NS\n' "${nannypid}" "${ip}"
      # stderr is captured too, like every other command in the report.
      nsenter -n -t "${nannypid}" dig "@${ip}" . NS
    } &>> "${_file}"
  done
else
  echo "no dnsmasq-nanny process found on this node; dns ns tests skipped" &>> "${_file}"
fi
# stop the running tcpdumps
# Each variable holds the name of the transient systemd unit running one
# capture. An empty name means that capture never started; calling
# `systemctl stop` without a unit would only add a usage error to the report.
for capture_unit in "${tcpdump_flannel}" "${tcpdump_vxlan}"; do
  if [[ -z "${capture_unit}" ]]; then
    printf '\n\n\n--- capture unit name is empty, nothing to stop\n' &>> "${_file}"
    continue
  fi
  {
    printf '\n\n\n--- systemctl stop %s\n' "${capture_unit}"
    systemctl stop "${capture_unit}"
  } &>> "${_file}"
done
#
# Collect flanneld logs
#
# The header uses a single-quoted printf format so the inner double quotes
# around "7 days ago" are printed literally; nested inside a double-quoted
# echo they ended the string early and vanished from the report.
{
  printf '\n\n\n--- journalctl -u flanneld --since="7 days ago" --no-pager\n'
  journalctl -u flanneld --since="7 days ago" --no-pager
} &>> "${_file}"
echo "Script completed..."
#!/bin/bash
# Copyright 2019 Gravitational, Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# set -x
# Timestamp and host name are embedded in every output file name.
now=$(date +"%Y%m%d_%H%M%S")
hostname=$(hostname)

# All artifacts share one path prefix and differ only in their suffix.
_prefix="/var/lib/gravity/planet/share/gravity-network-debug_${now}.${hostname}"
_file="${_prefix}.txt"
_tcpdump_flannel="${_prefix}.flannel.pcap"
_tcpdump_vxlan="${_prefix}.vxlan.pcap"
_tcpdump_any="${_prefix}.any.pcap"

# Tell the operator where the results will be written, then a blank line.
printf 'Saving report to %s\n' "${_file}"
printf 'Saving flannel capture to %s\n' "${_tcpdump_flannel}"
printf 'Saving vxlan capture to %s\n' "${_tcpdump_vxlan}"
printf 'Saving any capture to %s\n' "${_tcpdump_any}"
printf '\n'
echo "Creating debug pods..."

# Deploy a "network-debug" DaemonSet (a container running `sleep 99999` that
# tolerates every taint) plus a ClusterIP Service on TCP port 53 selecting
# those pods. The later tests read the pod IPs from the iptables rules
# created for this service and enter the pod's network namespace via the
# sleep process.
#
# The manifest is indented as YAML requires: with every key at column 0 the
# nested metadata/spec/template structure is lost and the objects are not
# valid. The heredoc delimiter is quoted so the shell never expands anything
# inside the manifest.
# NOTE(review): extensions/v1beta1 DaemonSet is kept from the original; newer
# clusters serve DaemonSet under apps/v1 only -- confirm against the target
# Kubernetes version.
kubectl apply -f - <<'EOF'
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: network-debug
  namespace: kube-system
  labels:
    k8s-app: network-debug
spec:
  selector:
    matchLabels:
      name: network-debug
  template:
    metadata:
      labels:
        name: network-debug
    spec:
      tolerations:
      - operator: "Exists"
      containers:
      - name: debian
        image: apiserver:5000/gravitational/debian-tall:0.0.1
        command: ["sleep", "99999"]
        ports:
        - containerPort: 53
---
apiVersion: v1
kind: Service
metadata:
  labels:
    name: network-debug
  name: network-debug
  namespace: kube-system
spec:
  ports:
  - port: 53
    protocol: TCP
  selector:
    name: network-debug
  type: ClusterIP
EOF

# Pause briefly before collecting data so the debug pods have a chance to
# start.
sleep 5
#
# Collect host information
#
echo "Collecting system information..."

# This is the first write to the report, so the group uses a truncating
# redirect; stdout and stderr of every command inside go to the report.
# Each command is preceded by a "--- <command>" header.
{
  printf '\n\n\n--- hostname\n'
  hostname
  printf '\n\n\n--- uname -a\n'
  uname -a
  printf '\n\n\n--- sysctl -a\n'
  sysctl -a
  printf '\n\n\n--- lsmod\n'
  lsmod
} &> "${_file}"
#
# Collect current network configuration
#
echo "Collecting flannel / network configuration..."

# Every command below is preceded by a "--- <command>" header, and the whole
# group (stdout and stderr) is appended to the report.
{
  printf '\n\n\n--- cat /run/flannel/subnet.env\n'
  cat /run/flannel/subnet.env

  printf '\n\n\n--- etcdctl get /coreos.com/network/config\n'
  etcdctl get /coreos.com/network/config

  # Dump every key under the flannel subnets prefix as "<key>:<value>".
  # The header text is kept unchanged so reports stay comparable, but the keys
  # are read in a loop instead of being spliced into an `sh -c` string by
  # xargs (which also mixed the mutually exclusive -n and -I options). Because
  # the pipeline sits inside the redirected group, errors from `etcdctl ls`
  # and `grep` reach the report too, not just those of the last stage.
  echo -e "\n\n\n--- etcdctl ls --recursive -p /coreos.com/network/subnets | grep -v '/$' | xargs -n 1 -I% sh -c 'echo -n %:; etcdctl get %;'"
  etcdctl ls --recursive -p /coreos.com/network/subnets \
    | grep -v '/$' \
    | while IFS= read -r lease_key; do
        # xargs skipped blank lines; do the same.
        [[ -n "${lease_key}" ]] || continue
        printf '%s:' "${lease_key}"
        etcdctl get "${lease_key}"
      done

  # Detailed (-d) dumps of the iproute2 objects, in the original order.
  ip_queries=(
    "route"
    "link show"
    "addr"
    "neighbor"
    "rule"
    "ntable"
    "maddress"
    "xfrm state show"
    "xfrm policy show"
    "tcp_metrics"
    "netconf"
  )
  for ip_query in "${ip_queries[@]}"; do
    # Split the query into separate arguments for ip.
    read -r -a ip_args <<<"${ip_query}"
    printf '\n\n\n--- ip -d %s\n' "${ip_query}"
    ip -d "${ip_args[@]}"
  done

  printf '\n\n\n--- bridge fdb show\n'
  bridge fdb show

  printf '\n\n\n--- iptables-save\n'
  iptables-save
} &>> "${_file}"
#
# Collect kubernetes info
#
# Note: will return error on worker nodes
kube_queries=(
  "get all -o wide --all-namespaces"
  "get endpoints -o yaml --all-namespaces"
  "get nodes -o yaml --all-namespaces"
  "-n kube-system get po -lk8s-app=kube-dns -o wide"
)
# For each query: write a "--- kubectl <query>" header, then run it. Output
# and errors of the whole loop are appended to the report.
for kube_query in "${kube_queries[@]}"; do
  read -r -a kube_args <<<"${kube_query}"
  printf '\n\n\n--- kubectl %s\n' "${kube_query}"
  kubectl "${kube_args[@]}"
done &>> "${_file}"
#
# Test network
#
echo "Testing network..."

# Setup packet capture for the test.
#
# The vxlan detail line of `ip -d link show flannel.1` carries a
# "dev <name>" token; that name is the device the port-8472 capture runs on.
# \S+ (rather than \w+) keeps names containing '.' or '-' intact, e.g.
# "eth0.100". Errors from ip are recorded in the report.
vxlan_dev=$(ip -d link show flannel.1 2>> "${_file}" | grep vxlan | grep -oP '(?<=dev\s)\S+')
echo "external_dev: ${vxlan_dev}" &>> "${_file}"

# The captures run as transient systemd units so they can be stopped by name
# once the tests are done. The unit names are chosen here (--unit) instead of
# being scraped from systemd-run's "Running as unit ..." message, whose
# wording is not a stable interface. systemd-run's own output is appended to
# the report.
tcpdump_flannel="gravity-network-debug-flannel-${now}.service"
systemd-run --unit="${tcpdump_flannel}" \
  tcpdump -vni flannel.1 -w "${_tcpdump_flannel}" port 53 or icmp &>> "${_file}"

if [[ -n "${vxlan_dev}" ]]; then
  tcpdump_vxlan="gravity-network-debug-vxlan-${now}.service"
  systemd-run --unit="${tcpdump_vxlan}" \
    tcpdump -vni "${vxlan_dev}" -w "${_tcpdump_vxlan}" port 8472 or icmp &>> "${_file}"
else
  # Without a device name tcpdump would take "-w" as the interface.
  tcpdump_vxlan=""
  echo "vxlan capture skipped: no underlying device found for flannel.1" &>> "${_file}"
fi

# ICMP on every interface.
tcpdump_any="gravity-network-debug-any-${now}.service"
systemd-run --unit="${tcpdump_any}" \
  tcpdump -vni any -w "${_tcpdump_any}" icmp &>> "${_file}"
printf '\n\n\n--- === Test network (host ns) ===\n' &>> "${_file}"

# ping flannel.1 and docker0 on each host
# Each etcd subnet key has the form <prefix>/<a.b.c.d>-<len>; keep only the
# first three octets plus a trailing dot ("a.b.c.") so the last octet can be
# appended below.
mapfile -t flannel_subnets < <(
  etcdctl ls --recursive -p /coreos.com/network/subnets \
    | grep -v '/$' \
    | awk -F'/' '{
        split($5, lease, "-")
        split(lease[1], octet, ".")
        print octet[1] "." octet[2] "." octet[3] "."
      }'
)
#echo ${flannel_subnets[*]}

# Ping the .0 and .1 address of every subnet, logging header and result.
for net_prefix in "${flannel_subnets[@]}"; do
  for last_octet in 0 1; do
    {
      printf '\n\n\n--- ping -c3 %s%s\n' "${net_prefix}" "${last_octet}"
      ping -c3 "${net_prefix}${last_octet}"
    } &>> "${_file}"
  done
done
printf '\n\n\n--- === Test ping debug pod (host ns) ===\n' &>> "${_file}"

# ping the network-debug pod on each host
# (the variable is still called kubedns_ips, but here it holds the
# network-debug pod IPs). They are taken from the DNAT rules iptables-save
# prints for the kube-system/network-debug service (tcp). The address is
# located by looking for the token that follows --to-destination instead of
# assuming it is always the 14th field.
mapfile -t kubedns_ips < <(
  iptables-save | awk '
    /kube-system\/network-debug:/ && /tcp/ {
      for (i = 1; i < NF; i++) {
        if ($i == "--to-destination") {
          split($(i + 1), dest, ":")
          print dest[1]
        }
      }
    }'
)
for pod_ip in "${kubedns_ips[@]}"; do
  {
    printf '\n\n\n--- ping -c3 %s\n' "${pod_ip}"
    ping -c3 "${pod_ip}"
  } &>> "${_file}"
done

printf '\n\n\n--- === Test ping masters (host ns) ===\n' &>> "${_file}"

# ping every master
# The addresses are the 6th '/'-separated component of the etcd keys under
# /planet/cluster/ whose path contains "election/".
mapfile -t master_ips < <(
  etcdctl ls --recursive /planet/cluster/ \
    | grep election/ \
    | awk -F'/' '{print $6}'
)
for master_ip in "${master_ips[@]}"; do
  {
    printf '\n\n\n--- ping -c3 %s\n' "${master_ip}"
    ping -c3 "${master_ip}"
  } &>> "${_file}"
done
# ping flannel.1 / docker0 / debug pods / masters from the debug pod's
# network ns
printf '\n\n\n--- === Test network (debug ns) ===\n' &>> "${_file}"

# The debug container is found through its "sleep 99999" command line.
# nsenter -t takes exactly one pid. pgrep may report several matching
# processes, or none if the pod is not running on this node yet, so keep only
# the first and skip the tests when there is nothing to enter.
nannypid=$(pgrep -f "sleep 99999" | head -n 1)
echo "debug container pid: ${nannypid}" &>> "${_file}"

if [[ -n "${nannypid}" ]]; then
  # flannel_subnets holds "a.b.c." prefixes collected by the host-ns test.
  for subnet in "${flannel_subnets[@]}"; do
    for last_octet in 0 1; do
      {
        printf '\n\n\n--- nsenter -n -t %s ping -c3 %s%s\n' "${nannypid}" "${subnet}" "${last_octet}"
        nsenter -n -t "${nannypid}" ping -c3 "${subnet}${last_octet}"
      } &>> "${_file}"
    done
  done

  # The section title is kept from the original; kubedns_ips holds the
  # network-debug pod IPs in this script.
  printf '\n\n\n--- === Test kube-dns (debug ns) ===\n' &>> "${_file}"
  for ip in "${kubedns_ips[@]}"; do
    {
      printf '\n\n\n--- nsenter -n -t %s ping -c3 %s\n' "${nannypid}" "${ip}"
      nsenter -n -t "${nannypid}" ping -c3 "${ip}"
    } &>> "${_file}"
  done

  printf '\n\n\n--- === Test ping masters (debug ns) ===\n' &>> "${_file}"
  for ip in "${master_ips[@]}"; do
    {
      printf '\n\n\n--- nsenter -n -t %s ping -c3 %s\n' "${nannypid}" "${ip}"
      nsenter -n -t "${nannypid}" ping -c3 "${ip}"
    } &>> "${_file}"
  done
else
  echo "no debug container (sleep 99999) found on this node; debug ns tests skipped" &>> "${_file}"
fi
# stop the running tcpdumps
# Each variable holds the name of the transient systemd unit running one
# capture. An empty name means that capture never started; calling
# `systemctl stop` without a unit would only add a usage error to the report.
for capture_unit in "${tcpdump_flannel}" "${tcpdump_vxlan}" "${tcpdump_any}"; do
  if [[ -z "${capture_unit}" ]]; then
    printf '\n\n\n--- capture unit name is empty, nothing to stop\n' &>> "${_file}"
    continue
  fi
  {
    printf '\n\n\n--- systemctl stop %s\n' "${capture_unit}"
    systemctl stop "${capture_unit}"
  } &>> "${_file}"
done
#
# Collect flanneld logs
#
# The header uses a single-quoted printf format so the inner double quotes
# around "7 days ago" are printed literally; nested inside a double-quoted
# echo they ended the string early and vanished from the report.
{
  printf '\n\n\n--- journalctl -u flanneld --since="7 days ago" --no-pager\n'
  journalctl -u flanneld --since="7 days ago" --no-pager
} &>> "${_file}"
echo "Script completed..."

Run these steps on all nodes in the cluster. It's important to capture this information from every node in order to identify the misconfiguration.

  1. Copy debug-gravity-network.sh to the server using preferred method, under /var/lib/gravity/planet/share/debug-gravity-network.sh

  2. Set execute permissions on the script: `chmod +x /var/lib/gravity/planet/share/debug-gravity-network.sh`

  3. Get a shell inside gravity: `gravity shell`

  4. Run the script: /var/lib/gravity/planet/share/debug-gravity-network.sh

Sample Output:

Saving report to /var/lib/gravity/planet/share/gravity-network-debug_20190913_204813.kevin-test1.txt
Saving flannel capture to /var/lib/gravity/planet/share/gravity-network-debug_20190913_204813.kevin-test1.flannel.pcap
Saving vxlan capture to /var/lib/gravity/planet/share/gravity-network-debug_20190913_204813.kevin-test1.vxlan.pcap

Collecting system information...
Collecting flannel / network configuration...
Testing network...
Script completed...

Note: this can take a couple minutes to complete.

  1. Exit the gravity shell: `exit`

  2. Copy the 3 files produced off the system and attach to the ticket:

From the above example:
/var/lib/gravity/planet/share/gravity-network-debug_20190913_204813.kevin-test1.txt
/var/lib/gravity/planet/share/gravity-network-debug_20190913_204813.kevin-test1.flannel.pcap
/var/lib/gravity/planet/share/gravity-network-debug_20190913_204813.kevin-test1.vxlan.pcap
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment