Demo Allocated SR-IOV VFs Reset Issue
#!/bin/bash
# Author: Or Mergi <[email protected]>
#
# This script runs a demonstration to reproduce the following issue:
# https://github.com/k8snetworkplumbingwg/sriov-cni/issues/219
#
# Warnings:
# * This script creates pods and reconfigures your cluster node's
#   SR-IOV Physical Function (PF) interface and its Virtual Functions (VFs).
# * Do not run this on a production cluster.
# * Make sure no VF is attached to any workload on the node you picked.
#
# Requirements:
# - K8S cluster (KinD [1] or OpenShift [2]) with SR-IOV network hardware.
# - SR-IOV CNI plugin [3] and SR-IOV Network device plugin [4].
# - Multus [5].
# - Namespace admin privileges.
# - NetworkAttachmentDefinition on the target Namespace (see the example below).
#
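# A minimal NetworkAttachmentDefinition sketch (an assumption, not part of this
# script: it uses the resource name "kubevirt.io/sriov_net" requested by the pod
# templates below; adjust name, namespace, and resourceName to your device
# plugin configuration):
#
#   apiVersion: k8s.cni.cncf.io/v1
#   kind: NetworkAttachmentDefinition
#   metadata:
#     name: sriov-network
#     annotations:
#       k8s.v1.cni.cncf.io/resourceName: kubevirt.io/sriov_net
#   spec:
#     config: '{
#         "cniVersion": "0.3.1",
#         "type": "sriov",
#         "ipam": {}
#       }'
#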
# Usage:
# export CLUSTER_TYPE="openshift" \
# TEST_NAMESPACE="test-sriov-issue-ocp4" \
# SRIOV_DP_NS="openshift-sriov-network-operator" \
# NODE_NAME="ocp-worker1.example.com" \
# PF_NAME="enp3s0f0" \
# NET_NAME="sriov-network"
# ./demo-allocated-vfs-reset-issue.sh
#
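# Pre-flight sanity checks (a sketch using the kind defaults below; swap in
# your own namespace, node, and PF names):
#   kubectl get net-attach-def -n sriov-cni-test sriov-network
#   kubectl get node sriov-worker2 -o jsonpath='{.status.allocatable}'
#   docker exec sriov-worker2 cat /sys/class/net/enp4s0f1/device/sriov_numvfs
#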
# View logs:
# $ ls logs
# 2022-08-30-10:50:33.1661856633
# $ tree logs/2022-08-30-10:50:33.1661856633
# ├── demo-allocated-vfs-reset-issue.sh.log <---------------- execution log
# ├── host-journal-dmesg.log
# ├── node-ocp-worker1.example.com-dmesg.log
# ├── node-ocp-worker1.example.com-journal.log
# ├── node-ocp-worker1.example.com-pf-enp3s0f0-vfs-id-map.txt
# ├── node-ocp-worker1.example.com-pf-enp3s0f0-vfs-state.txt
# ├── overview.txt
# ├── pod-test-sriov-issue-ocp4_testpod-5wpph-description.txt
# ├── pod-test-sriov-issue-ocp4_testpod-5wpph-env.txt
# ├── pod-test-sriov-issue-ocp4_testpod-5wpph.yaml
# ├── pod-openshift-sriov-network-operator-kube-sriov-device-plugin-amd64-g4q8c.log
# └── trials
#
# [1] https://github.com/kubernetes-sigs/kind
# [2] https://docs.openshift.com/container-platform/4.11/welcome/index.html
# [3] https://github.com/k8snetworkplumbingwg/sriov-cni
# [4] https://github.com/k8snetworkplumbingwg/sriov-network-device-plugin
# [5] https://github.com/k8snetworkplumbingwg/multus-cni
set -ex
# CLUSTER_TYPE options:
# "kind" - https://github.com/kubernetes-sigs/kind
# "openshift" - https://docs.openshift.com/container-platform/4.11/welcome/index.html
CLUSTER_TYPE="${CLUSTER_TYPE:-kind}"
# TEST_NAMESPACE target namespace where pods will be created
TEST_NAMESPACE="${TEST_NAMESPACE:-sriov-cni-test}"
# SRIOV_DP_NS sriov-network-device-plugin pods namespace name
SRIOV_DP_NS="${SRIOV_DP_NS:-sriov}"
# NODE_NAME cluster node name where pods will be created and VFs configured
NODE_NAME="${NODE_NAME:-sriov-worker2}"
# PF_NAME cluster node PF interface name
PF_NAME="${PF_NAME:-enp4s0f1}"
# NetworkAttachmentDefinition name
NET_NAME="${NET_NAME:-sriov-network}"
# SRIOV_CNI_BIN sriov-cni plugin binary path on the target node
SRIOV_CNI_BIN=${SRIOV_CNI_BIN:-"/opt/cni/bin/sriov"}
DEFAULT_MAC_ADDRESS="${DEFAULT_MAC_ADDRESS:-00:00:00:00:00:00}"
LOGS_DIR="${LOGS_DIR:-logs}"
# select the cluster API client and node-exec commands according to the cluster type
CTL=""
CTL_NODE_EXEC=""
if [[ ${CLUSTER_TYPE} == kind ]]; then
    CTL="kubectl"
    CTL_NODE_EXEC="docker exec ${NODE_NAME}"
elif [[ ${CLUSTER_TYPE} == openshift ]]; then
    CTL="oc"
    CTL_NODE_EXEC="oc debug node/${NODE_NAME} -n ${TEST_NAMESPACE} --"
    SRIOV_CNI_BIN="/var/lib/cni/bin/sriov"
fi
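# e.g. with CLUSTER_TYPE=kind the node commands below expand to:
#   docker exec <node> <command>
# and with CLUSTER_TYPE=openshift to:
#   oc debug node/<node> -n <namespace> -- <command>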
# node_pf_vf_sys_devices returns the PF associated VF sys devices links
node_pf_vf_sys_devices() {
    ${CTL_NODE_EXEC} sh -c "ls -la /sys/class/net/${PF_NAME}/device/virtfn*"
}
# vf_mac_by_pci returns the MAC address of the VF with the given PCI address.
# The VF ID is derived from the PF-to-VF device link, for example:
#   ls /sys/class/net/<PF NAME>/device/virtfn* -la
#   lrwxrwxrwx. 1 root root 0 Aug 10 13:02 /sys/class/net/<PF NAME>/device/virtfn4 -> ../0000:04:02.4
# (virtfn<N> encodes the VF ID; the link target is the VF PCI address)
# ref: https://support.mellanox.com/s/article/howto-map-vf-number-and-the-vm-in-sr-iov-mode
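# The MAC is then parsed from the matching "vf <N>" line of `ip link show`, e.g.:
#   vf 4     link/ether 02:02:02:02:02:02 brd ff:ff:ff:ff:ff:ff, spoof checking on, link-state auto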
vf_mac_by_pci() {
    local -r pci_address=$1
    vf_index=$(node_pf_vf_sys_devices | grep ${pci_address} | grep -Po "virtfn\K\d+")
    vf_state=$(${CTL_NODE_EXEC} ip link show ${PF_NAME} | grep -w "vf ${vf_index}")
    echo "${vf_state}" | grep -Po "link/ether\s\K([0-9a-fA-F]{2}:?){6}"
}
# assert_pod_vf_mac_address_is_default succeeds if the given pod's VF MAC address is the default.
assert_pod_vf_mac_address_is_default() {
    local -r ns=$1
    local -r name=$2
    vf_pci_address=$(${CTL} exec ${name} -n ${ns} -- env | grep -Po "PCIDEVICE.*=\K.*")
    vf_mac_address=$(vf_mac_by_pci ${vf_pci_address})
    [[ ${vf_mac_address} == ${DEFAULT_MAC_ADDRESS} ]]
}
node_pf_numvfs() {
    ${CTL_NODE_EXEC} cat "/sys/class/net/${PF_NAME}/device/sriov_numvfs"
}
set_vfs_default_mac_address() {
    echo "resetting node \"${NODE_NAME}\" PF \"${PF_NAME}\" VFs MAC addresses..."
    for i in $(seq 0 $(($(node_pf_numvfs)-1))); do
        current_mac=$(${CTL_NODE_EXEC} /usr/sbin/ip link show dev ${PF_NAME} | grep -w "vf ${i}" | grep -Po "link/ether\s\K([0-9a-fA-F]{2}:?){6}")
        if [[ ${current_mac} != ${DEFAULT_MAC_ADDRESS} ]]; then
            ${CTL_NODE_EXEC} sh -c "/usr/sbin/ip link set ${PF_NAME} vf ${i} mac ${DEFAULT_MAC_ADDRESS} state auto"
        fi
    done
}
# dump_artifacts exports the pod and node state and logs.
dump_artifacts() {
    local -r pod_ns=$1
    local -r pod_name=$2
    local -r test_start_timestamp=$3
    set +e
    # duration of logs to collect, padded one minute before the trial start time
    log_time_offset_sec=60
    now_unix_sec=$(date +%s)
    collect_logs_since_sec=$(( (now_unix_sec - test_start_timestamp) + log_time_offset_sec ))
    # node VFs state
    ${CTL_NODE_EXEC} sh -c "ls -la /sys/class/net/${PF_NAME}/device/virtfn*" > "node-${NODE_NAME}-pf-${PF_NAME}-vfs-id-map.txt"
    ${CTL_NODE_EXEC} ip link show ${PF_NAME} > "node-${NODE_NAME}-pf-${PF_NAME}-vfs-state.txt"
    # pod state
    ${CTL} get pod -A -o wide > overview.txt
    ${CTL} get pod -n ${pod_ns} ${pod_name} -o yaml > "pod-${pod_ns}_${pod_name}.yaml"
    ${CTL} describe pod -n ${pod_ns} ${pod_name} > "pod-${pod_ns}_${pod_name}-description.txt"
    ${CTL} exec -n ${pod_ns} ${pod_name} -- env > "pod-${pod_ns}_${pod_name}-env.txt"
    # sriov device plugin logs
    sriov_dp_pod=$(${CTL} get po -n ${SRIOV_DP_NS} -l app=sriovdp -o wide --no-headers | grep -w ${NODE_NAME} | awk '{print $1}')
    ${CTL} logs -n ${SRIOV_DP_NS} ${sriov_dp_pod} --since="${collect_logs_since_sec}s" > "pod-${SRIOV_DP_NS}_${sriov_dp_pod}.log"
    # node journal log
    ${CTL_NODE_EXEC} journalctl --since "${collect_logs_since_sec} seconds ago" --system --user > "node-${NODE_NAME}-journal.log"
    # node kernel log
    ${CTL_NODE_EXEC} dmesg --decode --userspace --kernel --ctime > "node-${NODE_NAME}-dmesg.log"
    # host kernel log
    journalctl --since "${collect_logs_since_sec} seconds ago" --dmesg > "host-journal-dmesg.log"
    set -e
}
# reproduce_sriov_cni_issue
# 1. Create a pod with an explicit MAC address and wait for it to become ready.
# 2. Delete the pod in the background.
# 3. Create a similar pod with an explicit MAC address and wait for it to become ready.
# 4. Check the VFs state on the node where the pod is created.
#    If all VFs have the default MAC address, the issue is reproduced
#    and the pod's VF ended up without the requested MAC address.
reproduce_sriov_cni_issue() {
    local -r ns=$1
    local -r artifacts_dir=$2
    ## ensure no leftovers from previous executions
    ${CTL} delete po -n ${ns} -l test=pod1
    ${CTL} delete po -n ${ns} -l test=pod2
    timeout 30s sh -c "until ${CTL} get po -n $ns 2>&1 | grep \"No resources found\"; do echo \"waiting for pods at namespace \"${ns}\" to terminate...\"; sleep 1; done"
    ## start
    set_vfs_default_mac_address
    local trials=0
    local start_timestamp=""
    while true; do
        trials=$((trials+1))
        start_timestamp="$(date +%s)"
        ## create and wait for pod to become ready
        ${CTL} create -n ${ns} -f pod1.yaml
        ${CTL} wait pod -n ${ns} -l test=pod1 --for condition=ContainersReady --timeout 2m
        ## delete the pod in the background
        ${CTL} delete pod -n ${ns} -l test=pod1 &
        ## create a similar pod
        ${CTL} create -n ${ns} -f pod2.yaml
        ${CTL} wait pod -n ${ns} -l test=pod2 --for condition=ContainersReady --timeout 2m
        ## print pod events
        for pod in $(${CTL} get pod -n ${ns} -l test=pod2 --no-headers | awk '{print $1}'); do
            ${CTL} get event -n ${ns} | grep "pod/${pod}"
        done
        ## print pods and VFs state
        ${CTL} get po -n ${ns} -o wide
        ${CTL_NODE_EXEC} ip link show ${PF_NAME}
        for pod_name in $(${CTL} get po -n ${ns} --no-headers | awk '{print $1}'); do
            if assert_pod_vf_mac_address_is_default "${ns}" "${pod_name}"; then
                pushd "${artifacts_dir}"
                echo "issue reproduced after ${trials} trials" > trials
                dump_artifacts "${ns}" "${pod_name}" "${start_timestamp}"
                popd
                return
            fi
        done
        ## clean up
        ${CTL} delete pod -n ${ns} --all
    done
}
artifacts_dir="${LOGS_DIR}/$(date +%Y-%m-%d-%H:%M:%S.%s)"
mkdir -p "${artifacts_dir}"
(
    # print sriov-cni binary stats
    ${CTL_NODE_EXEC} ls -lah "${SRIOV_CNI_BIN}"
    ${CTL_NODE_EXEC} sha1sum "${SRIOV_CNI_BIN}"
    # generate pod manifests from templates
    export TEST_NAMESPACE NODE_NAME NET_NAME
    envsubst < "pod1.yaml.tmpl" > "pod1.yaml"
    envsubst < "pod2.yaml.tmpl" > "pod2.yaml"
    reproduce_sriov_cni_issue "${TEST_NAMESPACE}" "${artifacts_dir}"
) 2>&1 | tee "${artifacts_dir}/$(basename $0).log"
# pod1.yaml.tmpl
apiVersion: v1
kind: Pod
metadata:
  generateName: testpod-
  labels:
    test: pod1
  annotations:
    k8s.v1.cni.cncf.io/networks: '[{"interface":"net1","mac":"02:02:02:02:02:02","name":"$NET_NAME","namespace":"$TEST_NAMESPACE"}]'
spec:
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchFields:
          - key: metadata.name
            operator: In
            values:
            - $NODE_NAME
  terminationGracePeriodSeconds: 0
  containers:
  - name: test
    image: fedora
    command: ["sleep", "1000000"]
    resources:
      limits:
        kubevirt.io/sriov_net: "1"
      requests:
        kubevirt.io/sriov_net: "1"
# pod2.yaml.tmpl
apiVersion: v1
kind: Pod
metadata:
  generateName: testpod-
  labels:
    test: pod2
  annotations:
    k8s.v1.cni.cncf.io/networks: '[{"interface":"net1","mac":"02:02:02:02:02:02","name":"$NET_NAME","namespace":"$TEST_NAMESPACE"}]'
spec:
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchFields:
          - key: metadata.name
            operator: In
            values:
            - $NODE_NAME
  terminationGracePeriodSeconds: 0
  containers:
  - name: test
    image: fedora
    command: ["sleep", "1000000"]
    resources:
      limits:
        kubevirt.io/sriov_net: "1"
      requests:
        kubevirt.io/sriov_net: "1"