Last active
August 30, 2022 14:06
-
-
Save ormergi/3ddbf901ddc95baf316b604994285a69 to your computer and use it in GitHub Desktop.
Demo Allocated SR-IOV VFs Reset Issue
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Author: Or Mergi <[email protected]> | |
# | |
# This script runs a demonstration to reproduce the following issue: | |
# https://github.com/k8snetworkplumbingwg/sriov-cni/issues/219 | |
# | |
# Warnings: | |
# * This script creates pods and reconfigures your cluster node's
# SR-IOV Physical Function (PF) interface and its Virtual Functions (VF). | |
# * Do not run this on production cluster. | |
# * Make sure no VF is attached to any workload on the node you picked. | |
# | |
# Requirements: | |
# - K8S cluster (KinD [1] or Openshift [2]) with SR-IOV network hardware. | |
# - SR-IOV CNI plugin [3] and SR-IOV Network device plugin [4]. | |
# - Multus [5]. | |
# - Namespace admin privileges. | |
# - NetworkAttachmentDefinition on target Namespace. | |
# | |
# Usage: | |
# export CLUSTER_TYPE="openshift" \ | |
# TEST_NAMESPACE="test-sriov-issue-ocp4" \ | |
# SRIOV_DP_NS="openshift-sriov-network-operator" \ | |
# NODE_NAME="ocp-worker1.example.com" \ | |
# PF_NAME="enp3s0f0" \ | |
# NET_NAME="sriov-network" | |
# ./demo-allocated-vfs-reset-issue.sh | |
# | |
# View logs: | |
# $ ls logs | |
# 2022-08-30-10:50:33.1661856633 | |
# $ tree logs/2022-08-30-10:50:33.1661856633 | |
# ├── demo-allocated-vfs-reset-issue.sh.log <---------------- execution log | |
# ├── host-journal-dmesg.log
# ├── node-ocp-worker1.example.com-dmesg.log | |
# ├── node-ocp-worker1.example.com-journal.log | |
# ├── node-ocp-worker1.example.com-pf-enp3s0f0-vfs-id-map.txt | |
# ├── node-ocp-worker1.example.com-pf-enp3s0f0-vfs-state.txt | |
# ├── overview.txt | |
# ├── pod-test-sriov-issue-ocp4_testpod-5wpph-description.txt | |
# ├── pod-test-sriov-issue-ocp4_testpod-5wpph-env.txt | |
# ├── pod-test-sriov-issue-ocp4_testpod-5wpph.yaml | |
# ├── pod-openshift-sriov-network-operator_kube-sriov-device-plugin-amd64-g4q8c.log
# └── trials | |
# | |
# [1] https://github.com/kubernetes-sigs/kind | |
# [2] https://docs.openshift.com/container-platform/4.11/welcome/index.html | |
# [3] https://github.com/k8snetworkplumbingwg/sriov-cni | |
# [4] https://github.com/k8snetworkplumbingwg/sriov-network-device-plugin
# [5] https://github.com/k8snetworkplumbingwg/multus-cni | |
#! /bin/bash | |
# Abort on the first failing command and trace every command that is executed.
set -ex

# CLUSTER_TYPE options:
#   "kind"      - https://github.com/kubernetes-sigs/kind
#   "openshift" - https://docs.openshift.com/container-platform/4.11/welcome/index.html
#                 ("ocp" is accepted as an alias)
CLUSTER_TYPE="${CLUSTER_TYPE:-kind}"
# TEST_NAMESPACE target namespace where pods will be created
TEST_NAMESPACE="${TEST_NAMESPACE:-sriov-cni-test}"
# SRIOV_DP_NS sriov-network-device-plugin pods namespace name
SRIOV_DP_NS="${SRIOV_DP_NS:-sriov}"
# NODE_NAME cluster node name where pods will be created and VFs configured
NODE_NAME="${NODE_NAME:-sriov-worker2}"
# PF_NAME cluster node PF interface name
PF_NAME="${PF_NAME:-enp4s0f1}"
# NET_NAME NetworkAttachmentDefinition name
NET_NAME="${NET_NAME:-sriov-network}"
# SRIOV_CNI_BIN sriov-cni plugin binary path on the target node
SRIOV_CNI_BIN="${SRIOV_CNI_BIN:-/opt/cni/bin/sriov}"
# DEFAULT_MAC_ADDRESS MAC address a VF is compared against / reset to
DEFAULT_MAC_ADDRESS="${DEFAULT_MAC_ADDRESS:-00:00:00:00:00:00}"
# LOGS_DIR parent directory of the per-run artifacts directory
LOGS_DIR="${LOGS_DIR:-logs}"

# CTL is the cluster API client command; CTL_NODE_EXEC is the command prefix
# used to run a command on the target node. Both are plain strings that the
# rest of the script expands unquoted (relying on word splitting), so they must
# not contain embedded quote characters.
CTL=""
CTL_NODE_EXEC=""

# configure_cluster_client sets CTL and CTL_NODE_EXEC according to CLUSTER_TYPE.
# Globals:
#   CLUSTER_TYPE, NODE_NAME, TEST_NAMESPACE (read)
#   CTL, CTL_NODE_EXEC, SRIOV_CNI_BIN (written)
# Returns:
#   0 on success, 1 when CLUSTER_TYPE is not a supported value.
configure_cluster_client() {
  case "${CLUSTER_TYPE}" in
    kind)
      CTL="kubectl"
      CTL_NODE_EXEC="docker exec ${NODE_NAME}"
      ;;
    openshift | ocp)
      CTL="oc"
      # No escaped quotes around node/<name>: after word splitting they would
      # reach "oc" as literal quote characters.
      CTL_NODE_EXEC="oc debug node/${NODE_NAME} -n ${TEST_NAMESPACE} --"
      # On this cluster type the binary path is always set to this location,
      # overriding any SRIOV_CNI_BIN value assigned earlier.
      SRIOV_CNI_BIN="/var/lib/cni/bin/sriov"
      ;;
    *)
      # With an empty CTL_NODE_EXEC the node commands would run on the local
      # host, so refuse to continue.
      echo "unsupported CLUSTER_TYPE \"${CLUSTER_TYPE}\" (expected \"kind\" or \"openshift\")" >&2
      return 1
      ;;
  esac
}

configure_cluster_client
# node_pf_vf_sys_devices lists, in "ls -la" long format, the virtfn* entries
# found under the PF device directory on the target node.
# Globals:
#   CTL_NODE_EXEC, PF_NAME (read)
# Outputs:
#   the "ls -la" listing on stdout
node_pf_vf_sys_devices() {
  # The glob stays inside the double-quoted string so that it is expanded by
  # the "sh" started through CTL_NODE_EXEC, not by this shell.
  local -r virtfn_links="/sys/class/net/${PF_NAME}/device/virtfn*"
  ${CTL_NODE_EXEC} sh -c "ls -la ${virtfn_links}"
}
# vf_mac_by_pci prints the MAC address of the VF that has the given PCI address.
# The VF index is derived from the PF-to-VF device link, for example:
#   ls /sys/class/net/<PF NAME>/device/virtfn* -la
#   lrwxrwxrwx. 1 root root 0 Aug 10 13:02 /sys/class/net/<PF NAME>/device/virtfn4 -> ../0000:04:02.4
#                                                                            ^             ^^^^^^^^^^^^
#                                                                         VF index        VF PCI address
# ref: https://support.mellanox.com/s/article/howto-map-vf-number-and-the-vm-in-sr-iov-mode
# Globals:
#   CTL_NODE_EXEC, PF_NAME (read)
# Arguments:
#   $1 - VF PCI address, e.g. 0000:04:02.4
# Outputs:
#   the VF MAC address on stdout; diagnostics on stderr
# Returns:
#   0 when a MAC address was found, 1 otherwise
vf_mac_by_pci() {
  local -r pci_address=$1
  local vf_index
  local vf_mac

  if [[ -z "${pci_address}" ]]; then
    echo "vf_mac_by_pci: missing VF PCI address" >&2
    return 1
  fi

  # Compare the link target literally (no regex) and keep the whole numeric
  # suffix of "virtfnN", so VF indexes with more than one digit are not cut.
  vf_index=$(node_pf_vf_sys_devices | awk -v pci="${pci_address}" '
    {
      count = split($NF, target, "/")
      if (target[count] == pci) {
        link = $(NF - 2)
        sub(/.*virtfn/, "", link)
        print link
        exit
      }
    }')
  if [[ -z "${vf_index}" ]]; then
    echo "vf_mac_by_pci: no VF with PCI address \"${pci_address}\" on PF \"${PF_NAME}\"" >&2
    return 1
  fi

  # Only look at the PF's own VF table and match the index field exactly, so
  # that "vf 1" does not also select "vf 10".."vf 19".
  vf_mac=$(${CTL_NODE_EXEC} ip link show dev "${PF_NAME}" | awk -v idx="${vf_index}" '
    $1 == "vf" && $2 == idx {
      for (i = 3; i < NF; i++) {
        if ($i == "link/ether") {
          mac = $(i + 1)
          sub(/,$/, "", mac)
          print mac
          exit
        }
      }
    }')
  if [[ -z "${vf_mac}" ]]; then
    echo "vf_mac_by_pci: no MAC address found for VF ${vf_index} of PF \"${PF_NAME}\"" >&2
    return 1
  fi

  echo "${vf_mac}"
}
# assert_pod_vf_mac_address_is_default succeeds when the MAC address of the VF
# attached to the given pod equals DEFAULT_MAC_ADDRESS.
# The VF PCI address is read from the first PCIDEVICE* variable found in the
# pod environment.
# Globals:
#   CTL, DEFAULT_MAC_ADDRESS (read)
# Arguments:
#   $1 - pod namespace
#   $2 - pod name
# Returns:
#   0 when the VF MAC address is the default one; 1 when it differs or when
#   the PCI address / MAC address could not be determined.
assert_pod_vf_mac_address_is_default() {
  local -r ns=$1
  local -r name=$2
  local vf_pci_address
  local vf_mac_address

  vf_pci_address=$(${CTL} exec "${name}" -n "${ns}" -- env \
    | sed -n 's/^PCIDEVICE[^=]*=//p' \
    | head -n 1)
  if [[ -z "${vf_pci_address}" ]]; then
    echo "pod ${ns}/${name}: no PCIDEVICE variable found in the pod environment" >&2
    return 1
  fi

  vf_mac_address=$(vf_mac_by_pci "${vf_pci_address}") || return 1

  # Quote the right-hand side so it is compared literally, not as a glob.
  [[ "${vf_mac_address}" == "${DEFAULT_MAC_ADDRESS}" ]]
}
# node_pf_numvfs prints the content of the PF "sriov_numvfs" sysfs attribute,
# read on the target node.
# Globals:
#   CTL_NODE_EXEC, PF_NAME (read)
node_pf_numvfs() {
  local -r numvfs_path="/sys/class/net/${PF_NAME}/device/sriov_numvfs"
  ${CTL_NODE_EXEC} cat "${numvfs_path}"
}
# set_vfs_default_mac_address sets the MAC address of every VF of the PF to
# DEFAULT_MAC_ADDRESS (with "state auto"), skipping VFs that already have it.
# Globals:
#   CTL_NODE_EXEC, NODE_NAME, PF_NAME, DEFAULT_MAC_ADDRESS (read)
# Returns:
#   0 on success, non-zero when the VF count or the PF link state cannot be
#   read, or when resetting a VF fails.
set_vfs_default_mac_address() {
  local num_vfs
  local link_state
  local current_mac
  local i

  echo "resetting node \"${NODE_NAME}\" PF \"${PF_NAME}\" VFs MAC addresses..."

  num_vfs=$(node_pf_numvfs) || return 1
  if ! [[ "${num_vfs}" =~ ^[0-9]+$ ]]; then
    echo "unexpected VF count \"${num_vfs}\" for PF \"${PF_NAME}\"" >&2
    return 1
  fi

  # Read the PF link state once: every call goes through CTL_NODE_EXEC, and
  # resetting one VF does not change the MAC address reported for the others.
  link_state=$(${CTL_NODE_EXEC} /usr/sbin/ip link show dev "${PF_NAME}") || return 1

  for ((i = 0; i < num_vfs; i++)); do
    # Match the VF index field exactly so "vf 1" does not also select
    # "vf 10".."vf 19".
    current_mac=$(awk -v idx="${i}" '
      $1 == "vf" && $2 == idx {
        for (f = 3; f < NF; f++) {
          if ($f == "link/ether") {
            mac = $(f + 1)
            sub(/,$/, "", mac)
            print mac
            exit
          }
        }
      }' <<< "${link_state}")

    # A VF whose MAC could not be parsed is reset as well.
    if [[ "${current_mac}" != "${DEFAULT_MAC_ADDRESS}" ]]; then
      ${CTL_NODE_EXEC} sh -c "/usr/sbin/ip link set ${PF_NAME} vf ${i} mac ${DEFAULT_MAC_ADDRESS} state auto" || return 1
    fi
  done
}
# dump_artifacts exports the pod and node state and logs into files in the
# current working directory. Collection is best effort: a failing command
# does not stop the remaining ones.
# Globals:
#   CTL, CTL_NODE_EXEC, NODE_NAME, PF_NAME, SRIOV_DP_NS (read)
# Arguments:
#   $1 - pod namespace
#   $2 - pod name
#   $3 - test start time, in seconds since the Epoch
dump_artifacts() {
  local -r pod_ns=$1
  local -r pod_name=$2
  local -r test_start_timestamp=$3
  # duration time of logs that will be collected before start time
  local -r log_time_offset_sec=60
  local errexit_was_set="false"
  local now_unix_sec
  local collect_logs_since_sec
  local sriov_dp_pod

  # Remember whether errexit is active so it can be restored on return.
  if [[ $- == *e* ]]; then
    errexit_was_set="true"
  fi
  set +e

  now_unix_sec=$(date +%s)
  collect_logs_since_sec=$(( (now_unix_sec - test_start_timestamp) + log_time_offset_sec ))

  # node VFs state
  ${CTL_NODE_EXEC} sh -c "ls -la /sys/class/net/${PF_NAME}/device/virtfn*" > "node-${NODE_NAME}-pf-${PF_NAME}-vfs-id-map.txt"
  ${CTL_NODE_EXEC} ip link show "${PF_NAME}" > "node-${NODE_NAME}-pf-${PF_NAME}-vfs-state.txt"

  # pod state
  ${CTL} get pod -A -o wide > overview.txt
  ${CTL} get pod -n "${pod_ns}" "${pod_name}" -o yaml > "pod-${pod_ns}_${pod_name}.yaml"
  ${CTL} describe pod -n "${pod_ns}" "${pod_name}" > "pod-${pod_ns}_${pod_name}-description.txt"
  ${CTL} exec -n "${pod_ns}" "${pod_name}" -- env > "pod-${pod_ns}_${pod_name}-env.txt"

  # sriov device plugin logs (of the plugin pod listed on the target node)
  sriov_dp_pod=$(${CTL} get po -n "${SRIOV_DP_NS}" -l app=sriovdp -o wide --no-headers \
    | grep -w -- "${NODE_NAME}" \
    | awk '{print $1}')
  if [[ -n "${sriov_dp_pod}" ]]; then
    # shellcheck disable=SC2086 -- word splitting lets each plugin pod become its own argument
    ${CTL} logs -n "${SRIOV_DP_NS}" ${sriov_dp_pod} --since="${collect_logs_since_sec}s" > "pod-${SRIOV_DP_NS}_${sriov_dp_pod}.log"
  else
    echo "no sriov device plugin pod found on node \"${NODE_NAME}\" in namespace \"${SRIOV_DP_NS}\"" >&2
  fi

  # journal log
  ${CTL_NODE_EXEC} journalctl --since "${collect_logs_since_sec} seconds ago" --system --user > "node-${NODE_NAME}-journal.log"
  # node kernel log
  ${CTL_NODE_EXEC} dmesg --decode --userspace --kernel --ctime > "node-${NODE_NAME}-dmesg.log"
  # host kernel log
  journalctl --since "${collect_logs_since_sec} seconds ago" --dmesg > "host-journal-dmesg.log"

  if [[ "${errexit_was_set}" == "true" ]]; then
    set -e
  fi
}
# reproduce_sriov_cni_issue repeats the following steps until a pod VF is
# found with the default MAC address:
#   1. create a pod with an explicit MAC address and wait for it to become ready.
#   2. delete the pod in the background.
#   3. create a similar pod with an explicit MAC address and wait for it to
#      become ready.
#   4. check the VF state of every pod left in the namespace.
#      If a pod VF has the default MAC address, the issue is reproduced:
#      the pod VF ended up without the requested MAC address.
# When the issue is reproduced, the number of trials is written to the file
# "trials" and the artifacts are dumped, both inside the artifacts directory.
# Globals:
#   CTL, CTL_NODE_EXEC, PF_NAME (read)
# Arguments:
#   $1 - namespace where the pods are created
#   $2 - directory where the artifacts are stored
reproduce_sriov_cni_issue() {
  local -r ns=$1
  local -r artifacts_dir=$2
  local trials=0
  local start_timestamp=""
  local delete_pid
  local pod
  local pod_name

  ## ensure no leftover from previous executions
  ${CTL} delete po -n "${ns}" -l test=pod1
  ${CTL} delete po -n "${ns}" -l test=pod2
  timeout 30s sh -c "until ${CTL} get po -n ${ns} 2>&1 | grep \"No resources found\"; do echo \"waiting for pods at namespace ${ns} to dispose..\"; sleep 1; done"

  ## start
  set_vfs_default_mac_address

  while true; do
    trials=$((trials + 1))
    start_timestamp="$(date +%s)"

    ## create and wait for pod to become ready
    ${CTL} create -n "${ns}" -f pod1.yaml
    ${CTL} wait pod -n "${ns}" -l test=pod1 --for condition=ContainersReady --timeout 2m

    ## delete the pod in background
    ${CTL} delete pod -n "${ns}" -l test=pod1 &
    delete_pid=$!

    ## create similar pod
    ${CTL} create -n "${ns}" -f pod2.yaml
    ${CTL} wait pod -n "${ns}" -l test=pod2 --for condition=ContainersReady --timeout 2m

    ## reap the background delete before inspecting the pods, so that the
    ## first pod is not examined while it is still being removed
    wait "${delete_pid}" || echo "background delete of pods labeled test=pod1 failed" >&2

    ## print pod events; having no matching event must not abort the run
    for pod in $(${CTL} get pod -n "${ns}" -l test=pod2 --no-headers | awk '{print $1}'); do
      ${CTL} get event -n "${ns}" | grep "pod/${pod}" || true
    done

    ## print pods and VFs state
    ${CTL} get po -n "${ns}" -o wide
    ${CTL_NODE_EXEC} ip link show "${PF_NAME}"

    for pod_name in $(${CTL} get po -n "${ns}" --no-headers | awk '{print $1}'); do
      if assert_pod_vf_mac_address_is_default "${ns}" "${pod_name}"; then
        pushd "${artifacts_dir}" > /dev/null || return 1
        echo "issue reproduced after ${trials} trials" > trials
        dump_artifacts "${ns}" "${pod_name}" "${start_timestamp}"
        popd > /dev/null || return 1
        return
      fi
    done

    ## clean up
    ${CTL} delete pod -n "${ns}" --all
  done
}
# Every run stores its artifacts in a dedicated directory named after the
# current date, time and seconds since the Epoch.
artifacts_dir="${LOGS_DIR}/$(date +%Y-%m-%d-%H:%M:%S.%s)"
mkdir -p "${artifacts_dir}"

# Run the demonstration in a subshell so that all of its output (stdout and
# stderr) is both shown and saved to the execution log.
(
  # print sriov-cni binary stats
  ${CTL_NODE_EXEC} ls -lah "${SRIOV_CNI_BIN}"
  ${CTL_NODE_EXEC} sha1sum "${SRIOV_CNI_BIN}"

  # generate pod manifests from templates; only the listed variables are
  # substituted, any other "$" sequence in a template is left untouched
  export TEST_NAMESPACE NODE_NAME NET_NAME
  # shellcheck disable=SC2016 -- the variable names are meant for envsubst, not the shell
  envsubst '${TEST_NAMESPACE} ${NODE_NAME} ${NET_NAME}' < "pod1.yaml.tmpl" > "pod1.yaml"
  # shellcheck disable=SC2016
  envsubst '${TEST_NAMESPACE} ${NODE_NAME} ${NET_NAME}' < "pod2.yaml.tmpl" > "pod2.yaml"

  reproduce_sriov_cni_issue "${TEST_NAMESPACE}" "${artifacts_dir}"
) 2>&1 | tee "${artifacts_dir}/$(basename "$0").log"

# The pipeline status is the one of "tee"; exit with the status of the
# subshell instead so that a failed run is not reported as a success.
exit "${PIPESTATUS[0]}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pod manifest template for the pod labeled "test: pod1".
# $NET_NAME, $TEST_NAMESPACE and $NODE_NAME are placeholders
# (presumably filled in by envsubst before the manifest is created).
apiVersion: v1
kind: Pod
metadata:
  generateName: testpod-
  labels:
    test: pod1
  annotations:
    # secondary network request with an explicit MAC address
    k8s.v1.cni.cncf.io/networks: '[{"interface":"net1","mac":"02:02:02:02:02:02","name":"$NET_NAME","namespace":"$TEST_NAMESPACE"}]'
spec:
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchFields:
          - key: metadata.name
            operator: In
            values:
            - $NODE_NAME
  terminationGracePeriodSeconds: 0
  containers:
  - name: test
    image: fedora
    command: ["sleep", "1000000"]
    resources:
      limits:
        kubevirt.io/sriov_net: "1"
      requests:
        kubevirt.io/sriov_net: "1"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pod manifest template for the pod labeled "test: pod2".
# $NET_NAME, $TEST_NAMESPACE and $NODE_NAME are placeholders
# (presumably filled in by envsubst before the manifest is created).
apiVersion: v1
kind: Pod
metadata:
  generateName: testpod-
  labels:
    test: pod2
  annotations:
    # secondary network request with an explicit MAC address
    k8s.v1.cni.cncf.io/networks: '[{"interface":"net1","mac":"02:02:02:02:02:02","name":"$NET_NAME","namespace":"$TEST_NAMESPACE"}]'
spec:
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchFields:
          - key: metadata.name
            operator: In
            values:
            - $NODE_NAME
  terminationGracePeriodSeconds: 0
  containers:
  - name: test
    image: fedora
    command: ["sleep", "1000000"]
    resources:
      limits:
        kubevirt.io/sriov_net: "1"
      requests:
        kubevirt.io/sriov_net: "1"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment