Last active
July 5, 2024 08:42
-
-
Save roolebo/32ffdbdede0f3c5ada949973ec195a15 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# SPDX-License-Identifier: MIT | |
# Copyright (C) 2024 Roman Bolshakov. | |
# All rights reserved. | |
# | |
# The script was tested on Samsung PM173x with FW EPK9CB5Q and EPK9GB5Q. | |
# | |
# It's recommended to use at least VQ=3 VI=3 with PM173x for decent | |
# single-threaded I/O performance. | |
# | |
# NVMe namespaces can be created and attached to secondary controllers at any | |
# moment, regardless of whether secondary controllers are offline or online.
# | |
# Example invocation to create three secondary NVMe controllers and expose them | |
# to the host: | |
# sudo env DRIVER=nvme VQ=3 VI=3 NUM_VFS=3 ./nvme-vf SERIAL|nvmeX | |
# | |
# NB1! It's recommended to pass NVMe serial as an argument rather than device | |
# name to the script because Linux NVMe controller number is dynamic and may | |
# change across reboots. | |
# | |
# NB2! A number of Primary Flexible Resources should be freed once before using | |
# the script. E.g. to give all Flexible Resources for secondary controllers: | |
# | |
# nvme virt-mgmt /dev/nvmeX -c 0x41 -r 0 -n 0 -a 1 | |
# nvme virt-mgmt /dev/nvmeX -c 0x41 -r 1 -n 0 -a 1 | |
# | |
# Then you need to make a Controller Level Reset. NVMe spec defines multiple | |
# ways to do it but none worked on my machine. I have found a simple reboot as | |
# the most predictable way to apply the changes. | |
# | |
# Alternative ways to perform Controller Level Reset that may or may not work: | |
# | |
# 1. Controller reset: | |
# nvme reset /dev/nvmeX | |
# echo 1 > /sys/bus/pci/rescan | |
# | |
# 2. Subsystem reset (freezes my machine):
# nvme subsystem-reset /dev/nvmeX | |
# | |
# After the reboot/reset you should see the zero-valued vqrfap and virfap in | |
# nvme Primary Controller Capabilities structure: | |
# | |
# nvme primary-ctrl-caps /dev/nvmeX | |
# NVME Identify Primary Controller Capabilities: | |
# cntlid : 0x41 | |
# portid : 0 | |
# crt : 0x3 | |
# vqfrt : 226 | |
# vqrfa : 0 | |
# vqrfap : 0 | |
# vqprt : 64 | |
# vqfrsm : 9 | |
# vqgran : 1 | |
# vifrt : 226 | |
# virfa : 0 | |
# virfap : 0 | |
# viprt : 64 | |
# vifrsm : 9 | |
# vigran : 1 | |
# vigran : 1 | |
# | |
# Now you're good to go to create NVMe VFs with the script. | |
# | |
# Abort on the first unhandled command failure.
set -e

# Tunables; each may be overridden from the environment. The expansions are
# quoted so that the resulting value is never word-split or glob-expanded by
# the ':' no-op command.
#   VQ      - VQ (virtual queue) resources assigned to each secondary controller
#   VI      - VI (virtual interrupt) resources assigned to each secondary
#             controller
#   NUM_VFS - number of virtual functions to enable
#   DRIVER  - kernel driver the VFs are bound to
: "${VQ:=2}"
: "${VI:=1}"
: "${NUM_VFS:=1}"
: "${DRIVER:=vfio-pci}"
# Print an error message to stderr and terminate the script.
# Arguments: $@ - message words; they are joined with single spaces.
# Exits:     always with status 1 (never returns to the caller).
die()
{
    # Join on a space regardless of the caller's IFS.
    local IFS=' '
    # printf instead of echo: a message that starts with -n/-e/-E must be
    # printed literally rather than being parsed as an echo option.
    printf >&2 '%s\n' "$*"
    exit 1
}
# Run `nvme` with the given arguments, discarding its output on success.
# Arguments: $@ - arguments passed to nvme unchanged
# Outputs:   on failure, "ERROR: <combined stdout+stderr of nvme>" on stderr
# Returns:   0 on success, otherwise the exit status of nvme
quiet_nvme()
{
    # Both variables are local so that a call never clobbers a caller's
    # variable named "out" or "rc".
    local out rc=0
    out=$(nvme "$@" 2>&1) || rc=$?
    if [ "$rc" -ne 0 ]; then
        printf >&2 '%s\n' "ERROR: $out"
    fi
    return "$rc"
}
# ---------------------------------------------------------------------------
# Main flow:
#   1. Resolve the NVMe controller from $1 (device name or serial number).
#   2. Validate the requested VQ/VI/VF counts against the values reported by
#      `nvme primary-ctrl-caps` and sriov_totalvfs.
#   3. Enable NUM_VFS virtual functions on the physical function.
#   4. For each VF: offline the secondary controller, assign VQ/VI resources,
#      reset the VF, bring the controller online, set driver_override.
#   5. Bind every VF to $DRIVER.
# ---------------------------------------------------------------------------

# Succeeds when $1 is a non-negative decimal integer.
is_uint()
{
    [[ "$1" =~ ^[0-9]+$ ]]
}

[ -n "${1:-}" ] || die "ERROR: NVMe device or NVMe device serial is not provided"
[ "$UID" == 0 ] || die "ERROR: Need superuser permissions"

for tool in nvme jq; do
    command -v "$tool" >/dev/null || \
        die "ERROR: $tool is required but was not found in PATH"
done

# Reject non-numeric tunables up front. `[ x -gt y ]` with a non-integer
# operand only prints a diagnostic and returns 2, which would silently skip
# the limit checks below.
for knob in VQ VI NUM_VFS; do
    is_uint "${!knob}" || \
        die "ERROR: $knob must be a non-negative integer, got '${!knob}'"
done
# Force base 10 so that a value such as "08" is not rejected as invalid octal
# by the arithmetic expansions below.
VQ=$((10#$VQ))
VI=$((10#$VI))
NUM_VFS=$((10#$NUM_VFS))

# Start from an empty value so that an NVME variable inherited from the
# environment cannot mask an invalid argument.
NVME=
if [ -e "/sys/class/nvme/$1" ]; then
    NVME=$1
else
    # Test if it's a serial
    for ctrl in /sys/class/nvme/*; do
        [ -r "$ctrl/serial" ] || continue
        # `read` strips leading/trailing whitespace from the sysfs value while
        # keeping it a single word for the comparison.
        serial=
        read -r serial < "$ctrl/serial" || true
        if [ "$1" == "$serial" ]; then
            NVME=$(basename "$ctrl")
            break
        fi
    done
fi
[ -n "$NVME" ] || die "ERROR: Invalid NVMe device: $1"

# PCI address of the physical function, e.g. 0000:3b:00.0
PF=$(basename "$(readlink "/sys/class/nvme/$NVME/device")")
PF_SYSFS="/sys/bus/pci/devices/$PF"
# Split "<domain>:<bus>:<dev>.<fn>" without assuming the width of the domain.
PF_BUS=${PF%:*}
PF_DEV=${PF##*:}
PF_DEV=${PF_DEV%.*}

SECONDARY_CTRL_LIST_JSON=$(nvme list-secondary "/dev/$NVME" -o json)

[ -r "/sys/class/nvme/$NVME/device/sriov_totalvfs" ] || \
    die "ERROR: $NVME does not expose sriov_totalvfs (no SR-IOV support?)"
MAX_VFS=$(cat "/sys/class/nvme/$NVME/device/sriov_totalvfs")
[ "$NUM_VFS" -le "$MAX_VFS" ] || \
    die "ERROR: Maximum number of VFs on $NVME is $MAX_VFS."

PRIMARY_CTRL_CAPS_JSON=$(nvme primary-ctrl-caps "/dev/$NVME" -o json)
VQFRT=$(jq .vqfrt <<<"$PRIMARY_CTRL_CAPS_JSON")
VQRFAP=$(jq .vqrfap <<<"$PRIMARY_CTRL_CAPS_JSON")
VQFRSM=$(jq .vqfrsm <<<"$PRIMARY_CTRL_CAPS_JSON")
VIFRT=$(jq .vifrt <<<"$PRIMARY_CTRL_CAPS_JSON")
VIRFAP=$(jq .virfap <<<"$PRIMARY_CTRL_CAPS_JSON")
VIFRSM=$(jq .vifrsm <<<"$PRIMARY_CTRL_CAPS_JSON")
# jq prints "null" for a missing key; arithmetic would quietly treat that as 0.
for cap in VQFRT VQRFAP VQFRSM VIFRT VIRFAP VIFRSM; do
    is_uint "${!cap}" || \
        die "ERROR: Unexpected $cap value in primary controller capabilities: '${!cap}'"
done

# "If a secondary controller has no VQ Resources assigned to it, then it
# remains in the Offline state. A secondary controller cannot transition to the
# Online state until it has VQ Resources for an Admin Queue and one or more I/O
# Queues assigned to it (i.e., the minimum number of VQ Resources that may be
# assigned is two)."
[ "$VQ" -ge 2 ] || die "ERROR: Minimum VQ count is 2"
[ "$VQ" -le "$VQFRSM" ] || die "ERROR: Maximum VQ count is $VQFRSM"
VQ_AVAIL=$(( VQFRT - VQRFAP ))
VQ_NEED=$(( VQ * NUM_VFS ))
[ "$VQ_NEED" -le "$VQ_AVAIL" ] || \
    die "ERROR: Not enough VQ resources: need $VQ_NEED, available $VQ_AVAIL"

# If a secondary controller that supports VI Resources has no VI Resources
# assigned to it, then it remains in the Offline state. A secondary controller
# cannot transition to the Online state until it has a VI Resource for
# interrupt vector 0 assigned to it. For a secondary controller that supports
# VI Resources with MSI-X vectors, if no VI Resources are assigned to it, then
# MSIXCAP.MXC.TS is reserved.
[ "$VI" -ge 1 ] || die "ERROR: Minimum VI count is 1"
[ "$VI" -le "$VIFRSM" ] || die "ERROR: Maximum VI count is $VIFRSM"
VI_AVAIL=$(( VIFRT - VIRFAP ))
VI_NEED=$(( VI * NUM_VFS ))
[ "$VI_NEED" -le "$VI_AVAIL" ] || \
    die "ERROR: Not enough VI resources: need $VI_NEED, available $VI_AVAIL"

# Pre-load the desired driver
if [ -n "$DRIVER" ]; then
    modprobe "$DRIVER"
fi

# Avoid autoprobe hang when controller is not yet online
echo 0 > "$PF_SYSFS/sriov_drivers_autoprobe"

CUR_VFS=$(cat "$PF_SYSFS/sriov_numvfs")
if [ "$CUR_VFS" != "$NUM_VFS" ]; then
    # The VF count is first dropped to zero and only then set to the new value.
    echo 0 > "$PF_SYSFS/sriov_numvfs"
    if [ "$NUM_VFS" != 0 ]; then
        echo "$NUM_VFS" > "$PF_SYSFS/sriov_numvfs"
    fi
fi

VFS=()
for (( i = 1; i <= NUM_VFS; i++ )); do
    VFN=$(jq --argjson idx "$(( i - 1 ))" \
        '.["secondary-controllers"][$idx]["virtual-function-number"]' \
        <<<"$SECONDARY_CTRL_LIST_JSON")
    # A missing entry yields "null", which arithmetic would evaluate as 0 and
    # thereby compute the PF's own address; the PF would then be unbound and
    # reset below. Refuse to continue instead.
    if ! is_uint "$VFN" || [ "$VFN" -lt 1 ]; then
        die "ERROR: No virtual function number for secondary controller entry $i (got '$VFN')"
    fi

    # VF address = PF device number advanced by VFN/8, function VFN%8.
    VF=$(printf '%s:%02x.%x' "$PF_BUS" \
        "$(( 0x$PF_DEV + VFN / 8 ))" "$(( VFN % 8 ))")
    [ -e "/sys/bus/pci/devices/$VF" ] || \
        die "ERROR: VF $VF (virtual function number $VFN) is not present in sysfs"

    if [ -e "/sys/bus/pci/devices/$VF/driver" ]; then
        echo "$VF" > "/sys/bus/pci/devices/$VF/driver/unbind"
    fi

    # NOTE(review): "-c $i" assumes the secondary controller identifiers are
    # 1..NUM_VFS in list order - confirm against the controller list output.

    # Per NVMe Specification, revision 1.3, 8.5.3, "To ensure that the host
    # accurately detects capabilities of the secondary controller, the host
    # should complete the following procedure to bring a secondary controller
    # Online:
    #
    # 1. Use the Virtualization Management command to set the secondary
    #    controller to the Offline state.
    quiet_nvme virt-mgmt "/dev/$NVME" -c "$i" -a 7
    # 2. Use the Virtualization Management command to assign VQ resources and VI
    #    resources.
    quiet_nvme virt-mgmt "/dev/$NVME" -c "$i" -r 0 -n "$VQ" -a 8
    quiet_nvme virt-mgmt "/dev/$NVME" -c "$i" -r 1 -n "$VI" -a 8
    # 3. Perform a Controller Level Reset. If the secondary controller is a VF,
    #    then this should be a VF Function Level Reset.
    echo 1 > "/sys/bus/pci/devices/$VF/reset"
    # 4. Use the Virtualization Management command to set the secondary
    #    controller to the Online state."
    quiet_nvme virt-mgmt "/dev/$NVME" -c "$i" -a 9

    if [ -n "$DRIVER" ]; then
        echo "$DRIVER" > "/sys/bus/pci/devices/$VF/driver_override"
    fi
    echo "VF $VF online"
    VFS+=("$VF")
done

# Enable autoprobe to allow explicit binding
echo 1 > "$PF_SYSFS/sriov_drivers_autoprobe"

if [ -n "$DRIVER" ]; then
    for VF in "${VFS[@]}"; do
        echo "$VF" > "/sys/bus/pci/drivers/$DRIVER/bind"
        echo "VF $VF bound to $DRIVER"
    done
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment