Last active
April 19, 2021 14:12
-
-
Save williamcaban/761ebb5b2dc6d9bcfa4978b83292ea01 to your computer and use it in GitHub Desktop.
Temporary workaround for RPS, IRQs and Queues
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
""" | |
Usage: python3 pp-rps-mc.py 1,2,52-53 > ran-rps-netqueues-fix.yaml ; oc apply -f ran-rps-netqueues-fix.yaml | |
Updated April 2 2021 -- patch to address issue cnf1360 | |
This script generates a MachineConfiguration OCP object that | |
can be applied to a 4.6 cluster and adds additional tuning | |
fixes related to networking. | |
This workarounds the following bugs: | |
- https://bugzilla.redhat.com/show_bug.cgi?id=1899321 | |
- https://bugzilla.redhat.com/show_bug.cgi?id=1887568 | |
""" | |
import base64 | |
import functools | |
import gzip | |
import shutil | |
import sys | |
LLHOOKSSH = """#!/usr/bin/env bash | |
mask="${1}" | |
[ -n "${mask}" ] || { logger "${0}: The rps-mask parameter is missing" ; exit 0; } | |
pid=$(jq '.pid' /dev/stdin 2>&1) | |
[[ $? -eq 0 && -n "${pid}" ]] || { logger "${0}: Failed to extract the pid: ${pid}"; exit 0; } | |
ns=$(ip netns identify "${pid}" 2>&1) | |
[[ $? -eq 0 && -n "${ns}" ]] || { logger "${0} Failed to identify the namespace: ${ns}"; exit 0; } | |
mode=$(ip netns exec "${ns}" [ -w /sys ] && echo "rw" || echo "ro" 2>&1) | |
[ $? -eq 0 ] || { logger "${0} Failed to determine if the /sys is writable: ${mode}"; exit 0; } | |
if [ "${mode}" = "ro" ]; then | |
res=$(ip netns exec "${ns}" mount -o remount,rw /sys 2>&1) | |
[ $? -eq 0 ] || { logger "${0}: Failed to remount /sys as rw: ${res}"; exit 0; } | |
fi | |
# /sys/class/net can't be used recursively to find the rps_cpus file, use /sys/devices instead | |
res=$(ip netns exec "${ns}" find /sys/devices -type f -name rps_cpus -exec sh -c "echo ${mask} | cat > {}" \; 2>&1) | |
[[ $? -eq 0 && -z "${res}" ]] || logger "${0}: Failed to apply the RPS mask: ${res}" | |
if [ "${mode}" = "ro" ]; then | |
ip netns exec "${ns}" mount -o remount,ro /sys | |
[ $? -eq 0 ] || exit 1 # Error out so the pod will not start with a writable /sys | |
fi | |
""" | |
RUNCWRPR = """#!/usr/bin/env bash | |
if [ -n "$3" ] && [ "$3" == "create" ] && [ -f "$5/config.json" ]; then | |
conf="$5/config.json" | |
cpus=$(jq -r '.linux.resources.cpu.cpus // empty' $conf) | |
if [ -n $cpus ]; then | |
tmp=$(mktemp) | |
jq '.linux.resources.cpu.cpus = "{mask}"' $conf > "$tmp" | |
mv "$tmp" $conf | |
fi | |
fi | |
/bin/runc "$@" | |
""" | |
CRIORNTM = """# We should copy paste the default runtime because this snippet will override the whole runtimes section | |
[crio.runtime.runtimes.runc] | |
runtime_path = "/usr/local/bin/runc-wrapper.sh" | |
runtime_type = "oci" | |
runtime_root = "/run/runc" | |
# The CRI-O will check the runtime handler name under the code and will activate high-performance features, | |
# like CPU load balancing. | |
# We should provide the runtime_path because we need to inform that we want to re-use runc binary and we | |
# do not have high-performance binary under the $PATH that will point to it. | |
[crio.runtime.runtimes.high-performance] | |
runtime_path = "/usr/local/bin/runc-wrapper.sh" | |
runtime_type = "oci" | |
runtime_root = "/run/runc" | |
""" | |
SETRPSMSK = """#!/usr/bin/env bash | |
dev=$1 | |
[ -n "${dev}" ] || { echo "The device argument is missing" >&2 ; exit 1; } | |
mask=$2 | |
[ -n "${mask}" ] || { echo "The mask argument is missing" >&2 ; exit 1; } | |
dev_dir="/sys/class/net/${dev}" | |
function find_dev_dir { | |
systemd_devs=$(systemctl list-units -t device | grep sys-subsystem-net-devices | cut -d' ' -f1) | |
for systemd_dev in ${systemd_devs}; do | |
dev_sysfs=$(systemctl show "${systemd_dev}" -p SysFSPath --value) | |
dev_orig_name="${dev_sysfs##*/}" | |
if [ "${dev_orig_name}" = "${dev}" ]; then | |
dev_name="${systemd_dev##*-}" | |
dev_name="${dev_name%%.device}" | |
if [ "${dev_name}" = "${dev}" ]; then # disregard the original device unit | |
continue | |
fi | |
echo "${dev} device was renamed to $dev_name" | |
dev_dir="/sys/class/net/${dev_name}" | |
break | |
fi | |
done | |
} | |
[ -d "${dev_dir}"/queues ] || find_dev_dir # the net device was renamed, find the new name | |
[ -d "${dev_dir}"/queues ] || { sleep 5; find_dev_dir; } # search failed, wait a little and try again | |
[ -d "${dev_dir}"/queues ] || { echo "${dev_dir}"/queues directory not found >&2 ; exit 0; } # the interface disappeared, not an error | |
find "${dev_dir}"/queues -type f -name rps_cpus -exec sh -c "echo ${mask} | cat > {}" \; | |
""" | |
JSONTPL = """{{ | |
"version": "1.0.0", | |
"hook": {{ | |
"path": "/usr/local/bin/low-latency-hooks.sh", | |
"args": ["low-latency-hooks.sh", "{mask}"] | |
}}, | |
"when": {{ | |
"always": true | |
}}, | |
"stages": ["prestart"] | |
}} | |
""" | |
TPL = """apiVersion: machineconfiguration.openshift.io/v1 | |
kind: MachineConfig | |
metadata: | |
labels: | |
machineconfiguration.openshift.io/role: ran | |
name: ran-rps-netqueues-fix | |
spec: | |
config: | |
ignition: | |
version: 3.1.0 | |
storage: | |
files: | |
- contents: | |
source: data:text/plain;charset=utf-8;base64,{llhooks} | |
verification: {{}} | |
filesystem: root | |
mode: 448 | |
path: /usr/local/bin/low-latency-hooks.sh | |
- contents: | |
source: data:text/plain;charset=utf-8;base64,{runcwrapper} | |
verification: {{}} | |
filesystem: root | |
mode: 448 | |
path: /usr/local/bin/runc-wrapper.sh | |
- contents: | |
source: data:text/plain;charset=utf-8;base64,{crioruntime} | |
verification: {{}} | |
filesystem: root | |
mode: 448 | |
path: /etc/crio/crio.conf.d/99-runtimes.conf | |
- contents: | |
source: data:text/plain;charset=utf-8;base64,{setrpsmask} | |
verification: {{}} | |
filesystem: root | |
mode: 448 | |
path: /usr/local/bin/set-rps-mask.sh | |
- contents: | |
source: data:text/plain;charset=utf-8;base64,{json} | |
verification: {{}} | |
filesystem: root | |
mode: 420 | |
path: /etc/containers/oci/hooks.d/99-low-latency-hooks.json | |
- contents: | |
source: data:text/plain;charset=utf-8;base64,U1VCU1lTVEVNPT0ibmV0IiwgQUNUSU9OPT0iYWRkIiwgVEFHKz0ic3lzdGVtZCIsIEVOVntTWVNURU1EX1dBTlRTfT0idXBkYXRlLXJwc0Alay5zZXJ2aWNlIgo= | |
verification: {{}} | |
filesystem: root | |
mode: 420 | |
path: /etc/udev/rules.d/99-netdev-rps.rules | |
- contents: | |
source: data:text/plain;charset=utf-8;base64,{queues} | |
verification: {{}} | |
filesystem: root | |
mode: 448 | |
path: /usr/local/bin/fix-net-queues.sh | |
systemd: | |
units: | |
- contents: | | |
[Unit] | |
Description=Sets network devices RPS mask | |
[Service] | |
Type=oneshot | |
ExecStart=/usr/local/bin/set-rps-mask.sh %i {mask} | |
name: [email protected] | |
- name: fix-net-queues.service | |
enabled: true | |
contents: | | |
[Unit] | |
Description=Limit the net queues to 2 per housekeeping cpu | |
Before=kubelet.service | |
After=network.service | |
[Service] | |
Type=oneshot | |
ExecStart=/usr/local/bin/fix-net-queues.sh | |
[Install] | |
WantedBy=multi-user.target | |
""" | |
QUEUES_SH = """#!/bin/bash | |
#!/bin/bash | |
queue_cnt=4 # 2x reserved cores (for RX/TX queues) | |
if [ ! -f /usr/local/bin/ethtool ]; then | |
# find ethtool in local containers (e.g. tuned container) | |
find_ethtool=$(find /var/lib/containers/storage/overlay -name "ethtool" | head -1) | |
cp $find_ethtool /usr/local/bin/ethtool | |
fi | |
for n in /sys/class/net/*; | |
do | |
syspth=$(realpath $n | grep -v '/sys/devices/virtual') | |
if [ x$syspth == "x" ]; then continue; fi | |
iface=$(basename $syspth) | |
echo "Configuring $iface to have $queue_cnt queues" | |
/usr/local/bin/ethtool -L $iface combined $queue_cnt | |
done | |
""" | |
def expand_cpus(acc, cpu): | |
if "-" in cpu: | |
first, last = cpu.split("-") | |
return acc + list(range(int(first), int(last) + 1)) | |
else: | |
return acc + [int(cpu)] | |
if __name__ == "__main__": | |
cpus = functools.reduce(expand_cpus, sys.argv[1].split(","), []) | |
cpumask = functools.reduce(lambda acc,x: acc + 2**x, cpus, 0) | |
hexmaskraw = hex(cpumask)[2:] #strip 0x | |
hexmasklist = [] | |
while hexmaskraw: | |
partmask = hexmaskraw[-8:] | |
partmask = "0" * (8 - len(partmask)) + partmask | |
hexmaskraw = hexmaskraw[:-8] | |
hexmasklist.insert(0, partmask) | |
hexmask = ",".join(hexmasklist) | |
hook_json = base64.b64encode(JSONTPL.format( | |
mask=hexmask).encode('UTF-8')).decode('UTF-8') | |
queues = base64.b64encode(QUEUES_SH.format( | |
cpucount=len(cpus)).encode('UTF-8')).decode('UTF-8') | |
llhooks = base64.b64encode(LLHOOKSSH.encode('UTF-8')).decode('UTF-8') | |
runcwrapper = base64.b64encode(RUNCWRPR.format( | |
mask=sys.argv[1]).encode('UTF-8')).decode('UTF-8') | |
crioruntime = base64.b64encode(CRIORNTM.encode('UTF-8')).decode('UTF-8') | |
setrpsmask = base64.b64encode(SETRPSMSK.encode('UTF-8')).decode('UTF-8') | |
mc_yaml = TPL.format(json=hook_json, queues=queues, mask=hexmask, llhooks=llhooks, | |
runcwrapper=runcwrapper, crioruntime=crioruntime, setrpsmask=setrpsmask ) | |
print(mc_yaml) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
apiVersion: tuned.openshift.io/v1 | |
kind: Tuned | |
metadata: | |
name: performance-tuned-patch-ran-du | |
namespace: openshift-cluster-node-tuning-operator | |
spec: | |
profile: | |
- data: | | |
[main] | |
summary=Configuration changes profile inherited from performance created tuned | |
include=openshift-node-performance-ran-du | |
[scheduler] | |
group.ksoftirqd=0:f:11:*:ksoftirqd.* | |
group.rcuc=0:f:11:*:rcuc.* | |
group.rcub=0:f:11:*:rcub.* | |
[sysctl] | |
# Only uncomment if DPDK zero latency required (may cause latency for some configurations) | |
net.core.busy_read=0 | |
net.core.busy_poll=0 | |
[service] | |
service.stalld=stop,disable | |
[sysfs] | |
## set CPU affinity for the kernel's write-back bdi-flush threads to the housekeeping core | |
/sys/devices/virtual/workqueue/nvme-wq/cpumask = ${not_isolated_cpumask} | |
/sys/devices/virtual/workqueue/nvme-delete-wq/cpumask = ${not_isolated_cpumask} | |
/sys/devices/virtual/workqueue/nvme-reset-wq/cpumask = ${not_isolated_cpumask} | |
/sys/bus/workqueue/devices/nvme-wq/cpumask = ${not_isolated_cpumask} | |
/sys/bus/workqueue/devices/nvme-delete-wq/cpumask = ${not_isolated_cpumask} | |
/sys/bus/workqueue/devices/nvme-reset-wq/cpumask = ${not_isolated_cpumask} | |
/sys/bus/workqueue/devices/writeback/cpumask = ${not_isolated_cpumask} | |
/sys/devices/virtual/workqueue/cpumask = ${not_isolated_cpumask} | |
/sys/devices/system/machinecheck/machinecheck*/ignore_ce = 1 | |
/sys/bus/workqueue/devices/ib-comp-unb-wq/cpumask = ${not_isolated_cpumask} | |
[bootloader] | |
cmdline_removeKernelArgs=-nohz=on | |
# NOTE: nohz_full=${isolated_cores} improves cyclictest results but may have negative impact on some DPDK low-latency applications | |
cmdline_additionalKernelArgs=+intel_idle.max_cstate=0 processor.max_cstate=1 nohz=off rcu_nocbs=all rcu_nocb_poll nowatchdog nmi_watchdog=0 nohz_full=${isolated_cores} | |
name: performance-tuned-patch-ran-du | |
recommend: | |
- machineConfigLabels: | |
machineconfiguration.openshift.io/role: "ran-du" | |
priority: 21 | |
profile: performance-tuned-patch-ran-du |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment