Skip to content

Instantly share code, notes, and snippets.

@j0ju
Created July 16, 2020 12:15
Show Gist options
  • Save j0ju/631ef19c5fdd20dfaaf3bb7d6349a8d0 to your computer and use it in GitHub Desktop.
Save j0ju/631ef19c5fdd20dfaaf3bb7d6349a8d0 to your computer and use it in GitHub Desktop.
docker-service-companion.bash
#!/bin/bash
### BEGIN INIT INFO
# Provides: mesh-docker-service-companion
# Required-Start: $local_fs $network $remote_fs
# Required-Stop: $local_fs $network $remote_fs
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: mesh-docker-service-companion docker network attach sidecar helper
# Description: mesh-docker-service-companion docker network attach sidecar helper
### END INIT INFO
# Abort on failing commands, on unset variables and on failures inside pipelines.
set -euo pipefail
# Labels this daemon acts on for containers in network mode 'none'
# service_ips= # comma separated list of IPs for a dedicated peer to peer interface handed into the container
# bridge_member= # a bridge the interface should be added to
# bridge_ips= # an IP for the bridge interface
# service_interface= # an interface available on the host where we add service IPs via proxy ARP
#
# Labels this daemon acts on for containers in network mode 'host'
# service_interface= # an interface available in network mode == host, where service IPs can be bound to
# service_ips= # a comma separated list of IPs added to the service_interface, and later cleaned up on exit
# Examples
# docker run --rm -dti --network=none -l service_ips=1.1.1.1 debian:buster bash -
# docker run --rm -dti --network=host -l service_interface=foo -l service_ips=8.8.8.8,heise.de debian:buster bash -
# pidfile of the daemon itself, written and read via start-stop-daemon
PIDFILE=/var/run/docker-events.pid
# named pipe the output of "docker events" is written to
PIPE=/var/run/docker-events.pipe
# pidfile of the background "docker events" process feeding the pipe
PIPEPIDFILE=/var/run/docker-events.pipe.pid
# directory holding one "<container id>.interfaces" statefile per handled container
STATEDIR=/run/mesh-docker-service-companion
# IPv4 address put on the host end of the p2p link; the container routes via it
HOST_SVC_IP=169.254.123.123/32
# IPv6 addresses put on the host end and on the container end of the p2p link
HOST_SVC_IPv6=fe80::1/64
CONTAINER_IPv6=fe80::dead/64
# name prefixes for the generated veth interfaces (p2p link: V/v, bridge link: B/b)
HOST_SVC_PFX=V
CONTAINER_SVC_PFX=v
HOST_BRIDGE_PFX=B
CONTAINER_BRIDGE_PFX=b
# removes SHM and GRAPH mounts - this might prevent container restarts on certain graph drivers
#MOUNT_CLEANUP_REGEX=" [-/a-zA-Z0-9]+/docker/(zfs/graph/|overlay2|containers)[^ ]+"
# removes only SHM mounts
MOUNT_CLEANUP_REGEX=" [-/a-zA-Z0-9]+/docker/containers[^ ]+"
#######################################
# Create a veth pair between host and container, rename the container end to
# "p2p" and configure every address from the service_ips label on it.
# Hostnames in the label are resolved with getent.
# Globals read:
#   cid, cname, pid, statefile, label_service_ips, label_service_interface,
#   HOST_SVC_PFX, CONTAINER_SVC_PFX, HOST_SVC_IP, HOST_SVC_IPv6, CONTAINER_IPv6
# Outputs:
#   progress on stdout, warnings/errors on stderr; appends the host interface
#   (and "<hostIf> <ip>" lines for proxied addresses) to $statefile
# Returns:
#   0 also when the p2p interface already exists or no free name was found
#######################################
container_add_p2p_interface() { #
  local oIFS="$IFS"
  local host_pfx="$HOST_SVC_PFX"
  local hostIf= containerIf= item= ip= label=
  local V6_ready=no V4_ready=no

  # check if p2p interface already in place
  if nsenter -t "$pid" -n ip link show dev p2p > /dev/null 2>&1; then
    echo "I: container $cid: p2p interface already present, skipping"
    return 0
  fi

  # links that get proxied on an existing service interface use an "S" prefix
  if [ -n "$label_service_interface" ] && [ -d "/sys/class/net/$label_service_interface" ]; then
    host_pfx="S"
  fi

  # set interface names
  # set host side interface name: first candidate, cut to 15 bytes, that is not in use
  for hostIf in "${host_pfx}$cname" "${host_pfx}$cid" "$cid" "${$}${cid}"; do
    hostIf="$(echo "${hostIf}" | head -c 15)"
    if ! [ -d "/sys/class/net/$hostIf" ]; then
      break
    fi
    echo "W: $hostIf is already used, skipping" >&2
    hostIf=
  done
  if [ -z "$hostIf" ]; then
    echo "E: no host side service interface name found, skipping" >&2
    # we are outside of any loop here, so leave the function explicitly
    return 0
  fi

  # set container side interface name
  for containerIf in "${CONTAINER_SVC_PFX}$cname" "${CONTAINER_SVC_PFX}$cid" "$cid" "${$}${cid}"; do
    containerIf="$(echo "${containerIf}" | head -c 15)"
    if ! [ -d "/sys/class/net/$containerIf" ]; then
      break
    fi
    echo "W: $containerIf is already used, skipping" >&2
    containerIf=
  done
  if [ -z "$containerIf" ]; then
    echo "E: no container side service interface name found, skipping" >&2
    return 0
  fi

  # create interfaces, IPv6 stays disabled until an IPv6 service address shows up
  ip link add "$hostIf" type veth peer name "$containerIf"
  sysctl -q -w "net.ipv6.conf.$containerIf.disable_ipv6=1"
  sysctl -q -w "net.ipv6.conf.$hostIf.disable_ipv6=1"
  ip link set netns "$pid" dev "$containerIf"

  # setup container side
  nsenter -t "$pid" -n ip link set up name p2p dev "$containerIf"
  echo "$cname:p2p <-> $hostIf"

  # setup host side
  ip link set up dev "$hostIf"

  # collect the list of IPs and hostnames and resolve them, afterwards pipe them to the address setting loop
  # the comma splitting happens inside the pipeline subshell, so IFS of the caller stays untouched
  {
    IFS=,
    for item in $label_service_ips; do # collect and resolve - output per line: $ip $item
      IFS="$oIFS"
      case "$item" in
        "" )
          # we have nothing
          continue
          ;;
        *:* | [1-9]*.*.* )
          # assume we have an IPv6 or IPV4 address
          echo "$item $item"
          ;;
        * )
          # assume we have an hostname, try to resolve it
          getent ahosts "$item" | grep -Eo "^[^ ]+" | sort -u | sed -e "s/$/ $item/"
          ;;
      esac
    done
  } | while read -r ip label; do # set the addresses
    case "$ip" in
      *:* ) # assume we have an IPv6 address
        ip="$ip/128"
        if [ "$V6_ready" = no ]; then # enable v6, add IP to host and container
          sysctl -q -w "net.ipv6.conf.$hostIf.disable_ipv6=0"
          nsenter -t "$pid" -n sysctl -q -w net.ipv6.conf.p2p.disable_ipv6=0
          ip addr add "$HOST_SVC_IPv6" dev "$hostIf"
          nsenter -t "$pid" -n ip addr add "$CONTAINER_IPv6" dev p2p
          nsenter -t "$pid" -n ip route add default via "${HOST_SVC_IPv6%/*}" dev p2p metric 100
          V6_ready=yes
        fi
        nsenter -t "$pid" -n ip addr add "$ip" dev p2p
        # prevent asymmetric routing
        nsenter -t "$pid" -n ip route add default from "$ip" via "${HOST_SVC_IPv6%/*}" metric 100
        # add route from host to container
        ip route add "$ip" dev "$hostIf"
        ;;
      [1-9]*.*.* ) # assume we have an IPv4 address
        ip="$ip/32"
        if [ "$V4_ready" = no ]; then
          ip addr add "$HOST_SVC_IP" dev "$hostIf"
        fi
        nsenter -t "$pid" -n ip addr add "$ip" dev p2p
        ip route add "$ip" dev "$hostIf"
        if [ "$V4_ready" = no ]; then
          nsenter -t "$pid" -n ip route add "$HOST_SVC_IP" dev p2p
          # add a default route
          nsenter -t "$pid" -n ip route add default via "${HOST_SVC_IP%/*}" metric 100
          # prevent asymmetric routing via policy routing, as the FROM extension/source routing is not valid for ipv4
          nsenter -t "$pid" -n ip route add default via "${HOST_SVC_IP%/*}" metric 100 table 23
          V4_ready=yes
        fi
        # prevent asymmetric routing via policy routing, as the FROM extension is not valid for ipv4
        nsenter -t "$pid" -n ip rule add from "$ip" table 23 pref 23
        ;;
      * ) # this should not happen
        echo "E: '$ip' is not a IPv4/6 address, set via label '$label', abort" >&2
        break
        ;;
    esac
    # announce the address on the service interface via proxy ARP/NDP
    if [ -n "$label_service_interface" ] && [ -d "/sys/class/net/$label_service_interface" ]; then
      ip neigh add proxy "${ip%/*}" dev "$label_service_interface"
      echo "$hostIf $ip" >> "$statefile"
    fi
    echo "$cname:p2p added $ip"
  done

  echo "$cname:p2p $hostIf host service interface up"
  # write out state for later usage on stop or destroy
  echo "$hostIf" >> "$statefile"
} #
#######################################
# Create a veth pair, enslave the host end to the bridge named in the
# bridge_member label and hand the other end into the container as "eth0".
# Addresses come from the bridge_ips label: either "dhcp" (udhcpc is run in
# the container's network namespace) or a comma separated list of addresses,
# in which case the routes of the bridge device are copied as well.
# Globals read:
#   cid, cname, pid, statefile, label_bridge_member, label_bridge_ips,
#   label_bridge_checksum_offload, HOST_BRIDGE_PFX, CONTAINER_BRIDGE_PFX
# Outputs:
#   progress on stdout, warnings/errors on stderr; appends the host interface
#   name to $statefile
# Returns:
#   0 also when eth0 already exists or no free interface name was found
#######################################
container_add_eth0_interface() { #
  local oIFS="$IFS"
  local hostIf= containerIf= h_mac= c_mac= ip= pfx= via= gw=

  # check if bridge interface already in place
  if nsenter -t "$pid" -n ip link show dev eth0 > /dev/null 2>&1; then
    echo "I: container $cid: eth0 bridge interface already present, skipping"
    return 0
  fi

  # TODO: trap on failure and delete interface
  # set interface names
  # set host side interface name: first candidate, cut to 15 bytes, that is not in use
  for hostIf in "${HOST_BRIDGE_PFX}$cname" "${HOST_BRIDGE_PFX}$cid" "$cid" "${$}${cid}"; do
    hostIf="$(echo "${hostIf}" | head -c 15)"
    if ! [ -d "/sys/class/net/$hostIf" ]; then
      break
    fi
    echo "W: $hostIf is already used, skipping" >&2
    hostIf=
  done
  if [ -z "$hostIf" ]; then
    echo "E: no host side service interface name found, skipping" >&2
    # we are outside of any loop here, so leave the function explicitly
    return 0
  fi

  # set container side interface name
  for containerIf in "${CONTAINER_BRIDGE_PFX}$cname" "${CONTAINER_BRIDGE_PFX}$cid" "$cid" "${$}${cid}"; do
    containerIf="$(echo "${containerIf}" | head -c 15)"
    if ! [ -d "/sys/class/net/$containerIf" ]; then
      break
    fi
    echo "W: $containerIf is already used, skipping" >&2
    containerIf=
  done
  if [ -z "$containerIf" ]; then
    echo "E: no container side service interface name found, skipping" >&2
    return 0
  fi

  # create interfaces; both MAC addresses are derived from the md5 sum of the
  # container name, so they stay the same as long as the name does
  h_mac="fe:$(echo "$cname" | md5sum | sed -re 's/^(..)(..)(..)(..)(..).*/\1:\2:\3:\4:\5/')"
  c_mac="02:${h_mac##??:}"
  ip link add "$hostIf" type veth peer name "$containerIf"
  ip link set address "$h_mac" dev "$hostIf"
  ip link set address "$c_mac" dev "$containerIf"
  sysctl -q -w "net.ipv6.conf.$hostIf.disable_ipv6=1"
  ip link set netns "$pid" dev "$containerIf"

  # setup host side
  ip link set up master "$label_bridge_member" dev "$hostIf"
  echo "$cname:eth0 (mac: $c_mac) <-> $hostIf (mac: $h_mac, bridge: $label_bridge_member)"

  # setup container side
  nsenter -t "$pid" -n sysctl -q -w net.ipv6.conf.lo.disable_ipv6=0
  nsenter -t "$pid" -n sysctl -q -w "net.ipv6.conf.$containerIf.disable_ipv6=0"
  nsenter -t "$pid" -n ip link set up name eth0 dev "$containerIf"
  if [ "$label_bridge_checksum_offload" = off ]; then
    nsenter -t "$pid" -n ethtool -K eth0 rx off tx off
  fi

  if [ "$label_bridge_ips" = dhcp ]; then
    # this needs to set up addresses/prefixes and routes
    nsenter -t "$pid" -n udhcpc -i eth0 -q -x "hostname:$cname"
    echo "$cname:eth0 got DHCP"
    nsenter -t "$pid" -n ip addr show dev eth0 | sed -rne "/inet[6]?/ s/^[ ]+/$cname:eth0: / p"
  else
    # add addresses; the comma splitting is confined to the subshell
    (
      IFS=,
      for ip in $label_bridge_ips; do
        IFS="$oIFS"
        case "$ip" in
          "" | dhcp ) continue ;; # nothing to add for empty items or a stray "dhcp"
        esac
        nsenter -t "$pid" -n ip addr add "$ip"/32 dev eth0
        echo "$cname:eth0 added $ip"
      done
    )
    # set device routes first, the gateways of the next pass have to be reachable
    ip r list dev "$label_bridge_member" | awk '$1 ~ "^default|/[0-9]+$" {print}' | while read -r pfx via gw _; do
      if [ "$via" != via ]; then
        nsenter -t "$pid" -n ip route add "$pfx" dev eth0 metric 10
        echo "$cname:eth0 added device route $pfx"
      fi
    done
    # set routes with gateways
    ip r list dev "$label_bridge_member" | awk '$1 ~ "^default|/[0-9]+$" {print}' | while read -r pfx via gw _; do
      if [ "$via" = via ]; then
        nsenter -t "$pid" -n ip route add "$pfx" via "$gw" dev eth0 metric 10
        echo "$cname:eth0 added route $pfx via $gw"
      fi
    done
  fi

  echo "$cname:eth0 $hostIf host bridge interface up"
  # write out state for later usage on stop or destroy
  echo "$hostIf" >> "$statefile"
} #
container_add_service_ips() { #
if ! ip link show dev "$label_service_interface" > /dev/null 2>&1; then
echo "E: interface from service_interface label '$label_service_interface' not found" >&2
echo "W: not adding any IPs" >&2
return 0
fi
local ip=
local label=
local oIFS="$IFS"
# get CIDR of primary addresses on interface
# use them to add svc address to interface to avoid more specific imports
local cidr4="$( ip -4 a s dev "$label_service_interface" | awk -F"[ /]+" '$2 == "inet" {print $4; exit}' )"
local cidr6="$( ip -6 a s dev "$label_service_interface" | awk -F"[ /]+" '$2 == "inet6" {print $4; exit}' )"
# collect the list of IPs and hostnames and resolve them, afterwards pipe them to the address setting loop
IFS=,
for ip in $label_service_ips; do # collect and resolve
IFS="$oIFS"
case "$ip" in
"" )
# we have nothing
continue
;;
*:* | [1-9]*.*.* )
# assume we have an IPv6 or IPV4 address
echo "$ip $ip"
;;
* )
# assume we have an hostname, try to resolve it
getent ahosts "$ip" | grep -Eo "^[^ ]+" | sort -u | sed -e "s/$/ $ip/"
;;
esac
done | while IFS="$oIFS" read ip label; do # set the addresses
IFS="$oIFS"
case "$ip" in
*:* ) # assume we have an IPv6 address
cidr=128
cidr_if="$cidr6"
;;
[1-9]*.*.* ) # assume we have an IPv4 address
cidr=32
cidr_if="$cidr4"
;;
* ) # this should not happen
echo "E: '$ip' is not a IPv4/6 address, set via label '$label', abort" >&2
break
;;
esac
# we now have either an IPv4 or an IPv6 address, determine the correct cidr
# if it is not routed via a gateway, but via the specified $label_service_interface", then use the interfaces
# CIDR, otherwise use /32 or /128 depending on the protocol
if route="$(ip route get "$ip" | grep " dev $label_service_interface ")"; then
case "$route" in
*" via "* ) ip="$ip/$cidr" ;;
* ) ip="$ip/$cidr_if" ;;
esac
else
ip="$ip/$cidr"
fi
if ip route get ${ip%/*} | grep ^local > /dev/null || ip addr add "$ip" dev "$label_service_interface"; then
echo "I: $cname: added $ip to $label_service_interface"
echo "$label_service_interface $ip" >> "$statefile"
else
echo "E: could not add '$ip' ($label) to '$label_service_interface'" >&2
fi
done
} #
# Unmount container related mounts from the mount namespace of PID 1.
# Every match of MOUNT_CLEANUP_REGEX in /proc/1/mounts is treated as a mount
# point; the result of each umount is reported with the container name (cname).
# Failures are reported but never abort the caller.
host_container_mount_cleanup() { #
  local target
  while read target; do
    target="${target# }"
    if ! nsenter -t 1 -m umount "${target}"; then
      echo "E: $cname: error when umounting container mount '$target' from host" >&2
    else
      echo "$cname: umounted '${target}' from host"
    fi
  done < <(grep -oE "$MOUNT_CLEANUP_REGEX" /proc/1/mounts) || :
} #
#######################################
# Handle a started (or already running) container: read its labels and, for
# network mode 'none' or 'host', set up the requested interfaces/addresses.
# Arguments:
#   $1 - container id
# Globals written (read by the helper functions called from here):
#   cid, cname, pid, statefile, network_mode, label_service_ips,
#   label_service_interface, label_bridge_member, label_bridge_ips,
#   label_bridge_checksum_offload
# Globals read:
#   STATEDIR
# Returns:
#   0 without doing anything for other network modes
#######################################
container_start() { #
  cid="$1"
  local label_line label_key

  # ensure clean variables
  label_service_ips=             # comma separated list of IPs (p2p interface in mode none, service_interface in mode host)
  label_service_interface=       # host interface used for proxy ARP (mode none) or carrying the service IPs (mode host)
  label_bridge_member=           # a bridge the interface should be added to
  label_bridge_ips=              # an IP for the bridge interface, dhcp is an option
  label_bridge_checksum_offload= # "off" disables rx/tx checksum offload on eth0 (fixes checksum errors with non-KVM VMs)

  # only act if network mode is "none" or "host"
  network_mode="$( docker inspect --format='{{ range $key, $value := .NetworkSettings.Networks}}{{ $key }}{{ end}}' "$cid" )"
  case "$network_mode" in
    none | host ) ;; # we handle these types of network modes
    * ) return 0 ;;  # we do NOT
  esac

  # fetch labels from container, one "key=value" per line
  # label values are untrusted (they may come from the image), so they are
  # assigned verbatim and never passed through eval
  while IFS= read -r label_line; do
    case "$label_line" in
      *=* ) ;;
      * ) continue ;;
    esac
    label_key="${label_line%%=*}"
    case "$label_key" in
      service_interface | service_ips | bridge_ips | bridge_member | bridge_checksum_offload )
        printf -v "label_$label_key" '%s' "${label_line#*=}"
        ;;
    esac
  done < <(docker inspect -f '{{ range $k, $v := .Config.Labels -}}
{{ $k }}={{ $v }}
{{ end -}}' "$cid")

  # fetch container name
  cname="$(docker inspect --format='{{.Name}}' "$cid")"
  cname="${cname#/}"

  # get pid for nsenter
  pid="$(docker inspect --format='{{.State.Pid}}' "$cid")"

  # create/cleanup statefile, never write through a symlink
  statefile="$STATEDIR/$cid.interfaces"
  if [ -L "$statefile" ]; then
    rm -f "$statefile"
  fi
  [ -f "$statefile" ] || : > "$statefile"

  echo "$cname: container started (pid $pid, id $cid)"
  case "$network_mode" in
    none )
      if [ -n "$label_bridge_ips" ] && [ -n "$label_bridge_member" ]; then
        container_add_eth0_interface
      fi
      # setup direct link p2p service interface
      if [ -n "$label_service_ips" ]; then
        container_add_p2p_interface
      fi
      ;;
    host )
      if [ -n "$label_service_interface" ] && [ -n "$label_service_ips" ]; then
        container_add_service_ips
      fi
      ;;
  esac

  # cleanup container filesystems from main namespaces, if possible
  host_container_mount_cleanup
} #
#######################################
# Undo what was recorded for a container in its statefile and remove the file.
# Statefile lines are either "<interface>" (the link is deleted) or
# "<interface> <ip>/<prefix>" (proxy neighbour entries for the address and the
# address itself are removed). All removals are best effort.
# Arguments:
#   $1 - container id
# Globals read:
#   STATEDIR
# Returns:
#   0, also when no statefile exists
#######################################
container_die() { #
  local cid="$1"
  local statefile="$STATEDIR/$cid.interfaces"
  local hostIf= ip= addr= dev=

  [ -f "$statefile" ] || return 0

  while read -r hostIf ip; do
    if [ -z "$ip" ]; then
      ip link del "$hostIf" 2> /dev/null || :
    else
      # the statefile does not tell on which device the address was proxied,
      # so look it up: "ip neigh del" needs both "proxy" and the device
      { ip neigh show proxy 2> /dev/null || :; } | while read -r addr _ dev _; do
        if [ "$addr" = "${ip%/*}" ] && [ -n "$dev" ]; then
          ip neigh del proxy "$addr" dev "$dev" 2> /dev/null || :
        fi
      done || :
      ip addr del "$ip" dev "$hostIf" 2> /dev/null || :
    fi
  done < "$statefile"
  rm -f "$statefile"
} #
# Main loop of the service: handle all containers that are already running,
# then follow "docker events" and react on container start and die events.
# Reads STATEDIR, PIPE and PIPEPIDFILE, sets PIPEPID and replaces stdin of the
# shell by the event pipe. Returns when the event stream ends.
daemon() { #
# make sure the state directory exists, give up if the path is not a directory
mkdir -p "$STATEDIR" || if [ ! -d "$STATEDIR" ]; then
echo "E: '$STATEDIR' is not a directory, EXITING" >&2
exit 1
fi
# container:start coldplug
docker ps --filter status=running --format "{{.ID}}" | while read cid; do
echo "I: found running container $cid"
# we have the short Id: fetch long Id
cid="$( docker inspect "$cid" --format='{{ .Id }}' )"
# each container is handled in its own background job
container_start "$cid" &
done
# setup pipe to receive events from "docker events"
rm -f "$PIPE" "$PIPEPIDFILE"
mknod "$PIPE" p
# start the event producer in the background and remember its pid for cleanup()
"$(which docker)" events > "$PIPE" < /dev/null 2> /dev/null & PIPEPID=$!
echo "$PIPEPID" > "$PIPEPIDFILE"
# from here on stdin of this shell is the event pipe
exec < "$PIPE"
# proper cleanup of pipe attached child processes
trap "cleanup; exit" EXIT HUP INT QUIT TERM
# one event per line; only the first four fields are used: date, object type, event name, container id
while read date object event cid _; do
case "$object:$event" in # start events are handled in a detached background job, die events inline
container:start ) container_start "$cid" & ;;
container:die ) container_die "$cid" ;;
#* ) echo "I: $object:$event $cid" ;;
esac
done
} #
#######################################
# Start the daemon in the background via start-stop-daemon.
# A running instance is stopped first. When the script was called through a
# symlink outside of /etc/init.d, the link target is executed instead.
# Globals read:
#   PIDFILE
#######################################
start() { #
  # stopping fails when nothing is running; under "set -e" that must not
  # keep us from starting
  stop || :

  if [ -L "$0" ]; then
    case "$0" in
      /etc/init.d/* ) ;;
      * ) exec "$(readlink "$0")" start ;;
    esac
  fi

  /sbin/start-stop-daemon --start --exec "$0" \
    --quiet --background \
    --chdir /tmp \
    --pidfile "$PIDFILE" --make-pidfile \
    -- \
    daemon
} #
# Stop the background "docker events" process recorded in PIPEPIDFILE (if the
# file is readable) and remove the pidfile. Failures to stop are ignored.
cleanup() { #
  if [ -r "$PIPEPIDFILE" ]; then
    /sbin/start-stop-daemon --stop \
      --pidfile "$PIPEPIDFILE" \
      --quiet \
      --retry=TERM/5/KILL/1 || :
  fi
  rm -f "$PIPEPIDFILE"
} #
#######################################
# Stop the daemon recorded in PIDFILE (TERM, after 5s KILL) and afterwards the
# "docker events" helper process via cleanup().
# Stopping a daemon that is not running is not an error, so the helper
# cleanup is reached in that case as well.
# Globals read:
#   PIDFILE
#######################################
stop() { #
  /sbin/start-stop-daemon --stop \
    --oknodo \
    --pidfile "$PIDFILE" \
    --quiet \
    --retry=TERM/5/KILL/1
  cleanup
} #
# Restart the daemon, but only if it is currently running; otherwise the exit
# status of status() is passed on.
restart() { #
  status || return
  start
} #
# Reloading is the same as restarting.
reload() { #
  restart
} #
# Probe the daemon: signal 0 is sent to the process recorded in PIDFILE.
# The exit status is the one of start-stop-daemon.
status() { #
  local -a probe=( --stop --pidfile "$PIDFILE" --signal 0 )
  /sbin/start-stop-daemon "${probe[@]}"
} #
# Print the list of supported sub commands to stderr and exit with status 1.
usage() { #
  printf '\nusage: %s [start|stop|restart|reload|status|daemon]\n\n' "$0" >&2
  exit 1
} #
# Entry point: run the sub command given as first argument. A missing or
# unknown argument prints the usage text ("set -u" would otherwise abort on
# the unset $1 before usage is reached).
main() {
  case "${1:-}" in
    start | stop | restart | reload | status | daemon ) "$1" ;;
    * ) usage ;;
  esac
}
main "$@"
# vim: ft=sh sw=2 ts=2 et foldmethod=marker foldmarker={\ #,}\ #
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment