Created
July 16, 2020 12:15
-
-
Save j0ju/631ef19c5fdd20dfaaf3bb7d6349a8d0 to your computer and use it in GitHub Desktop.
docker-service-companion.bash
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
### BEGIN INIT INFO
# Provides: mesh-docker-service-companion
# Required-Start: $local_fs $network $remote_fs
# Required-Stop: $local_fs $network $remote_fs
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: mesh-docker-service-companion docker network attach sidecar helper
# Description: mesh-docker-service-companion docker network attach sidecar helper
### END INIT INFO

set -euo pipefail

# Labels this daemon acts on for containers in network mode 'none'
#   service_ips=              # comma separated list of IPs on a dedicated peer to peer interface handed into the container
#   bridge_member=            # a bridge the container's interface should be added to
#   bridge_ips=               # comma separated IPs for the bridge interface, or 'dhcp'
#   bridge_checksum_offload=  # 'off' disables rx/tx checksum offload on the container's eth0
#   service_interface=        # an interface available on the host where we add the service IPs via proxy ARP
#
# Labels this daemon acts on for containers in network mode 'host'
#   service_interface=        # an interface available in network mode == host, where service IPs can be bound to
#   service_ips=              # a comma separated list of IPs added to the service_interface, and later cleaned up on exit

# Examples
#   docker run --rm -dti --network=none -l service_ips=1.1.1.1 debian:buster bash -
#   docker run --rm -dti --network=host -l service_interface=foo -l service_ips=8.8.8.8,heise.de debian:buster bash -

# pid file of the daemon itself, and FIFO + pid file of the "docker events" child
PIDFILE=/var/run/docker-events.pid
PIPE=/var/run/docker-events.pipe
PIPEPIDFILE=/var/run/docker-events.pipe.pid

# one "<container id>.interfaces" state file per handled container lives here
STATEDIR=/run/mesh-docker-service-companion

# link addresses used on the host/container ends of the p2p service link
HOST_SVC_IP=169.254.123.123/32
HOST_SVC_IPv6=fe80::1/64
CONTAINER_IPv6=fe80::dead/64

# name prefixes of the generated veth interfaces
HOST_SVC_PFX=V
CONTAINER_SVC_PFX=v
HOST_BRIDGE_PFX=B
CONTAINER_BRIDGE_PFX=b

# removes SHM and GRAPH mounts - this might prevent container restarts on certain graph drivers
#MOUNT_CLEANUP_REGEX=" [-/a-zA-Z0-9]+/docker/(zfs/graph/|overlay2|containers)[^ ]+"
# removes only SHM mounts
MOUNT_CLEANUP_REGEX=" [-/a-zA-Z0-9]+/docker/containers[^ ]+"

#######################################
# Attach a dedicated veth link (named "p2p" inside the container) to a
# container and configure the addresses from the service_ips label on it.
# Globals read: pid, cid, cname, statefile, label_service_ips,
#               label_service_interface, HOST_SVC_PFX, CONTAINER_SVC_PFX,
#               HOST_SVC_IP, HOST_SVC_IPv6, CONTAINER_IPv6
# Arguments:    none
# Outputs:      progress on stdout, warnings/errors on stderr; appends the
#               host side interface (and "<if> <ip>" lines for proxied
#               addresses) to $statefile
# Returns:      0 on success or when p2p already exists,
#               1 when no free interface name could be found
#######################################
container_add_p2p_interface() { #
  local oIFS="$IFS"
  local host_pfx="$HOST_SVC_PFX"
  local candidate item ip label
  local hostIf= containerIf=
  local V6_ready=no V4_ready=no

  # check if p2p interface already in place
  if nsenter -t "$pid" -n ip link show dev p2p > /dev/null 2>&1; then
    echo "I: container $cid: p2p interface already present, skipping"
    return 0
  fi

  # links whose addresses get proxied on an existing service interface use
  # the host side prefix "S"; kept local so the global default stays intact
  if [ -n "$label_service_interface" ] && [ -d "/sys/class/net/$label_service_interface" ]; then
    host_pfx="S"
  fi

  # set host side interface name: first candidate (cut to 15 characters)
  # that does not exist yet
  for candidate in "${host_pfx}$cname" "${host_pfx}$cid" "$cid" "${$}${cid}"; do
    candidate="${candidate:0:15}"
    if ! [ -d "/sys/class/net/$candidate" ]; then
      hostIf="$candidate"
      break
    fi
    echo "W: $candidate is already used, skipping" >&2
  done
  if [ -z "$hostIf" ]; then
    echo "E: no host side service interface name found, skipping" >&2
    return 1
  fi

  # set container side interface name (checked in the host namespace, as the
  # peer is created there before it is moved into the container)
  for candidate in "${CONTAINER_SVC_PFX}$cname" "${CONTAINER_SVC_PFX}$cid" "$cid" "${$}${cid}"; do
    candidate="${candidate:0:15}"
    if ! [ -d "/sys/class/net/$candidate" ]; then
      containerIf="$candidate"
      break
    fi
    echo "W: $candidate is already used, skipping" >&2
  done
  if [ -z "$containerIf" ]; then
    echo "E: no container side service interface name found, skipping" >&2
    return 1
  fi

  # create interfaces, IPv6 stays disabled until an IPv6 address is requested
  ip link add "$hostIf" type veth peer name "$containerIf"
  sysctl -q -w "net.ipv6.conf.$containerIf.disable_ipv6=1"
  sysctl -q -w "net.ipv6.conf.$hostIf.disable_ipv6=1"
  ip link set netns "$pid" dev "$containerIf"

  # setup container side
  nsenter -t "$pid" -n ip link set up name p2p dev "$containerIf"
  echo "$cname:p2p <-> $hostIf"

  # setup host side
  ip link set up dev "$hostIf"

  # Collect the list of IPs and hostnames and resolve them (one "<ip> <item>"
  # per line), then pipe them to the address setting loop.
  # The comma IFS is set inside the pipeline stage so it cannot leak into the
  # calling shell.
  {
    IFS=,
    for item in $label_service_ips; do
      IFS="$oIFS"
      case "$item" in
        "")
          # we have nothing
          continue
          ;;
        *:* | [1-9]*.*.*)
          # assume we have an IPv6 or IPv4 address
          echo "$item $item"
          ;;
        *)
          # assume we have a hostname, try to resolve it
          getent ahosts "$item" | grep -Eo "^[^ ]+" | sort -u | sed -e "s/$/ $item/"
          ;;
      esac
    done
  } | while IFS="$oIFS" read -r ip label; do # set the addresses
    case "$ip" in
      *:*) # assume we have an IPv6 address
        ip="$ip/128"
        if [ "$V6_ready" = no ]; then # enable v6, add IP to host and container
          sysctl -q -w "net.ipv6.conf.$hostIf.disable_ipv6=0"
          nsenter -t "$pid" -n sysctl -q -w net.ipv6.conf.p2p.disable_ipv6=0
          ip addr add "$HOST_SVC_IPv6" dev "$hostIf"
          nsenter -t "$pid" -n ip addr add "$CONTAINER_IPv6" dev p2p
          nsenter -t "$pid" -n ip route add default via "${HOST_SVC_IPv6%/*}" dev p2p metric 100
          V6_ready=yes
        fi
        nsenter -t "$pid" -n ip addr add "$ip" dev p2p
        # prevent asymmetric routing
        nsenter -t "$pid" -n ip route add default from "$ip" via "${HOST_SVC_IPv6%/*}" metric 100
        # add route from host to container
        ip route add "$ip" dev "$hostIf"
        ;;
      [1-9]*.*.*) # assume we have an IPv4 address
        ip="$ip/32"
        if [ "$V4_ready" = no ]; then
          ip addr add "$HOST_SVC_IP" dev "$hostIf"
        fi
        nsenter -t "$pid" -n ip addr add "$ip" dev p2p
        ip route add "$ip" dev "$hostIf"
        if [ "$V4_ready" = no ]; then
          nsenter -t "$pid" -n ip route add "$HOST_SVC_IP" dev p2p
          # add a default route
          nsenter -t "$pid" -n ip route add default via "${HOST_SVC_IP%/*}" metric 100
          # same default in table 23, selected per source address by the rule below
          nsenter -t "$pid" -n ip route add default via "${HOST_SVC_IP%/*}" metric 100 table 23
          V4_ready=yes
        fi
        # prevent asymmetric routing via policy routing, as the FROM extension is not valid for ipv4
        nsenter -t "$pid" -n ip rule add from "$ip" table 23 pref 23
        ;;
      *) # this should not happen
        echo "E: '$ip' is not a IPv4/6 address, set via label '$label', abort" >&2
        break
        ;;
    esac
    if [ -n "$label_service_interface" ] && [ -d "/sys/class/net/$label_service_interface" ]; then
      ip neigh add proxy "${ip%/*}" dev "$label_service_interface"
      echo "$hostIf $ip" >> "$statefile"
    fi
    echo "$cname:p2p added $ip"
  done
  echo "$cname:p2p $hostIf host service interface up"

  # write out state for later usage on stop or destroy
  echo "$hostIf" >> "$statefile"
} #
container_add_eth0_interface() { # | |
# check if bridge interface already in place | |
if nsenter -t $pid -n ip link show dev eth0 > /dev/null 2>&1; then | |
echo "I: container $cid: eth0 bridge interface already present, skipping" | |
return 0 | |
fi | |
# TODO: trap on failure and delete interface | |
# set interface names | |
# set host side interface name | |
for hostIf in "${HOST_BRIDGE_PFX}$cname" "${HOST_BRIDGE_PFX}$cid" $cid ${$}${cid}; do | |
hostIf="$(echo "${hostIf}" | head -c 15)" | |
if ! [ -d "/sys/class/net/$hostIf" ]; then | |
break | |
fi | |
echo "W: $hostIf is already used, skipping" >&2 | |
hostIf= | |
done | |
if [ -z "$hostIf" ]; then | |
echo "E: no host side service interface name found, skipping" >&2 | |
break | |
fi | |
# set container side interface name | |
for containerIf in "${CONTAINER_BRIDGE_PFX}$cname" "${CONTAINER_BRIDGE_PFX}$cid" $cid ${$}${cid}; do | |
containerIf="$(echo "${containerIf}" | head -c 15)" | |
if ! [ -d "/sys/class/net/$containerIf" ]; then | |
break | |
fi | |
echo "W: $containerIf is already used, skipping" >&2 | |
containerIf= | |
done | |
if [ -z "$containerIf" ]; then | |
echo "E: no container side service interface name found, skipping" >&2 | |
break | |
fi | |
# create interfaces | |
h_mac="fe:$(echo $cname | md5sum | sed -re 's/^(..)(..)(..)(..)(..).*/\1:\2:\3:\4:\5/')" | |
c_mac="02:${h_mac##??:}" | |
ip link add "$hostIf" type veth peer name "$containerIf" | |
ip link set address "$h_mac" dev "$hostIf" | |
ip link set address "$c_mac" dev "$containerIf" | |
sysctl -q -w net.ipv6.conf.$hostIf.disable_ipv6=1 | |
ip link set netns "$pid" dev "$containerIf" | |
# setup host side | |
ip link set up master "$label_bridge_member" dev $hostIf | |
echo "$cname:eth0 (mac: $c_mac) <-> $hostIf (mac: $h_mac, bridge: $label_bridge_member)" | |
# setup container side | |
nsenter -t $pid -n sysctl -q -w net.ipv6.conf.lo.disable_ipv6=0 | |
nsenter -t $pid -n sysctl -q -w net.ipv6.conf.$containerIf.disable_ipv6=0 | |
nsenter -t $pid -n ip link set up name eth0 dev $containerIf | |
if [ "$label_bridge_checksum_offload" = off ]; then | |
nsenter -t $pid -n ethtool -K eth0 rx off tx off | |
fi | |
if [ "$label_bridge_ips" = dhcp ]; then | |
# this needs to set up addesses/prefixes and routes | |
nsenter -t $pid -n udhcpc -i eth0 -q -x "hostname:$cname" | |
echo "$cname:eth0 got DHCP" | |
nsenter -t $pid -n ip addr show dev eth0 | sed -rne "/inet[6]?/ s/^[ ]+/$cname:eth0: / p" | |
else | |
# add addresses | |
( IFS=, | |
for ip in $label_bridge_ips; do | |
[ "$ip" = dhcp ] && \ | |
continue || \ | |
nsenter -t $pid -n ip addr add "$ip"/32 dev eth0 | |
echo "$cname:eth0 added $ip" | |
done | |
) | |
# set device routes | |
ip r list dev $label_bridge_member | awk '$1 ~ "^default|/[0-9]+$" {print}' | while read pfx via gw _; do | |
if [ "$via" != via ]; then | |
nsenter -t $pid -n ip route add $pfx dev eth0 metric 10 | |
echo "$cname:eth0 added device route $pfx" | |
fi | |
done | |
# set routes with gateways | |
ip r list dev $label_bridge_member | awk '$1 ~ "^default|/[0-9]+$" {print}' | while read pfx via gw _; do | |
if [ "$via" = via ]; then | |
nsenter -t $pid -n ip route add $pfx via "$gw" dev eth0 metric 10 | |
echo "$cname:eth0 added route $pfx via $gw" | |
fi | |
done | |
fi | |
echo "$cname:eth0 $hostIf host bridge interface up" | |
# write out state for later usage on stop or destroy | |
echo "$hostIf" >> "$statefile" | |
} # | |
#######################################
# Add the addresses from the service_ips label to the host interface named by
# the service_interface label (used for containers in network mode 'host').
# Hostnames in the list are resolved with getent. An address that is reached
# directly through the interface gets the prefix length of the interface's
# first address of that family, anything else gets /32 or /128.
# Globals read: cname, statefile, label_service_interface, label_service_ips
# Arguments:    none
# Outputs:      progress on stdout, errors on stderr; appends
#               "<interface> <ip>/<prefix>" lines to $statefile
# Returns:      0 when the interface does not exist (nothing is added),
#               otherwise the status of the address setting loop
#######################################
container_add_service_ips() { #
  if ! ip link show dev "$label_service_interface" > /dev/null 2>&1; then
    echo "E: interface from service_interface label '$label_service_interface' not found" >&2
    echo "W: not adding any IPs" >&2
    return 0
  fi

  local ip= label= route cidr cidr_if
  local oIFS="$IFS"
  local cidr4 cidr6

  # get CIDR of primary addresses on interface
  # use them to add svc address to interface to avoid more specific imports
  # (a failing lookup must not abort the script, an empty value is handled below)
  cidr4="$(ip -4 a s dev "$label_service_interface" | awk -F"[ /]+" '$2 == "inet" {print $4; exit}')" || :
  cidr6="$(ip -6 a s dev "$label_service_interface" | awk -F"[ /]+" '$2 == "inet6" {print $4; exit}')" || :

  # Collect the list of IPs and hostnames and resolve them (one "<ip> <item>"
  # per line), then pipe them to the address setting loop.
  # The comma IFS is set inside the pipeline stage so it cannot leak into the
  # calling shell.
  {
    IFS=,
    for ip in $label_service_ips; do
      IFS="$oIFS"
      case "$ip" in
        "")
          # we have nothing
          continue
          ;;
        *:* | [1-9]*.*.*)
          # assume we have an IPv6 or IPv4 address
          echo "$ip $ip"
          ;;
        *)
          # assume we have a hostname, try to resolve it
          getent ahosts "$ip" | grep -Eo "^[^ ]+" | sort -u | sed -e "s/$/ $ip/"
          ;;
      esac
    done
  } | while IFS="$oIFS" read -r ip label; do # set the addresses
    case "$ip" in
      *:*) # assume we have an IPv6 address
        cidr=128
        cidr_if="$cidr6"
        ;;
      [1-9]*.*.*) # assume we have an IPv4 address
        cidr=32
        cidr_if="$cidr4"
        ;;
      *) # this should not happen
        echo "E: '$ip' is not a IPv4/6 address, set via label '$label', abort" >&2
        break
        ;;
    esac

    # we now have either an IPv4 or an IPv6 address, determine the correct cidr:
    # if it is not routed via a gateway, but directly via $label_service_interface,
    # use the interface's CIDR, otherwise use /32 or /128 depending on the protocol.
    # When the interface has no address of that family the interface CIDR is
    # empty; fall back to the host prefix instead of building "<ip>/".
    if route="$(ip route get "$ip" | grep " dev $label_service_interface ")"; then
      case "$route" in
        *" via "*) ip="$ip/$cidr" ;;
        *) ip="$ip/${cidr_if:-$cidr}" ;;
      esac
    else
      ip="$ip/$cidr"
    fi

    # an address that is already local counts as added
    if ip route get "${ip%/*}" | grep ^local > /dev/null || ip addr add "$ip" dev "$label_service_interface"; then
      echo "I: $cname: added $ip to $label_service_interface"
      echo "$label_service_interface $ip" >> "$statefile"
    else
      echo "E: could not add '$ip' ($label) to '$label_service_interface'" >&2
    fi
  done
} #
#######################################
# Unmount, in the mount namespace of PID 1, every mount listed in
# /proc/1/mounts that matches $MOUNT_CLEANUP_REGEX.
# Globals read: MOUNT_CLEANUP_REGEX, cname (for messages only)
# Arguments:    none
# Outputs:      one line per mount: success on stdout, failure on stderr
# Returns:      always 0 (best effort)
#######################################
host_container_mount_cleanup() { #
  {
    grep -oE "$MOUNT_CLEANUP_REGEX" /proc/1/mounts |
      while read mount; do
        # the regex match starts with a blank; drop it in case read kept it
        mount="${mount# }"
        if ! nsenter -t 1 -m umount "${mount}"; then
          echo "E: $cname: error when umounting container mount '$mount' from host" >&2
          continue
        fi
        echo "$cname: umounted '${mount}' from host"
      done
  } || :
} #
#######################################
# Handle a started container: read its labels and, depending on its network
# mode, attach the bridge link, the p2p service link or the host service IPs.
# Containers in any network mode other than 'none' or 'host' are ignored.
# Globals written (consumed by the container_add_* helpers): cid, cname, pid,
#   statefile, network_mode, label_service_interface, label_service_ips,
#   label_bridge_ips, label_bridge_member, label_bridge_checksum_offload
# Globals read:  STATEDIR
# Arguments:     $1 - container id
# Outputs:       progress on stdout
#######################################
container_start() { #
  local labels

  cid="$1"

  # ensure clean variables
  label_service_interface=       # interface the service IPs are bound to / proxied on
  label_service_ips=             # comma separated list of service IPs or hostnames
  label_bridge_member=           # a bridge the container interface should be added to
  label_bridge_ips=              # comma separated IPs for the bridge interface, dhcp is an option
  label_bridge_checksum_offload= # when using a bridge and local VMs not from KVM (eg VirtualBox, VMWare), we need this to fix checksum errors

  # only act if network mode is "none" or "host"
  network_mode="$(docker inspect --format='{{ range $key, $value := .NetworkSettings.Networks}}{{ $key }}{{ end}}' "$cid")"
  case "$network_mode" in
    none | host) ;; # we handle these types of network modes
    *) return 0 ;;  # we do NOT
  esac

  # Fetch the labels from the container, one value per line in a fixed order.
  # The values are only ever assigned, never evaluated: label text is chosen
  # by whoever starts the container and must not be able to run code here.
  labels="$(docker inspect --format='{{ index .Config.Labels "service_interface" }}
{{ index .Config.Labels "service_ips" }}
{{ index .Config.Labels "bridge_ips" }}
{{ index .Config.Labels "bridge_member" }}
{{ index .Config.Labels "bridge_checksum_offload" }}' "$cid")" || labels=
  {
    # read fails on the (stripped) trailing empty lines and leaves the
    # variable empty, which is exactly what we want
    IFS= read -r label_service_interface || :
    IFS= read -r label_service_ips || :
    IFS= read -r label_bridge_ips || :
    IFS= read -r label_bridge_member || :
    IFS= read -r label_bridge_checksum_offload || :
  } <<< "$labels"

  # fetch container name
  cname="$(docker inspect --format='{{.Name}}' "$cid")"
  cname="${cname#/}"

  # get pid for nsenter
  pid="$(docker inspect --format='{{.State.Pid}}' "$cid")"

  # create/cleanup statefile; a symlink at that place is removed, not followed
  statefile="$STATEDIR/$cid.interfaces"
  if [ -L "$statefile" ]; then
    rm -f "$statefile"
  fi
  [ -f "$statefile" ] || : > "$statefile"

  echo "$cname: container started (pid $pid, id $cid)"
  case "$network_mode" in
    none)
      if [ -n "$label_bridge_ips" ] && [ -n "$label_bridge_member" ]; then
        container_add_eth0_interface
      fi
      # setup direct link p2p service interface
      if [ -n "$label_service_ips" ]; then
        container_add_p2p_interface
      fi
      ;;
    host)
      if [ -n "$label_service_interface" ] && [ -n "$label_service_ips" ]; then
        container_add_service_ips
      fi
      ;;
  esac

  # cleanup container filesystems from main namespaces, if possible
  host_container_mount_cleanup
} #
#######################################
# Undo what was recorded for a container in its state file and remove the
# file. Each line is either "<interface>" (the link is deleted) or
# "<interface> <ip>/<prefix>" (the address is removed from the interface).
# All removals are best effort; errors are silenced.
# Globals read: STATEDIR
# Arguments:    $1 - container id
# Returns:      0
#######################################
container_die() { #
  local cid="$1"
  local statefile="$STATEDIR/$cid.interfaces"
  local hostIf ip

  [ -f "$statefile" ] || return 0

  while read -r hostIf ip; do
    [ -n "$hostIf" ] || continue
    if [ -z "$ip" ]; then
      ip link del "$hostIf" 2> /dev/null || :
    else
      # NOTE(review): called without "proxy"/"dev"; presumably meant to drop
      # the proxy neighbour entries added for p2p service IPs - verify
      while ip neigh del "${ip%/*}" 2> /dev/null; do :; done
      ip addr del "$ip" dev "$hostIf" 2> /dev/null || :
    fi
  done < "$statefile"
  rm -f "$statefile"
} #
#######################################
# Main loop of the daemon: handle the containers that are already running,
# then follow "docker events" through a FIFO and dispatch container start and
# die events. Runs until the event stream ends.
# Globals read:    STATEDIR, PIPE, PIPEPIDFILE
# Globals written: PIPEPID (pid of the "docker events" child)
# Arguments:       none
#######################################
daemon() { #
  local cid date object event docker_bin

  if ! mkdir -p "$STATEDIR" && [ ! -d "$STATEDIR" ]; then
    echo "E: '$STATEDIR' is not a directory, EXITING" >&2
    exit 1
  fi

  # container:start coldplug
  docker ps --filter status=running --format "{{.ID}}" | while read -r cid; do
    echo "I: found running container $cid"
    # we have the short Id: fetch long Id
    cid="$(docker inspect "$cid" --format='{{ .Id }}')"
    container_start "$cid" &
  done

  # setup pipe to receive events from "docker events"
  rm -f "$PIPE" "$PIPEPIDFILE"
  mknod "$PIPE" p
  # resolve the binary with the shell builtin; without docker we stop here
  # (set -e) instead of waiting forever on a FIFO nobody writes to
  docker_bin="$(command -v docker)"
  "$docker_bin" events > "$PIPE" < /dev/null 2> /dev/null &
  PIPEPID=$!
  echo "$PIPEPID" > "$PIPEPIDFILE"
  exec < "$PIPE"

  # proper cleanup of pipe attached child processes
  trap "cleanup; exit" EXIT HUP INT QUIT TERM

  while read -r date object event cid _; do
    case "$object:$event" in
      # start handling runs as a detached background job
      container:start) container_start "$cid" & ;;
      container:die) container_die "$cid" ;;
      #* ) echo "I: $object:$event $cid" ;;
    esac
  done
} #
#######################################
# Start the daemon in the background via start-stop-daemon, stopping a
# running instance first. When invoked through a symlink outside /etc/init.d
# the script re-executes itself through the link target.
# Globals read: PIDFILE
# Arguments:    none
#######################################
start() { #
  # stop reports failure when no daemon is running; that is the normal case
  # on a fresh start and must not abort the script under "set -e"
  stop || :

  if [ -L "$0" ]; then
    case "$0" in
      /etc/init.d/*) ;;
      *) exec "$(readlink "$0")" start ;;
    esac
  fi

  /sbin/start-stop-daemon --start --exec "$0" \
    --quiet --background \
    --chdir /tmp \
    --pidfile "$PIDFILE" --make-pidfile \
    -- \
    daemon
} #
#######################################
# Stop the "docker events" child recorded in $PIPEPIDFILE (if that file is
# readable) and remove the pid file. A failing stop is ignored.
# Globals read: PIPEPIDFILE
# Arguments:    none
#######################################
cleanup() { #
  if [ -r "$PIPEPIDFILE" ]; then
    /sbin/start-stop-daemon --stop \
      --pidfile "$PIPEPIDFILE" \
      --quiet \
      --retry=TERM/5/KILL/1 || :
  fi
  rm -f "$PIPEPIDFILE"
} #
#######################################
# Stop the daemon recorded in $PIDFILE, then clean up the "docker events"
# child. The cleanup runs even when no daemon was found, so a stale child or
# pid file does not survive.
# Globals read: PIDFILE
# Arguments:    none
# Returns:      exit status of the start-stop-daemon --stop call
#######################################
stop() { #
  local rc=0

  # capture the status instead of letting "set -e" abort before cleanup
  /sbin/start-stop-daemon --stop \
    --pidfile "$PIDFILE" \
    --quiet \
    --retry=TERM/5/KILL/1 || rc=$?

  cleanup
  return "$rc"
} #
#######################################
# Restart the daemon, but only when status reports it as running;
# otherwise hand back the status of the check.
#######################################
restart() { #
  status || return
  start
} #
#######################################
# There is no in-place reload; reloading is the same as restart.
#######################################
reload() { #
  restart
} #
#######################################
# Probe the process recorded in $PIDFILE by sending it signal 0 through
# start-stop-daemon; the exit status of that call is the result.
# Globals read: PIDFILE
#######################################
status() { #
  local -a probe=(--stop --pidfile "$PIDFILE" --signal 0)
  /sbin/start-stop-daemon "${probe[@]}"
} #
#######################################
# Print the usage text (framed by blank lines) to stderr and exit with 1.
#######################################
usage() { #
  printf '\nusage: %s [start|stop|restart|reload|status|daemon]\n\n' "$0" >&2
  exit 1
} #
#######################################
# Entry point: run the action named by the first argument, or print the
# usage text for a missing or unknown one.
# Arguments: $1 - one of start|stop|restart|reload|status|daemon
#######################################
main() { #
  # ${1:-} keeps "set -u" from aborting with "unbound variable" when the
  # script is called without any argument
  case "${1:-}" in
    start | stop | restart | reload | status | daemon) "$1" ;;
    *) usage ;;
  esac
} #

main "$@"

# vim: ft=sh sw=2 ts=2 et foldmethod=marker foldmarker={\ #,}\ #
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment