Skip to content

Instantly share code, notes, and snippets.

@tuna2134
Last active November 6, 2023 04:46
Show Gist options
  • Save tuna2134/d91d84a3264d41c90e8aa56b957f390c to your computer and use it in GitHub Desktop.
lke-init.sh
#!/bin/bash
# <UDF name="token" label="The kubeadm join token to use for cluster init">
# <UDF name="hostname" label="Hostname to use, should match linode label">
# <UDF name="endpoint" label="The kube-apiserver endpoint to use">
# <UDF name="wgapipeers" label="A space separated list of WG Public Keys and their IPs for the API server">
# <UDF name="hosts" label="The hosts to be added to /etc/hosts">
# Abort on the first failed command and trace every command (the trace is
# captured in /var/log/startup.log by the tee at the bottom of the script).
set -e
set -x
# Without pipefail the final `( ... ) | tee` pipeline would always report
# tee's exit status; with it a failure inside the subshell is propagated.
set -o pipefail
(
ARCH=amd64
# Make /mnt/disks and populate it with symlinks to custom block devices.
mkdir -p /mnt/disks
for device in /dev/sd[b-h]; do
  # Without nullglob an unmatched pattern is kept literally; skip it so we
  # never create a bogus '/mnt/disks/sd[b-h]' symlink on nodes with no
  # extra disks.
  [ -e "$device" ] || continue
  # -f keeps this idempotent if the script is ever re-run.
  ln -sf "$device" "/mnt/disks/$(basename "$device")"
done
# Mount BPF filesystem now and automatically on reboot
mount bpffs /sys/fs/bpf -t bpf
line='bpffs /sys/fs/bpf bpf defaults 0 0'
grep -qs "${line}" /etc/fstab || echo "${line}" >> /etc/fstab
# Set hostname (HOSTNAME is supplied by the StackScript UDF above)
hostnamectl set-hostname "$HOSTNAME" && hostname -F /etc/hostname
# "no_hosts" is the sentinel meaning "do not touch /etc/hosts"
if [[ $HOSTS != "no_hosts" ]]; then
  echo "$HOSTS" >> /etc/hosts
fi
# Write the containerd CRI configuration. The delimiter is quoted so the TOML
# below is copied verbatim (it contains no expansions today, but quoting
# guards against accidental ones). Ensure the target directory exists first.
mkdir -p /etc/containerd
cat << 'EOF' | sudo tee /etc/containerd/config.toml
disabled_plugins = []
imports = []
oom_score = 0
plugin_dir = ""
required_plugins = []
root = "/var/lib/containerd"
state = "/run/containerd"
version = 2
[cgroup]
path = ""
[debug]
address = ""
format = ""
gid = 0
level = ""
uid = 0
[grpc]
address = "/run/containerd/containerd.sock"
gid = 0
max_recv_message_size = 16777216
max_send_message_size = 16777216
tcp_address = ""
tcp_tls_cert = ""
tcp_tls_key = ""
uid = 0
[metrics]
address = ""
grpc_histogram = false
[plugins]
[plugins."io.containerd.gc.v1.scheduler"]
deletion_threshold = 0
mutation_threshold = 100
pause_threshold = 0.02
schedule_delay = "0s"
startup_delay = "100ms"
[plugins."io.containerd.grpc.v1.cri"]
disable_apparmor = false
disable_cgroup = false
disable_hugetlb_controller = true
disable_proc_mount = false
disable_tcp_service = true
enable_selinux = false
enable_tls_streaming = false
ignore_image_defined_volumes = false
max_concurrent_downloads = 3
max_container_log_line_size = 16384
netns_mounts_under_state_dir = false
restrict_oom_score_adj = false
sandbox_image = "registry.k8s.io/pause:3.5"
selinux_category_range = 1024
stats_collect_period = 10
stream_idle_timeout = "4h0m0s"
stream_server_address = "127.0.0.1"
stream_server_port = "0"
systemd_cgroup = false
tolerate_missing_hugetlb_controller = true
unset_seccomp_profile = ""
[plugins."io.containerd.grpc.v1.cri".cni]
bin_dir = "/opt/cni/bin"
conf_dir = "/etc/cni/net.d"
conf_template = ""
max_conf_num = 1
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "runc"
disable_snapshot_annotations = true
discard_unpacked_layers = false
no_pivot = false
snapshotter = "overlayfs"
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
base_runtime_spec = ""
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = ""
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
base_runtime_spec = ""
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
BinaryName = ""
CriuImagePath = ""
CriuPath = ""
CriuWorkPath = ""
IoGid = 0
IoUid = 0
NoNewKeyring = false
NoPivotRoot = false
Root = ""
ShimCgroup = ""
SystemdCgroup = false
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
base_runtime_spec = ""
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = ""
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options]
[plugins."io.containerd.grpc.v1.cri".image_decryption]
key_model = "node"
[plugins."io.containerd.grpc.v1.cri".registry]
config_path = ""
[plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
tls_cert_file = ""
tls_key_file = ""
[plugins."io.containerd.internal.v1.opt"]
path = "/opt/containerd"
[plugins."io.containerd.internal.v1.restart"]
interval = "10s"
[plugins."io.containerd.metadata.v1.bolt"]
content_sharing_policy = "shared"
[plugins."io.containerd.monitor.v1.cgroups"]
no_prometheus = false
[plugins."io.containerd.runtime.v1.linux"]
no_shim = false
runtime = "runc"
runtime_root = ""
shim = "containerd-shim"
shim_debug = false
[plugins."io.containerd.runtime.v2.task"]
platforms = ["linux/amd64"]
[plugins."io.containerd.service.v1.diff-service"]
default = ["walking"]
[plugins."io.containerd.snapshotter.v1.aufs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.btrfs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.devmapper"]
async_remove = false
base_image_size = ""
pool_name = ""
root_path = ""
[plugins."io.containerd.snapshotter.v1.native"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.overlayfs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.zfs"]
root_path = ""
[proxy_plugins]
[stream_processors]
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar"]
accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"]
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
path = "ctd-decoder"
returns = "application/vnd.oci.image.layer.v1.tar"
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"]
accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"]
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
path = "ctd-decoder"
returns = "application/vnd.oci.image.layer.v1.tar+gzip"
[timeouts]
"io.containerd.timeout.shim.cleanup" = "5s"
"io.containerd.timeout.shim.load" = "5s"
"io.containerd.timeout.shim.shutdown" = "3s"
"io.containerd.timeout.task.state" = "2s"
[ttrpc]
address = ""
gid = 0
uid = 0
EOF
systemctl daemon-reload
systemctl restart containerd
systemctl restart kubelet
# Render the kubeadm join configuration. The delimiter is deliberately
# unquoted so ${ENDPOINT}, ${TOKEN} and ${HOSTNAME} expand here.
cat > /etc/kubeadm-join-config.yaml <<END
apiVersion: kubeadm.k8s.io/v1beta3
caCertPath: /etc/kubernetes/pki/ca.crt
discovery:
  bootstrapToken:
    apiServerEndpoint: "${ENDPOINT}"
    token: "${TOKEN}"
    unsafeSkipCAVerification: true
  timeout: 15m0s
  tlsBootstrapToken: "${TOKEN}"
kind: JoinConfiguration
nodeRegistration:
  criSocket: unix:///run/containerd/containerd.sock
  name: "${HOSTNAME}"
  taints: null
  kubeletExtraArgs:
    cloud-provider: external
    pod-infra-container-image: "linode/pause:3.2"
END
# Join node to cluster
kubeadm join --ignore-preflight-errors=all --config=/etc/kubeadm-join-config.yaml
# Create LKE node boot script.
# IMPORTANT NOTE: We don't want any variable substitution done in this step.
# 'END_OF_FILE' is used since the string 'END' is used within the script.
# (The quoted delimiter makes the whole heredoc literal: every ${...} below
# belongs to the generated script, not to this one.)
cat << 'END_OF_FILE' | sudo tee /usr/local/bin/lke-node-onboot
#!/bin/bash
# Arguments:
#   $1 - this node's Kubernetes Node name
#   $2 - space separated "ip:pubkey" wireguard peer list for the API servers
NODENAME="${1}"
WGAPIPEERS="${2}"
# Prefix for all LKE-related labels and annotations.
ANNOTATION_PFX=lke.linode.com
# Globals used for wireguard-related annotations.
PRIVATEKEY=
PUBLICKEY=
N=
# These variables will be set once when we successfully get the Node object with a non-empty
# podCIDR.
NODE_JSON=
POD_CIDR=
# Gets a value from the NODE_JSON.
# ARGUMENTS:
# The key path (a python subscript expression, e.g. "['spec']['podCIDR']")
# RETURN:
# The value at the key path on stdout, or nothing if the path is missing
get_node_value() {
  local key_path="$1"
  # NB: the print statement must be indented to sit inside the 'with' block,
  # otherwise python raises an IndentationError. Missing keys raise KeyError,
  # which suppress() swallows so the function simply prints nothing.
  echo "$NODE_JSON" | python3 -c "$(cat <<END_OF_SCRIPT
import json, sys, contextlib
node=json.load(sys.stdin)
with contextlib.suppress(KeyError):
    print(node${key_path})
END_OF_SCRIPT
)"
}
# Derive the third octet (N) of this node's pod network.
# The pod network is expected to look like "10.2.N.0/24"; anything else
# produces no output.
# ARGUMENTS:
# None (reads the POD_CIDR global)
# RETURN:
# The octet value on stdout, or nothing when POD_CIDR has another shape
gimme_n() {
  awk -F. '($1==10 && $2==2 && $4=="0/24") {print $3}' <<<"$POD_CIDR"
}
# Retrieves the existing wireguard interface configuration which will be
# annotated on the node object.
# ARGUMENTS:
# $1 - (optional) wireguard config path; defaults to /etc/wireguard/wg0.conf
# RETURN:
# None (sets the PRIVATEKEY, PUBLICKEY and N globals)
get_previous_wg_config() {
  local conf="${1:-/etc/wireguard/wg0.conf}"
  # awk matches and extracts in one pass; no need for cat | grep | awk.
  PRIVATEKEY="$(awk '/PrivateKey/ {print $3}' "${conf}")"
  PUBLICKEY="$(echo "${PRIVATEKEY}" | wg pubkey)"
  # The Address line is "Address = 172.31.N.1"; splitting on '.' leaves N in
  # the third field.
  N="$(awk -F. '/Address/ {print $3}' "${conf}")"
}
# Configures the wireguard interface.
# ARGUMENTS:
# $1 - (optional) wireguard config path; defaults to /etc/wireguard/wg0.conf
# RETURN:
# None (sets the PRIVATEKEY, PUBLICKEY and N globals; writes the config)
configure_wireguard() {
  PRIVATEKEY="$(wg genkey)"
  PUBLICKEY="$(echo "${PRIVATEKEY}" | wg pubkey)"
  N="$(gimme_n)"
  local conf="${1:-/etc/wireguard/wg0.conf}"
  # WGAPIPEERS is a space separated list of "ip:pubkey" entries; read -a
  # splits it into an array without the useless-echo round trip.
  local peer_map
  read -r -a peer_map <<<"${WGAPIPEERS}"
  local apiserver_peers=''
  local peer peer_ip peer_pubkey
  for peer in "${peer_map[@]}"; do
    peer_ip=$(echo "$peer" | cut -d ":" -f 1)
    peer_pubkey=$(echo "$peer" | cut -d ":" -f 2)
    # Each entry is appended with a leading newline, matching the blank line
    # the heredoc-based accumulation produced.
    apiserver_peers="${apiserver_peers}
[Peer]
AllowedIPs = ${peer_ip}
PublicKey = ${peer_pubkey}"
  done
  mkdir -p "$(dirname "${conf}")"
  cat >"${conf}" <<END
[Interface]
PrivateKey = ${PRIVATEKEY}
Address = 172.31.${N}.1
SaveConfig = true
ListenPort = 51820
${apiserver_peers}
END
}
# Startup: kubelet does not join the cluster immediately. The script cannot run
# until the node has joined and has been allocated a pod CIDR, so poll the
# apiserver once a second, up to MAX_ATTEMPTS times.
echo "Waiting until the Node is available..."
MAX_ATTEMPTS=120
for (( attempt = 1; ; attempt++ )); do
  # Fetch the Node object; it may not exist yet.
  if NODE_JSON="$(kubectl --kubeconfig /etc/kubernetes/kubelet.conf get node "${NODENAME}" -ojson)"; then
    POD_CIDR="$(get_node_value "['spec']['podCIDR']")"
    if [ -n "$POD_CIDR" ]; then
      # The node's podCIDR is assigned; wireguard can be configured.
      echo "Node is available and was assigned pod CIDR: '$POD_CIDR'"
      break
    fi
    echo "Node has not yet been allocated a pod CIDR"
  else
    echo "Failed to get node ${NODENAME}"
  fi
  if (( attempt >= MAX_ATTEMPTS )); then
    # Something is wrong: the kube-apiserver is unavailable, the Node object
    # does not exist, or the KCM could not allocate a pod CIDR. This will be
    # retried on next boot.
    echo "Timed out waiting for the node to be available and allocated a pod CIDR"
    exit 1
  fi
  sleep 1
done
# Check for wireguard annotations; (re)configure and (re)annotate if either
# one is missing.
wgip_annotation="$(get_node_value "['metadata']['annotations']['lke.linode.com/wgip']")"
wgpub_annotation="$(get_node_value "['metadata']['annotations']['lke.linode.com/wgpub']")"
if [[ -z "${wgip_annotation}" || -z "${wgpub_annotation}" ]]; then
  # Check if the wireguard interface was already configured. Query the exact
  # device instead of grepping 'ip a' output, which would also match other
  # device names containing "wg0" and leak the matched line into the log.
  if ip link show wg0 >/dev/null 2>&1; then
    # The previous interface was configured, so we just need to re-annotate
    # the node object with the values that were used before.
    get_previous_wg_config
  else
    configure_wireguard
    wg-quick up wg0
    systemctl enable wg-quick@wg0
  fi
  wg show
  kubectl --kubeconfig /etc/kubernetes/kubelet.conf annotate --overwrite node "${NODENAME}" \
    "${ANNOTATION_PFX}/wgpub=${PUBLICKEY}" \
    "${ANNOTATION_PFX}/wgip=172.31.${N}.1"
else
  echo "\"${ANNOTATION_PFX}/wgpub\" and \"${ANNOTATION_PFX}/wgip\" annotations exist, skipping"
fi
# Ensure the node carries the pool-id label. The pool id is taken from the
# second dash-separated field of the node name (presumably
# "lke<cluster>-<pool>-<suffix>" — verify against the provisioner).
pool_id_label="$(get_node_value "['metadata']['labels']['lke.linode.com/pool-id']")"
if [[ -n "${pool_id_label}" ]]; then
  echo "\"${ANNOTATION_PFX}/pool-id\" label exists, skipping"
else
  pool_id=$(awk -F- '{print $2}' <<<"${NODENAME}")
  # Only apply the label when the extracted field is a positive integer.
  if [[ "${pool_id}" =~ ^[1-9][0-9]*$ ]]; then
    kubectl --kubeconfig /etc/kubernetes/kubelet.conf label --overwrite node/"${NODENAME}" \
      "${ANNOTATION_PFX}"/pool-id="${pool_id}"
  else
    echo "Refusing to label node/${NODENAME} with \"${ANNOTATION_PFX}/pool-id\" label: Value \"${pool_id}\" is not numeric"
  fi
fi
# Enable CPU/memory accounting for kubelet via a systemd drop-in.
# NOTE(review): assumes /etc/systemd/system/kubelet.service.d already exists
# (typically created by the kubeadm packaging) — confirm before reuse.
cat << EOF | sudo tee /etc/systemd/system/kubelet.service.d/11-cgroups.conf
[Service]
CPUAccounting=true
MemoryAccounting=true
EOF
systemctl daemon-reload
systemctl restart kubelet
echo "LKE node has initialized successfully"
END_OF_FILE
# Owner-executable only; the script is run by the systemd unit below as root.
chmod 744 /usr/local/bin/lke-node-onboot
# Create LKE node startup service. The delimiter is deliberately unquoted so
# ${HOSTNAME} and ${WGAPIPEERS} are baked into the unit at provision time.
cat << EOF | sudo tee /etc/systemd/system/lke-node-startup.service
[Unit]
After=kubelet.service
[Service]
ExecStart=/usr/local/bin/lke-node-onboot "${HOSTNAME}" "${WGAPIPEERS}"
[Install]
WantedBy=multi-user.target
EOF
chmod 644 /etc/systemd/system/lke-node-startup.service
# Disable rpcbind that is packaged with nfs-common: LKE-2586
# Guarded with '|| true': under 'set -e' a missing rpcbind unit would
# otherwise abort provisioning before the startup service is enabled.
systemctl stop rpcbind.service || true
systemctl disable rpcbind.service || true
systemctl daemon-reload
systemctl enable lke-node-startup
systemctl start lke-node-startup
echo done
) 2>&1 | tee /var/log/startup.log
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment