Last active
November 6, 2023 04:46
-
-
Save tuna2134/d91d84a3264d41c90e8aa56b957f390c to your computer and use it in GitHub Desktop.
lke-init.sh
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# <UDF name="token" label="The kubeadm join token to use for cluster init"> | |
# <UDF name="hostname" label="Hostname to use, should match linode label"> | |
# <UDF name="endpoint" label="The kube-apiserver endpoint to use"> | |
# <UDF name="wgapipeers" label="A space separated list of WG Public Keys and their IPs for the API server"> | |
# <UDF name="hosts" label="The hosts to be added to /etc/hosts"> | |
set -e | |
set -x | |
( | |
ARCH=amd64 | |
# Make /mnt/disks and populate it with custom devices | |
mkdir -p /mnt/disks | |
for device in /dev/sd[b-h]; do | |
ln -s "$device" "/mnt/disks/$(basename $device)" | |
done | |
# Mount BPF filesystem now and automatically on reboot | |
mount bpffs /sys/fs/bpf -t bpf | |
line='bpffs /sys/fs/bpf bpf defaults 0 0' | |
grep -qs "${line}" /etc/fstab || echo "${line}" >> /etc/fstab | |
# Set hostname | |
hostnamectl set-hostname "$HOSTNAME" && hostname -F /etc/hostname | |
if [[ $HOSTS != "no_hosts" ]]; then | |
echo "$HOSTS" >> /etc/hosts | |
fi | |
# Write the containerd configuration, then restart containerd and kubelet so
# they pick it up. tee also echoes the file to stdout, so the generated
# configuration ends up in the startup log.
# The directory is created first because tee fails when it is missing.
mkdir -p /etc/containerd
# Quoted delimiter: the TOML below is literal and must never be subject to
# shell expansion.
# NOTE(review): sandbox_image here (registry.k8s.io/pause:3.5) differs from the
# pod-infra-container-image passed to kubelet in the kubeadm join config
# (linode/pause:3.2) -- confirm this is intended.
cat << 'EOF' | sudo tee /etc/containerd/config.toml
disabled_plugins = []
imports = []
oom_score = 0
plugin_dir = ""
required_plugins = []
root = "/var/lib/containerd"
state = "/run/containerd"
version = 2
[cgroup]
path = ""
[debug]
address = ""
format = ""
gid = 0
level = ""
uid = 0
[grpc]
address = "/run/containerd/containerd.sock"
gid = 0
max_recv_message_size = 16777216
max_send_message_size = 16777216
tcp_address = ""
tcp_tls_cert = ""
tcp_tls_key = ""
uid = 0
[metrics]
address = ""
grpc_histogram = false
[plugins]
[plugins."io.containerd.gc.v1.scheduler"]
deletion_threshold = 0
mutation_threshold = 100
pause_threshold = 0.02
schedule_delay = "0s"
startup_delay = "100ms"
[plugins."io.containerd.grpc.v1.cri"]
disable_apparmor = false
disable_cgroup = false
disable_hugetlb_controller = true
disable_proc_mount = false
disable_tcp_service = true
enable_selinux = false
enable_tls_streaming = false
ignore_image_defined_volumes = false
max_concurrent_downloads = 3
max_container_log_line_size = 16384
netns_mounts_under_state_dir = false
restrict_oom_score_adj = false
sandbox_image = "registry.k8s.io/pause:3.5"
selinux_category_range = 1024
stats_collect_period = 10
stream_idle_timeout = "4h0m0s"
stream_server_address = "127.0.0.1"
stream_server_port = "0"
systemd_cgroup = false
tolerate_missing_hugetlb_controller = true
unset_seccomp_profile = ""
[plugins."io.containerd.grpc.v1.cri".cni]
bin_dir = "/opt/cni/bin"
conf_dir = "/etc/cni/net.d"
conf_template = ""
max_conf_num = 1
[plugins."io.containerd.grpc.v1.cri".containerd]
default_runtime_name = "runc"
disable_snapshot_annotations = true
discard_unpacked_layers = false
no_pivot = false
snapshotter = "overlayfs"
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime]
base_runtime_spec = ""
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = ""
[plugins."io.containerd.grpc.v1.cri".containerd.default_runtime.options]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes]
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc]
base_runtime_spec = ""
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = "io.containerd.runc.v2"
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options]
BinaryName = ""
CriuImagePath = ""
CriuPath = ""
CriuWorkPath = ""
IoGid = 0
IoUid = 0
NoNewKeyring = false
NoPivotRoot = false
Root = ""
ShimCgroup = ""
SystemdCgroup = false
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime]
base_runtime_spec = ""
container_annotations = []
pod_annotations = []
privileged_without_host_devices = false
runtime_engine = ""
runtime_root = ""
runtime_type = ""
[plugins."io.containerd.grpc.v1.cri".containerd.untrusted_workload_runtime.options]
[plugins."io.containerd.grpc.v1.cri".image_decryption]
key_model = "node"
[plugins."io.containerd.grpc.v1.cri".registry]
config_path = ""
[plugins."io.containerd.grpc.v1.cri".x509_key_pair_streaming]
tls_cert_file = ""
tls_key_file = ""
[plugins."io.containerd.internal.v1.opt"]
path = "/opt/containerd"
[plugins."io.containerd.internal.v1.restart"]
interval = "10s"
[plugins."io.containerd.metadata.v1.bolt"]
content_sharing_policy = "shared"
[plugins."io.containerd.monitor.v1.cgroups"]
no_prometheus = false
[plugins."io.containerd.runtime.v1.linux"]
no_shim = false
runtime = "runc"
runtime_root = ""
shim = "containerd-shim"
shim_debug = false
[plugins."io.containerd.runtime.v2.task"]
platforms = ["linux/amd64"]
[plugins."io.containerd.service.v1.diff-service"]
default = ["walking"]
[plugins."io.containerd.snapshotter.v1.aufs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.btrfs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.devmapper"]
async_remove = false
base_image_size = ""
pool_name = ""
root_path = ""
[plugins."io.containerd.snapshotter.v1.native"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.overlayfs"]
root_path = ""
[plugins."io.containerd.snapshotter.v1.zfs"]
root_path = ""
[proxy_plugins]
[stream_processors]
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar"]
accepts = ["application/vnd.oci.image.layer.v1.tar+encrypted"]
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
path = "ctd-decoder"
returns = "application/vnd.oci.image.layer.v1.tar"
[stream_processors."io.containerd.ocicrypt.decoder.v1.tar.gzip"]
accepts = ["application/vnd.oci.image.layer.v1.tar+gzip+encrypted"]
args = ["--decryption-keys-path", "/etc/containerd/ocicrypt/keys"]
env = ["OCICRYPT_KEYPROVIDER_CONFIG=/etc/containerd/ocicrypt/ocicrypt_keyprovider.conf"]
path = "ctd-decoder"
returns = "application/vnd.oci.image.layer.v1.tar+gzip"
[timeouts]
"io.containerd.timeout.shim.cleanup" = "5s"
"io.containerd.timeout.shim.load" = "5s"
"io.containerd.timeout.shim.shutdown" = "3s"
"io.containerd.timeout.task.state" = "2s"
[ttrpc]
address = ""
gid = 0
uid = 0
EOF
systemctl daemon-reload
systemctl restart containerd
systemctl restart kubelet
# Write the kubeadm JoinConfiguration and join the cluster with it.
# The file embeds the bootstrap token, so it is created with mode 600 before
# any content is written to it.
# The delimiter is unquoted on purpose: ENDPOINT, TOKEN and HOSTNAME must be
# expanded into the file.
# NOTE(review): unsafeSkipCAVerification and --ignore-preflight-errors=all both
# relax safety checks; they are kept as in the original -- confirm they are
# still required.
install -m 600 /dev/null /etc/kubeadm-join-config.yaml
cat > /etc/kubeadm-join-config.yaml <<END
apiVersion: kubeadm.k8s.io/v1beta3
caCertPath: /etc/kubernetes/pki/ca.crt
discovery:
  bootstrapToken:
    apiServerEndpoint: "${ENDPOINT}"
    token: "${TOKEN}"
    unsafeSkipCAVerification: true
  timeout: 15m0s
  tlsBootstrapToken: "${TOKEN}"
kind: JoinConfiguration
nodeRegistration:
  criSocket: unix:///run/containerd/containerd.sock
  name: "${HOSTNAME}"
  taints: null
  kubeletExtraArgs:
    cloud-provider: external
    pod-infra-container-image: "linode/pause:3.2"
END
# Join node to cluster
kubeadm join --ignore-preflight-errors=all --config=/etc/kubeadm-join-config.yaml
# Create LKE node boot script.
# IMPORTANT NOTE: We don't want any variable substitution done in this step.
# 'END_OF_FILE' is used since the string 'END' is used within the script.
cat << 'END_OF_FILE' | sudo tee /usr/local/bin/lke-node-onboot
#!/bin/bash
# Usage: lke-node-onboot NODENAME [WGAPIPEERS]
#   NODENAME    name of this node's Node object in the cluster
#   WGAPIPEERS  whitespace separated "ip:pubkey" wireguard peers (may be empty)

# Fail fast on a missing node name: "kubectl get node ''" would not return this
# node's object and the wait loop below would only time out.
NODENAME="${1:?usage: lke-node-onboot NODENAME [WGAPIPEERS]}"
WGAPIPEERS="${2:-}"
# Prefix for all LKE-related labels and annotations.
ANNOTATION_PFX=lke.linode.com
# Globals used for wireguard-related annotations.
PRIVATEKEY=
PUBLICKEY=
N=
# These variables will be set once when we successfully get the Node object with a non-empty
# podCIDR.
NODE_JSON=
POD_CIDR=
# Gets a value from the NODE_JSON.
# GLOBALS:
#   NODE_JSON (read): the JSON document to look into
# ARGUMENTS:
#   The key path, written as consecutive subscripts,
#   e.g. "['spec']['podCIDR']" (integer subscripts such as [0] also work)
# RETURN:
#   The value at the key path on stdout. Nothing is printed when any part of
#   the path is missing or when the value is JSON null.
get_node_value() {
  local key_path="$1"
  # The path is handed to Python as an argument and parsed there, so it is
  # never evaluated as Python source. The quoted delimiter keeps the program
  # literal.
  printf '%s\n' "$NODE_JSON" | python3 -c "$(cat <<'END_OF_SCRIPT'
import json, re, sys

value = json.load(sys.stdin)
for name, index in re.findall(r"\['([^']*)'\]|\[(\d+)\]", sys.argv[1]):
    try:
        value = value[int(index)] if index else value[name]
    except (KeyError, IndexError, TypeError):
        sys.exit(0)
if value is not None:
    print(value)
END_OF_SCRIPT
)" "$key_path"
}
# Returns the value for the third IP address octet for the node's pod network.
# It assumes that we have the following POD network format: "10.2.N.0/24"
# GLOBALS:
#   POD_CIDR (read)
# ARGUMENTS:
#   None
# RETURN:
#   The octet value on stdout and status 0; nothing on stdout and a non-zero
#   status when POD_CIDR does not have the expected format, so callers can
#   tell a mismatch apart from success.
gimme_n() {
  awk -F. '
    ($1==10 && $2==2 && $4=="0/24") { print $3; matched = 1 }
    END { exit !matched }
  ' <<<"$POD_CIDR"
}
# Reloads the wireguard settings of an already configured interface from
# /etc/wireguard/wg0.conf so they can be annotated on the node object again.
# GLOBALS:
#   PRIVATEKEY, PUBLICKEY, N (written)
# ARGUMENTS:
#   None
# RETURN:
#   None
get_previous_wg_config() {
  local wg_conf=/etc/wireguard/wg0.conf
  # "PrivateKey = <key>": the key is the third whitespace separated field.
  PRIVATEKEY="$(awk '/PrivateKey/ {print $3}' "${wg_conf}")"
  PUBLICKEY="$(wg pubkey <<<"${PRIVATEKEY}")"
  # "Address = 172.31.<N>.1": N is the third dot separated field.
  N="$(awk -F. '/Address/ {print $3}' "${wg_conf}")"
}
# Configures the wireguard interface: generates a key pair, derives the
# interface address from the pod CIDR and writes /etc/wireguard/wg0.conf with
# one [Peer] section per entry of WGAPIPEERS.
# GLOBALS:
#   WGAPIPEERS, POD_CIDR (read); PRIVATEKEY, PUBLICKEY, N (written)
# ARGUMENTS:
#   None
# RETURN:
#   0 on success; 1 when a key cannot be generated, when no address octet can
#   be derived from POD_CIDR, or when the configuration cannot be written
configure_wireguard() {
  local -a peers=()
  local peer peer_ip peer_pubkey
  local apiserver_peers=''

  PRIVATEKEY="$(wg genkey)" || return 1
  PUBLICKEY="$(printf '%s\n' "${PRIVATEKEY}" | wg pubkey)" || return 1
  N="$(gimme_n)"
  if [[ -z "${N}" ]]; then
    # Without N the Address line below would read "172.31..1".
    echo "Pod CIDR '${POD_CIDR}' does not look like 10.2.N.0/24; cannot pick a wireguard address" >&2
    return 1
  fi

  # Split the peer list on any whitespace without glob expansion. read returns
  # non-zero when it reaches the end of input, which is expected here.
  read -r -d '' -a peers <<<"${WGAPIPEERS}" || true
  for peer in "${peers[@]}"; do
    # Each entry is "<ip>:<pubkey>"; take the first and the second field.
    peer_ip="${peer%%:*}"
    peer_pubkey="${peer#*:}"
    peer_pubkey="${peer_pubkey%%:*}"
    printf -v apiserver_peers '%s\n[Peer]\nAllowedIPs = %s\nPublicKey = %s' \
      "${apiserver_peers}" "${peer_ip}" "${peer_pubkey}"
  done

  mkdir -p /etc/wireguard
  # The file holds the private key: create it unreadable for other users.
  (
    umask 077
    printf '%s\n' \
      '[Interface]' \
      "PrivateKey = ${PRIVATEKEY}" \
      "Address = 172.31.${N}.1" \
      'SaveConfig = true' \
      'ListenPort = 51820' \
      "${apiserver_peers}" \
      >/etc/wireguard/wg0.conf
  ) || return 1
  # umask only affects newly created files; tighten a pre-existing one too.
  chmod 600 /etc/wireguard/wg0.conf
}
# Startup: the node only shows up in the cluster some time after kubelet
# starts, and nothing below can run before it has a pod CIDR. Poll once per
# second, for at most MAX_ATTEMPTS attempts.
echo "Waiting until the Node is available..."
MAX_ATTEMPTS=120
for ((attempt = 1; ; attempt++)); do
  # Fetch the Node object; a failure here only means "not there yet".
  if NODE_JSON="$(kubectl --kubeconfig /etc/kubernetes/kubelet.conf get node "${NODENAME}" -ojson)"; then
    POD_CIDR="$(get_node_value "['spec']['podCIDR']")"
    if [[ -n "$POD_CIDR" ]]; then
      # The pod CIDR is assigned, so wireguard can be configured.
      echo "Node is available and was assigned pod CIDR: '$POD_CIDR'"
      break
    fi
    # The Node exists but has no pod CIDR yet; keep polling.
    echo "Node has not yet been allocated a pod CIDR"
  else
    echo "Failed to get node ${NODENAME}"
  fi
  if ((attempt == MAX_ATTEMPTS)); then
    # Out of attempts: the kube-apiserver is unavailable, the Node object does
    # not exist, or no pod CIDR was allocated. This is retried on next boot.
    echo "Timed out waiting for the node to be available and allocated a pod CIDR"
    exit 1
  fi
  # Retry in a second.
  sleep 1
done
# Check for wireguard annotations. When either one is missing, (re)establish
# the wireguard settings and annotate the node object with them.
wgip_annotation="$(get_node_value "['metadata']['annotations']['lke.linode.com/wgip']")"
wgpub_annotation="$(get_node_value "['metadata']['annotations']['lke.linode.com/wgpub']")"
if [[ -z "${wgip_annotation}" || -z "${wgpub_annotation}" ]]; then
  # Check if the wireguard interface was already configured. Ask for the
  # device itself rather than grepping the listing, which also matches any
  # name or line that merely contains "wg0".
  if ip link show wg0 >/dev/null 2>&1; then
    # The previous interface was configured, so we just need to re-annotate
    # the node object with the values that were used before.
    get_previous_wg_config
  else
    # Stop before annotating when the interface could not be set up: once the
    # annotations exist, later boots skip this whole branch and wireguard
    # would never be configured.
    if ! configure_wireguard || ! wg-quick up wg0; then
      echo "Failed to configure the wireguard interface wg0" >&2
      exit 1
    fi
    systemctl enable wg-quick@wg0
  fi
  # Never publish an empty key or a malformed address.
  if [[ -z "${PUBLICKEY}" || -z "${N}" ]]; then
    echo "Wireguard public key or address is unknown; refusing to annotate node ${NODENAME}" >&2
    exit 1
  fi
  wg show
  kubectl --kubeconfig /etc/kubernetes/kubelet.conf annotate --overwrite node "${NODENAME}" \
    "${ANNOTATION_PFX}/wgpub=${PUBLICKEY}" \
    "${ANNOTATION_PFX}/wgip=172.31.${N}.1"
else
  echo "\"${ANNOTATION_PFX}/wgpub\" and \"${ANNOTATION_PFX}/wgip\" annotations exist, skipping"
fi
# Make sure the node carries the pool-id label; the id is the second
# dash-separated field of the node name and is only applied when numeric.
pool_id_label="$(get_node_value "['metadata']['labels']['lke.linode.com/pool-id']")"
if [[ -n "${pool_id_label}" ]]; then
  echo "\"${ANNOTATION_PFX}/pool-id\" label exists, skipping"
else
  IFS=- read -r _ pool_id _ <<<"${NODENAME}"
  if [[ "${pool_id}" =~ ^[1-9][0-9]*$ ]]; then
    kubectl --kubeconfig /etc/kubernetes/kubelet.conf label --overwrite node/"${NODENAME}" \
      "${ANNOTATION_PFX}"/pool-id="${pool_id}"
  else
    echo "Refusing to label node/${NODENAME} with \"${ANNOTATION_PFX}/pool-id\" label: Value \"${pool_id}\" is not numeric"
  fi
fi
# Install a kubelet drop-in that turns on CPU and memory accounting, then
# restart kubelet so it takes effect.
# The drop-in directory is created first because tee fails when it is missing.
mkdir -p /etc/systemd/system/kubelet.service.d
# Quoted delimiter: the unit snippet is literal, nothing in it is expanded.
cat << 'EOF' | sudo tee /etc/systemd/system/kubelet.service.d/11-cgroups.conf
[Service]
CPUAccounting=true
MemoryAccounting=true
EOF
systemctl daemon-reload
systemctl restart kubelet
echo "LKE node has initialized successfully"
END_OF_FILE | |
chmod 744 /usr/local/bin/lke-node-onboot | |
# Create LKE node startup service | |
cat << EOF | sudo tee /etc/systemd/system/lke-node-startup.service | |
[Unit] | |
After=kubelet.service | |
[Service] | |
ExecStart=/usr/local/bin/lke-node-onboot "${HOSTNAME}" "${WGAPIPEERS}" | |
[Install] | |
WantedBy=multi-user.target | |
EOF | |
chmod 644 /etc/systemd/system/lke-node-startup.service | |
# Disable rpcbind that is packaged with nfs-common: LKE-2586 | |
systemctl stop rpcbind.service | |
systemctl disable rpcbind.service | |
systemctl daemon-reload | |
systemctl enable lke-node-startup | |
systemctl start lke-node-startup | |
echo done | |
) 2>&1 | tee /var/log/startup.log | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment