Instantly share code, notes, and snippets.
Last active
September 29, 2021 23:28
-
Star
(0)
0
You must be signed in to star a gist -
Fork
(0)
0
You must be signed in to fork a gist
-
Save keymon/11826b5f511469f70bb0d3a664c806f0 to your computer and use it in GitHub Desktop.
Replace SG in ELBs in EKS with a shared one without downtime
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# AWS sets a limit of how many rules per Security Group can be added. | |
# This limit dynamic, and depends on the hard limit of rules by ENI: 1000, and computed as: 1000 / max number SG per ENI | |
# | |
# In EKS this is a problem as each ELB gets one SG that is added to the nodes SG. For instance, | |
# if you want to have up to 3 SG per ENI, so our hard limit is 333 loadbalancers. | |
# | |
set -e -o pipefail -u | |
SCRIPT_NAME="$0" | |
# Context to process | |
CONTEXT="${CONTEXT:-$(kubectl config current-context)}" | |
# NAMESPACE to update LBs with | |
NAMESPACE_SELECTOR="${NAMESPACE_SELECTOR:-all}" | |
EXTERNAL_SECURITY_GROUP_NAME="shared-lb" | |
EXTERNAL_SECURITY_GROUP_DESCRIPTION="Shared SG for loadbalancers" | |
panic() { | |
echo "ERROR: $@" 1>&2 | |
exit 1 | |
} | |
# extract AWS info from the nodes | |
get_k8s_aws_info() { | |
local instanceid | |
local region | |
read region instanceid < <( | |
kubectl --context "${CONTEXT}" get nodes -o json | \ | |
jq -r ' | |
.items[0].spec.providerID | | |
capture("aws:///(?<region>.*)./(?<id>.*)") | | |
[.region, .id] | | |
join(" ") | |
' | |
) | |
[ -n "${region}" -a -n "${instanceid}" ] || panic "Failed to get region/instance from nodes" | |
local vpcid | |
local sgid | |
read vpcid sgid < <( | |
aws ec2 describe-instances --region "${region}" --instance-ids "${instanceid}" | \ | |
jq -r ' | |
.Reservations[0].Instances[0] | | |
[ | |
.VpcId, | |
(.SecurityGroups | map(select(.GroupName == "node"))[0] | .GroupId) | |
] | join(" ") | |
' | |
) | |
[ -n "${vpcid}" -a -n "${sgid}" ] || panic "Failed to get vpcid/nodes sg from nodes" | |
echo "${region} ${vpcid} ${sgid}" | |
} | |
# Find the group, return error if not fond | |
find_shared_security_group() { | |
local region="$1" | |
local vpcid="$2" | |
local group_name="${EXTERNAL_SECURITY_GROUP_NAME}" | |
aws ec2 describe-security-groups \ | |
--region "${region}" \ | |
--filters "Name=vpc-id,Values=${vpcid}" "Name=group-name,Values=${group_name}" | \ | |
jq -re .SecurityGroups[0].GroupId | |
} | |
ensure_shared_security_group() { | |
local region="$1" | |
local vpcid="$2" | |
local sgid | |
# exit if exists | |
if sgid="$(find_shared_security_group "${region}" "${vpcid}")"; then | |
echo "Found existing SG named ${EXTERNAL_SECURITY_GROUP_NAME}: $sgid" 1>&2 | |
echo "$sgid" | |
return | |
fi | |
echo "Creating new SG named ${EXTERNAL_SECURITY_GROUP_NAME}" 1>&2 | |
sgid="$( | |
aws ec2 create-security-group \ | |
--region "${region}" \ | |
--group-name "${EXTERNAL_SECURITY_GROUP_NAME}" \ | |
--description "${EXTERNAL_SECURITY_GROUP_DESCRIPTION}" \ | |
--vpc-id "${vpcid}" | \ | |
jq -r .GroupId | |
)" | |
# TODO if this fails, it won retry | |
aws ec2 authorize-security-group-ingress \ | |
--region "${region}" \ | |
--group-id "${sgid}" \ | |
--protocol -1 \ | |
--cidr 0.0.0.0/0 > /dev/null | |
echo "Creating new SG named ${EXTERNAL_SECURITY_GROUP_NAME}: $sgid" 1>&2 | |
echo "$sgid" | |
} | |
ensure_authorize_shared_sg_in_nodes_sg() { | |
local region="$1" | |
local vpcid="$2" | |
local nodes_sgid="$3" | |
local shared_sgid="$4" | |
# Skip if already presetnt | |
if aws ec2 describe-security-groups \ | |
--region "${region}" \ | |
--group-id "${nodes_sgid}" | \ | |
jq -e --arg source_sg "${shared_sgid}" ' | |
[.SecurityGroups[].IpPermissions[].UserIdGroupPairs[].GroupId] | | |
any(.==$source_sg) | |
' > /dev/null; then | |
echo "WARNING: Shared sg ${shared_sgid} is already authorized in nodes SG ${nodes_sgid}" 1>&2 | |
return | |
fi | |
echo "Authorize sg ${shared_sgid} in nodes SG ${nodes_sgid}" 1>&2 | |
aws ec2 authorize-security-group-ingress \ | |
--region "${region}" \ | |
--group-id "${nodes_sgid}" \ | |
--protocol -1 \ | |
--source-group "${shared_sgid}" | |
} | |
get_loadbalancer_services() { | |
local namespace="$1" | |
local namespace_selector | |
local loadbalancer_services | |
if [ "${namespace}" == "all" ]; then | |
namespace_selector="--all-namespaces" | |
else | |
namespace_selector="-n ${namespace}" | |
fi | |
kubectl --context "${CONTEXT}" get service ${namespace_selector} -o json | \ | |
jq -r ' | |
.items[] | | |
select(.spec.type == "LoadBalancer") | | |
"\(.metadata.namespace)/\(.metadata.name)" | |
' | |
} | |
get_elb_from_service() { | |
local ns="$1" | |
local name="$2" | |
kubectl --context "${CONTEXT}" get service -n "${ns}" "${name}" -o json | \ | |
jq -r ' | |
.status.loadBalancer.ingress[0].hostname | | |
capture("^(internal-)?(?<elbname>[^\\.]*)-.*\\.") |.elbname | |
' | |
} | |
print_load_balancers_sg() { | |
local namespace="$1" | |
local region="$2" | |
local loadbalancer_services | |
loadbalancer_services="$( | |
get_loadbalancer_services "${namespace}" | |
)" | |
echo "ELBs for NAMESPACEs $namespace:" 1>&2 | |
for lb in ${loadbalancer_services}; do | |
ns="${lb%/*}" | |
name="${lb#*/}" | |
elb_name="$(get_elb_from_service "${ns}" "${name}")" | |
if [ -z "${elb_name}" ]; then | |
echo "WARNING: cannot find ELB for $lb" 1>&2 | |
else | |
elb_sgids="$( | |
aws elb describe-load-balancers \ | |
--region "${region}" \ | |
--load-balancer-names "${elb_name}" | \ | |
jq -r '.LoadBalancerDescriptions[0].SecurityGroups | join(",")' | |
)" | |
echo "${lb}: ${elb_sgids}" 1>&2 | |
fi | |
done | |
} | |
annotate_load_balancers() { | |
local namespace="$1" | |
local shared_sgid="$2" | |
local loadbalancer_services | |
loadbalancer_services="$( | |
get_loadbalancer_services "${namespace}" | |
)" | |
echo "Annotating LB services of NAMESPACE ${namespace} to use SG ${shared_sgid}" 1>&2 | |
for lb in ${loadbalancer_services}; do | |
ns="${lb%/*}" | |
name="${lb#*/}" | |
if existing_annotation="$( | |
kubectl --context "${CONTEXT}" get service -n "${ns}" "${name}" -o json | \ | |
jq -e '.metadata.annotations."service.beta.kubernetes.io/aws-load-balancer-security-groups"' | |
)"; then | |
echo "WARNING: ${lb} already annotated service.beta.kubernetes.io/aws-load-balancer-security-groups=${existing_annotation}. Skipping" 1>&2 | |
continue | |
fi | |
kubectl --context "${CONTEXT}" annotate service -n "${ns}" "${name}" \ | |
"service.beta.kubernetes.io/aws-load-balancer-security-groups=${shared_sgid}" | |
done | |
} | |
get_elb_sg_in_vpc() { | |
local region="$1" | |
local vpcid="$2" | |
aws ec2 describe-security-groups \ | |
--region "${region}" \ | |
--filters Name=vpc-id,Values="${vpcid}" | \ | |
jq -r '.SecurityGroups[] | select(.GroupName | startswith("k8s-elb-")) | .GroupId' | |
} | |
is_sg_orphan() { | |
local region="$1" | |
local sgid="$2" | |
aws ec2 describe-network-interfaces \ | |
--region "${region}" \ | |
--filters Name=group-id,Values="${sgid}" | \ | |
jq -e '.NetworkInterfaces == []' > /dev/null | |
} | |
delete_orphan_sg() { | |
local region="$1" | |
local vpcid="$2" | |
local sg="$3" | |
local nodes_sgid="$4" | |
if is_sg_orphan "${region}" "${sg}" ; then | |
[ -n "${FORCE:-}" ] || read -p "$sg is orphan, delete?. (Ctrl+C to cancel)" | |
aws ec2 revoke-security-group-ingress --region "${region}" \ | |
--group-id "${nodes_sgid}" \ | |
--protocol -1 \ | |
--source-group "${sg}" > /dev/null || true | |
aws ec2 delete-security-group --region "${region}" --group-id "${sg}" > /dev/null | |
echo "Security group $sg deleted" 1>&2 | |
else | |
echo "${sg} is still in use: " 1>&2 | |
aws ec2 describe-network-interfaces \ | |
--filters Name=group-id,Values="${sg}" | \ | |
jq -r '"- " + .NetworkInterfaces[].Description' | |
fi | |
} | |
delete_orphan_elb_sg() { | |
local region="$1" | |
local vpcid="$2" | |
local nodes_sgid="$3" | |
sgids="$(get_elb_sg_in_vpc "${region}" "${vpcid}")" | |
for sg in ${sgids}; do | |
delete_orphan_sg "${region}" "${vpcid}" "${sg}" "${nodes_sgid}" | |
done | |
} | |
revert() { | |
local region="$1" | |
local vpcid="$2" | |
local nodes_sgid="$3" | |
local shared_sgid="$4" | |
local namespace="$5" | |
echo "Removing annotation of LB services of NAMESPACE ${namespace} to not use SG ${shared_sgid}" 1>&2 | |
local loadbalancer_services | |
loadbalancer_services="$( | |
get_loadbalancer_services "${namespace}" | |
)" | |
local removed=false | |
for lb in ${loadbalancer_services}; do | |
ns="${lb%/*}" | |
name="${lb#*/}" | |
if kubectl --context "${CONTEXT}" get service -n "${ns}" "${name}" -o json | \ | |
jq -e --arg shared_sgid "${shared_sgid}" \ | |
'.metadata.annotations."service.beta.kubernetes.io/aws-load-balancer-security-groups" == $shared_sgid' >/dev/null; then | |
kubectl --context "${CONTEXT}" annotate service -n "${ns}" "${name}" \ | |
"service.beta.kubernetes.io/aws-load-balancer-security-groups-" | |
removed=true | |
else | |
echo "${lb} is not annotated" 1>&2 | |
fi | |
done | |
if [ "${removed}" == "true" ]; then | |
echo "Waiting reconciliation" 1>&2 | |
sleep 30 | |
fi | |
delete_orphan_sg "${region}" "${vpcid}" "${shared_sgid}" "${nodes_sgid}" | |
} | |
####################################### | |
echo "Processing ${CONTEXT}..." | |
read region vpcid nodes_sgid < <(get_k8s_aws_info) | |
echo "Detected region=${region} vpc=${vpcid} nodes SG=${nodes_sgid}" 1>&2 | |
main() { | |
action="$1" | |
case "${action}" in | |
"setup_sg") | |
####################################### | |
# 1. Create and retrieve the new shared SG | |
shared_sgid="$( | |
ensure_shared_security_group "${region}" "${vpcid}" | |
)" | |
####################################### | |
# 2. Authorize that in the nodes | |
# | |
# This is also done by the controller, but after the LB SGs are updated | |
# so adding it in before adding the annotation would prevent any downtime | |
# | |
ensure_authorize_shared_sg_in_nodes_sg "${region}" "${vpcid}" "${nodes_sgid}" "${shared_sgid}" | |
;; | |
"annotate_lbs") | |
####################################### | |
# 3. Annotate the loadbalancers | |
# | |
if ! shared_sgid="$( | |
find_shared_security_group "${region}" "${vpcid}" | |
)"; then | |
echo "ERROR: shared SG not found" 1>&2 | |
exit 1 | |
fi | |
annotate_load_balancers "${NAMESPACE_SELECTOR}" "${shared_sgid}" | |
;; | |
"check_load_balancers_sg") | |
print_load_balancers_sg "${NAMESPACE_SELECTOR}" "${region}" | |
;; | |
"delete_orphan_sg") | |
####################################### | |
# 5. Clean up orphaned ELB security groups | |
delete_orphan_elb_sg "${region}" "${vpcid}" "${nodes_sgid}" | |
;; | |
"all") | |
main setup_sg | |
main check_load_balancers_sg | |
main annotate_lbs | |
echo "Waiting for reconciliation" 1>&2 | |
sleep 30 | |
main check_load_balancers_sg | |
main delete_orphan_sg | |
;; | |
"revert") | |
if ! shared_sgid="$( | |
find_shared_security_group "${region}" "${vpcid}" | |
)"; then | |
echo "ERROR: shared SG not found" 1>&2 | |
exit 1 | |
fi | |
revert "${region}" "${vpcid}" "${nodes_sgid}" "${shared_sgid}" "${NAMESPACE_SELECTOR}" | |
print_load_balancers_sg "${NAMESPACE_SELECTOR}" "${region}" | |
;; | |
*) | |
echo "Unknown action: ${action}" 1>&2 | |
exit 1 | |
;; | |
esac | |
} | |
main "${1:-all}" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment