Last active
May 18, 2025 03:18
-
-
Save thimslugga/5238646ca6daac37784e54f5615a24ff to your computer and use it in GitHub Desktop.
Setup EC2 NAT Instance with Amazon Linux 2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# EC2 user-data script: prepares an Amazon Linux 2 instance to act as a NAT
# instance (package install, kernel/network tuning, nftables masquerading)
# and reboots at the end.
#
# References:
# https://github.com/1debit/alternat
# https://serverfault.com/questions/1137692/aws-nat-instance-setup
# https://www.redhat.com/en/blog/using-iptables-nft-hybrid-linux-firewall
# https://www.frozentux.net/iptables-tutorial/iptables-tutorial.html
# https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html/security_guide/sec-configuring_nat_using_nftables
# https://wiki.nftables.org/wiki-nftables/index.php/Performing_Network_Address_Translation_(NAT)
# https://home.regit.org/netfilter-en/nftables-quick-howto/
# https://unix.stackexchange.com/questions/283275/how-to-do-masquerading-with-nftables

# Send everything this script prints (stdout and stderr) to
# /var/log/user-data.log, to syslog under the "user-data" tag, and to the
# instance console.
exec > >(tee /var/log/user-data.log | logger -t user-data -s 2>/dev/console) 2>&1

# Retained from the original script; this section itself no longer defines
# any alias.
shopt -s expand_aliases
export AWS_PAGER=""

curl_cmd="curl --silent --fail"
imds_uri='http://169.254.169.254/latest'

# IMDSv2: request a session token with a 300 second TTL.
# shellcheck disable=SC2086 # curl_cmd is intentionally word-split
token=$($curl_cmd -X PUT "${imds_uri}/api/token" \
  -H "X-aws-ec2-metadata-token-ttl-seconds: 300")
if [[ -z "${token}" ]]; then
  # Best effort: keep going, but make the failure visible in the log.
  echo "WARN: unable to obtain an IMDSv2 token from ${imds_uri}" >&2
fi

#######################################
# Run curl with the IMDSv2 session token header attached.
# Globals:   curl_cmd (read), token (read)
# Arguments: passed through to curl unchanged (URL, extra options)
# Outputs:   curl's response body on stdout
# Returns:   curl's exit status
#######################################
curl_cmd_with_token() {
  # shellcheck disable=SC2086 # curl_cmd is intentionally word-split
  $curl_cmd -H "X-aws-ec2-metadata-token: ${token}" "$@"
}

iid_uri="${imds_uri}/meta-data/instance-id"
instance_id=$(curl_cmd_with_token "${iid_uri}")
# Update the installed packages, then install the 5.15 kernel from
# amazon-linux-extras.
sudo yum update -y
sudo amazon-linux-extras install -y kernel-5.15

# Packages installed for the tuning, firewall and diagnostics steps.
packages=(
  yum-utils
  bzip2
  zstd
  sysstat
  ethtool
  iproute-tc
  iptables-nft
  nftables
  ipset
  tuned
  tuna
  irqbalance
  conntrack-tools
  grubby
  perf
  vim
  jq
)
sudo yum install -y "${packages[@]}"
#yum install -y pcp pcp-system-tools pcp-zeroconf
# Add C-state related arguments to the command line of every installed kernel.
sudo grubby --update-kernel=ALL --args="intel_idle.max_cstate=0 processor.max_cstate=0"
# Enable cgroupsv2 and pressure stall info
#sudo grubby --update-kernel=ALL --args="systemd.unified_cgroup_hierarchy=1 swapaccount=1 psi=1"

# Enable and start irqbalance and TuneD (in that order).
for unit in irqbalance tuned; do
  sudo systemctl enable --now "${unit}"
done

# Switch TuneD to the network-throughput profile and print the active
# profile so it shows up in the user-data log.
sudo tuned-adm profile network-throughput
tuned-adm active
# Runtime (non-persistent) network tuning.

# Required to enable tcp bbr for congestion control
# Default is pfifo_fast on AL2
sudo sysctl -q -w net.core.default_qdisc=fq_codel
# Use Google TCP BBR
# Default is cubic on AL2
sudo /sbin/modprobe tcp_bbr
sudo sysctl -q -w net.ipv4.tcp_congestion_control=bbr
# More network tuning at runtime
sudo sysctl -q -w net.ipv4.ip_forward=1
sudo sysctl -q -w net.ipv4.ip_local_port_range="1024 65535"
sudo sysctl -q -w net.ipv4.tcp_slow_start_after_idle=0
# The kernel sends ICMP redirects on an interface when EITHER
# conf/all/send_redirects OR conf/<iface>/send_redirects is set, so the
# per-interface setting only takes effect once "all" is cleared too.
sudo sysctl -q -w net.ipv4.conf.all.send_redirects=0
sudo sysctl -q -w net.ipv4.conf.eth0.send_redirects=0
#######################################
# Map total system memory to the vm.min_free_kbytes value used by this script.
# Arguments: $1 - total memory in kilobytes (MemTotal from /proc/meminfo)
# Outputs:   the chosen vm.min_free_kbytes value on stdout
# Returns:   0 always; a missing or non-numeric argument yields the smallest
#            tier (with a warning on stderr) so an unreadable MemTotal can
#            never reserve 1 GiB on a small instance.
#######################################
_min_free_kbytes_for() {
  local mem_kb=${1:-}

  if [[ ! "${mem_kb}" =~ ^[0-9]+$ ]]; then
    echo "WARN: MemTotal '${mem_kb}' is not a number; using the smallest tier" >&2
    echo 12800
    return 0
  fi
  # Force base 10 so a value with leading zeros is not parsed as octal.
  mem_kb=$((10#${mem_kb}))

  if (( mem_kb < 524288 )); then
    echo 12800      # less than 512MB
  elif (( mem_kb < 1048576 )); then
    echo 64000      # less than 1GB
  elif (( mem_kb < 2097152 )); then
    echo 128000     # less than 2GB
  elif (( mem_kb < 4194304 )); then
    echo 256000     # less than 4GB
  elif (( mem_kb < 8388608 )); then
    echo 512000     # less than 8GB
  else
    echo 1048576    # 8GB or more
  fi
}

# Get the total amount of memory in kilobytes from /proc/meminfo
memtotal=$(awk '/^MemTotal:/ {print $2; exit}' /proc/meminfo)

# Pick vm.min_free_kbytes for this instance and log the choice.
vm_min_free_kbytes=$(_min_free_kbytes_for "${memtotal}")
echo "vm.min_free_kbytes=${vm_min_free_kbytes}"
# Persist the kernel and network tuning in a sysctl.d drop-in, then load it.
# The heredoc delimiter is unquoted on purpose so ${vm_min_free_kbytes} is
# expanded by the shell. Values are written WITHOUT surrounding quotes:
# sysctl hands everything after '=' to the kernel verbatim, so a quoted
# number or port range would be rejected.
sudo tee /etc/sysctl.d/99-ec2-nat-instance.conf <<EOF
# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/enhanced-networking-ena.html
# https://github.com/amzn/amzn-drivers/blob/master/kernel/linux/ena/ENA_Linux_Best_Practices.rst
# https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ena-express.html
# https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/9/html/monitoring_and_managing_system_status_and_performance/tuning-the-network-performance_monitoring-and-managing-system-status-and-performance
# Minimize console logging level for kernel printk messages
# 4 4 1 7 works too
kernel.printk=3 4 1 7
# By default, the kernel.panic tunable is set to 0 and so the host
# does not reboot automatically if the kernel panics.
#
# To ensure that the node reboots automatically after it panics,
# this tunable must be set to a non zero value.
kernel.panic=10
# Ensure that your reserved kernel memory is sufficient to sustain a
# high rate of packet buffer allocations (the default value may be too small).
#
# As a rule of thumb, you should set this value to between 1-3% of available
# system memory, and adjust this value up or down to meet the needs of your
# application.
vm.min_free_kbytes=${vm_min_free_kbytes}
vm.swappiness=10
# Suppress logging of net_ratelimit callback
net.core.message_cost=0
# Enable BPF JIT compiler
net.core.bpf_jit_enable=1
# https://access.redhat.com/solutions/1241943
# https://access.redhat.com/solutions/30453
net.core.netdev_max_backlog=1000
net.core.somaxconn=4096
#net.core.netdev_max_backlog=2000
#net.core.somaxconn=8192
#net.core.netdev_budget=600
#net.core.netdev_budget_usecs=4000
# fq can be used as a drop in replacement for pfifo_fast.
# http://man7.org/linux/man-pages/man8/tc-fq.8.html
# https://www.bufferbloat.net/projects/codel/wiki/
#
# * fq is a better choice for end hosts because of it supports tcp pacing,
# which is a requirement for the bbr congestion control algorithm.
# * fq is best for fat servers with tcp-heavy workloads and particularly at
# 10GigE speeds or above
#
# * fq-codel is a better choice for forwarding/routers which don't originate
# local traffic, hypervisors and best general purpose qdisc.
#
# Note: For queue management, sch_fq was recommended instead of fq_codel as of linux 3.12.
# Note: Required to enable BBR for congestion control
# Note: Default is pfifo_fast on AL2
net.core.default_qdisc=fq_codel
# TCP BBR
# This is not an official Google product LOL
# https://github.com/google/bbr/blob/master/README
# Note: BBR will support fq_codel after linux-4.13.
# Note: BBR must be used with fq qdisc with pacing enabled, since pacing is integral to the BBR design
# and implementation. BBR without pacing would not function properly and may incur unnecessary
# high packet loss rates.
# Note: Default is cubic on AL2
net.ipv4.tcp_congestion_control=bbr
# Negotiate TCP ECN for active and passive connections
#
# Turn on ECN as this will let AQM sort out the congestion backpressure without
# incurring packet losses and retransmissions.
#
# In order to make best used of this we really need ECN-enablement
# sysctl net.ipv4.tcp_ecn on end-hosts.
#
# https://github.com/systemd/systemd/pull/9143
# https://github.com/systemd/systemd/issues/9748
#net.ipv4.tcp_ecn=1
net.ipv4.tcp_ecn=2
net.ipv4.tcp_ecn_fallback=1
# Turn on MultiPath TCP
net.mptcp.enabled=1
# Turn on tcp window scaling
net.ipv4.tcp_window_scaling=1
# Recommended to enable this for hosts with jumbo frames
# i.e. mtu 9000+ enabled
net.ipv4.tcp_mtu_probing=1
# https://blog.cloudflare.com/optimizing-the-linux-stack-for-mobile-web-per/
# https://access.redhat.com/solutions/168483
# Avoid falling back to slow start after a connection goes idle
# keeps our cwnd large with the keep alive connections (kernel > 3.6)
net.ipv4.tcp_slow_start_after_idle=0
# Bump the local port range
net.ipv4.ip_local_port_range=1024 65535
# Turn on IPv4 forwarding
net.ipv4.ip_forward=1
#net.ipv4.conf.all.forwarding=1
#net.ipv4.conf.default.forwarding=1
# Turn on IPv6 forwading
#net.ipv6.conf.all.forwarding=1
#net.ipv6.conf.default.forwarding=1
# DO NOT send redirects as this is not our job
# Redirects are sent when either conf/all or conf/<iface> is set, so clear both
net.ipv4.conf.all.send_redirects=0
net.ipv4.conf.eth0.send_redirects=0
EOF
sudo sysctl --system
# udev rules
# Install a rule that runs ethtool to set the rx/tx ring sizes whenever a
# matching ENA network interface is added or changed. The heredoc delimiter
# is quoted so the rule text (including %k) is written literally.
sudo tee /etc/udev/rules.d/51-ec2-net-tuning.rules <<'EOR'
# Increase the rx and tx ring buffer size, use ethtool -g eth0 to verify
# https://serverfault.com/a/975746
SUBSYSTEM=="net", ACTION=="add|change", KERNEL=="eth*|en*", DRIVERS=="ena", RUN+="/usr/sbin/ethtool -G %k rx 4096 tx 1024"
EOR
sudo systemctl daemon-reload
#sudo systemctl restart systemd-udevd
#sudo udevadm test /sys/class/net/eth0
# Reload the rules and, only if that succeeded, re-trigger the net devices so
# the new rule is applied to interfaces that already exist.
if sudo udevadm control --reload-rules; then
  sudo udevadm trigger --attr-match=subsystem=net
fi
# nftables
# Load the nftables NAT modules.
for module in nft_nat nft_chain_nat; do
  sudo /sbin/modprobe "${module}"
done

# Point the legacy firewall command names at their nft-backed variants.
for tool in iptables ip6tables arptables ebtables; do
  sudo alternatives --set "${tool}" "/usr/sbin/${tool}-nft"
done

# Run with sudo like every other privileged command in this script.
sudo systemctl enable --now nftables

# verify
iptables --version
sudo nft list ruleset
# Write the nftables ruleset for the NAT instance. The heredoc delimiter is
# quoted, so $INT_IF / $EXT_IF below are nftables variables, not shell ones.
#
# The "inet firewall" chains have their hook lines commented out, so they are
# not attached to any hook; only the "ip nat" table is active, and it
# masquerades traffic leaving the external interface.
#
# NOTE(review): INT_IF and EXT_IF are both defined as eth1. INT_IF is only
# referenced from commented-out rules, but confirm the intended internal
# interface before enabling them.
sudo tee /etc/nftables/ec2-nat-instance.conf <<'EOF'
#!/usr/sbin/nft -f
flush ruleset
define INT_IF = eth1
define EXT_IF = eth1
table inet firewall {
  chain inbound {
    #type filter hook input priority 0; policy drop;
    #ct state established,related accept
    #ct state invalid drop
    iifname lo accept
    #ip protocol icmp limit rate 4/second accept
    #ip6 nexthdr ipv6-icmp limit rate 4/second accept
    #ip protocol igmp limit rate 4/second accept
    #iifname $INT_IF udp dport { 67, 68 } accept
    #iifname $INT_IF tcp dport 22 ip saddr $INT_NET accept
    #log prefix "[nftables] Inbound Denied: " flags all counter drop
  }
  chain forward {
    #type filter hook forward priority 0; policy drop;
    #ct state established,related accept
    #ct state invalid drop
    #ip saddr { $INT_NET, $EXT_HOST } accept
    #ip daddr $WEB tcp dport 80 limit rate 10/second log prefix "[nftables] Web Server Access: " accept
    #log prefix "[nftables] Forward Denied: " flags all counter drop
  }
  chain outbound {
    #type filter hook output priority 0; policy accept;
  }
}
table ip nat {
  chain prerouting {
    type nat hook prerouting priority -100; policy accept;
  }
  chain postrouting {
    type nat hook postrouting priority 100; policy accept;
    oifname $EXT_IF masquerade # EXT_IF is our external interface
  }
}
EOF
# Have the nftables service load the NAT ruleset at start-up. The path must
# match the file this script writes (/etc/nftables/ec2-nat-instance.conf);
# an include of a non-existent file makes the ruleset fail to load.
nft_include='include "/etc/nftables/ec2-nat-instance.conf"'
# Append only when the exact line is not already present, so re-running the
# script does not add duplicate includes.
if ! sudo grep -qxF -- "${nft_include}" /etc/sysconfig/nftables.conf 2>/dev/null; then
  printf '%s\n' "${nft_include}" | sudo tee -a /etc/sysconfig/nftables.conf
fi

# Reboot the instance as the final step.
sudo systemctl reboot
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment