-
-
Save elico/492d8f75f584ec1bed98b2a054a02cbb to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
#
# Policy-routing / nftables setup that spreads LAN traffic over two next-hop
# gateways. All tunables live in this section; the rest of the script only
# reads them.

# Destination network whose main-table route is removed and then re-added
# via NEXT_HOP_1 by the routing section below.
DEST_NET="192.168.111.0/24"

# NOTE(review): NEXT_HOPS is not referenced anywhere in the visible script;
# the hash modulus and vmap entries are written out literally as 2.
NEXT_HOPS="2"

# Gateways used as the default route of each per-uplink routing table.
NEXT_HOP_1="192.168.126.202"
NEXT_HOP_2="192.168.126.203"

# Routing table ids; selected by fwmark 1 and fwmark 2 respectively.
NEXT_HOP_1_TABLE="202"
NEXT_HOP_2_TABLE="203"

# External tools.
NFTABLES="/usr/sbin/nft"
# NOTE(review): IPTABLES is not referenced anywhere in the visible script.
IPTABLES="/sbin/iptables"
IP="/sbin/ip"

# LAN is matched as the input interface of new connections to balance;
# WAN is the output interface on which traffic is masqueraded.
LAN="eth0"
WAN="eth1"
## Disabling Reverse path filter
# For every rp_filter sysctl under /proc/sys/net/ipv4/conf/, print its path
# and current value, then write 0 to it.
# NOTE(review): presumably needed because strict reverse-path checks would
# drop traffic that is policy-routed over a different uplink - confirm.
for rp_filter_path in /proc/sys/net/ipv4/conf/*/rp_filter; do
  # An unmatched glob is left as the literal pattern; skip it rather than
  # trying to read from / write to a path that does not exist.
  [[ -e "${rp_filter_path}" ]] || continue
  printf '%s\n' "${rp_filter_path}"
  cat -- "${rp_filter_path}"
  echo 0 > "${rp_filter_path}"
done
#######################################
# Rebuild one per-uplink routing table: flush it, copy every non-default
# route currently printed by `ip route show` into it, then add a default
# route through the given gateway.
# Globals:   IP (read)
# Arguments: $1 - routing table id
#            $2 - gateway address for the table's default route
# Outputs:   whatever the ip commands print
#######################################
clone_main_routes_into_table() {
  local table=$1
  local gateway=$2
  local -a route_words

  "${IP}" route flush table "${table}"

  # Each output line is split into words so it can be replayed verbatim as
  # arguments to `ip route add`.
  # NOTE(review): flags that `ip route show` may print but `ip route add`
  # does not accept would make that single add fail - confirm on target.
  "${IP}" route show \
    | grep -Ev '^default' \
    | while read -r -a route_words; do
        (( ${#route_words[@]} > 0 )) || continue
        "${IP}" route add table "${table}" "${route_words[@]}"
      done

  "${IP}" route add default via "${gateway}" table "${table}"
}

# The DEST_NET route is removed before the tables are cloned and re-added
# afterwards (presumably so it is not copied into the per-uplink tables).
# The delete reports an error when no such route exists; that is harmless.
"${IP}" route del "${DEST_NET}"

clone_main_routes_into_table "${NEXT_HOP_1_TABLE}" "${NEXT_HOP_1}"
clone_main_routes_into_table "${NEXT_HOP_2_TABLE}" "${NEXT_HOP_2}"

"${IP}" route add "${DEST_NET}" via "${NEXT_HOP_1}"
#NAT
#######################################
# Create the ip nat table, its postrouting base chain, and the masquerade
# rule for traffic leaving via WAN. The three commands are fed to nft as one
# script so they are applied as a single transaction (all or nothing).
# Globals:   NFTABLES (read), WAN (read)
# Arguments: none
#######################################
load_nat_ruleset() {
  # NOTE(review): `oif` is kept from the original, whereas the mangle rules
  # match by name (`iifname`); consider `oifname` if WAN can be re-created.
  # NOTE(review): `add rule` appends, so re-running the script adds a second
  # identical masquerade rule.
  "${NFTABLES}" -f - <<EOF
add table ip nat
add chain ip nat postrouting { type nat hook postrouting priority 100; policy accept; }
add rule ip nat postrouting oif "${WAN}" masquerade
EOF
}

load_nat_ruleset
# MANGLE
#######################################
# Create the ip mangle table and every chain/rule of the per-connection
# load balancer. All commands are fed to nft as one script, so the ruleset
# is applied as a single transaction instead of 25 separate invocations
# that live traffic could observe half-built.
#
# Rule order inside each chain is the same as the original command order.
# Globals:   NFTABLES (read), LAN (read)
# Arguments: none
#######################################
load_mangle_ruleset() {
  # NOTE(review): `add rule` appends, so re-running the script duplicates
  # every rule below; the chains are not flushed here.
  "${NFTABLES}" -f - <<EOF
add table ip mangle
add chain ip mangle prerouting { type filter hook prerouting priority -150; policy accept; }
add chain ip mangle input { type filter hook input priority -150; policy accept; }
add chain ip mangle forward { type filter hook forward priority -150; policy accept; }
add chain ip mangle output { type route hook output priority -150; policy accept; }
add chain ip mangle postrouting { type filter hook postrouting priority -150; policy accept; }

# One chain per uplink; each stamps its connection mark.
add chain ip mangle wan1
add rule ip mangle wan1 counter ct mark set 0x1
add chain ip mangle wan2
add rule ip mangle wan2 counter ct mark set 0x2

# 5-tuple/flow/PCC LOAD Balance
# Hash the flow, take it modulo 2, and jump to wan1 or wan2 via a verdict map.
add chain ip mangle PCC_OUT_TCP
add rule ip mangle PCC_OUT_TCP counter jhash ip saddr . tcp sport . ip daddr . tcp dport mod 2 vmap { 0 : jump wan1, 1 : jump wan2 }
add chain ip mangle PCC_OUT_UDP
add rule ip mangle PCC_OUT_UDP counter jhash ip saddr . udp sport . ip daddr . udp dport mod 2 vmap { 0 : jump wan1, 1 : jump wan2 }
# Non-TCP/UDP traffic is hashed on addresses only; TCP and UDP return early.
add chain ip mangle PCC_OUT_OTHERS
add rule ip mangle PCC_OUT_OTHERS counter ip protocol { tcp, udp } return
add rule ip mangle PCC_OUT_OTHERS counter jhash ip saddr . ip daddr mod 2 vmap { 0 : jump wan1, 1 : jump wan2 }

# prerouting: copy the connection mark onto the packet, classify new
# connections arriving on LAN, then set the packet mark from the ct mark.
add rule ip mangle prerouting counter meta mark set ct mark
add rule ip mangle prerouting ct mark != 0x0 counter ct mark set mark
add rule ip mangle prerouting iifname "${LAN}" ip protocol tcp ct state new counter jump PCC_OUT_TCP
add rule ip mangle prerouting iifname "${LAN}" ip protocol udp ct state new counter jump PCC_OUT_UDP
add rule ip mangle prerouting iifname "${LAN}" ct state new counter jump PCC_OUT_OTHERS
add rule ip mangle prerouting ct mark 0x1 counter meta mark set 0x1
add rule ip mangle prerouting ct mark 0x2 counter meta mark set 0x2

# postrouting: store the packet mark back into the connection mark.
add rule ip mangle postrouting counter ct mark set mark
EOF
}

load_mangle_ruleset
#######################################
# Add the policy rule "fwmark <mark> -> table <table>" unless `ip rule`
# already lists it.
# Globals:   IP (read)
# Arguments: $1 - firewall mark (decimal or 0x-prefixed hex)
#            $2 - routing table id
# Outputs:   whatever `ip rule add` prints
#######################################
ensure_fwmark_rule() {
  local mark=$1
  local table=$2
  local wanted

  # The listing is matched against a hex mark ("fwmark 0x1").
  printf -v wanted 'from all fwmark 0x%x lookup %s' "${mark}" "${table}"

  # -F: match the text literally; -w: whole words only, so table 202 does
  # not match an existing rule for e.g. table 2020.
  if ! "${IP}" rule | grep -qFw -- "${wanted}"; then
    "${IP}" rule add fwmark "${mark}" table "${table}"
  fi
}

ensure_fwmark_rule 1 "${NEXT_HOP_1_TABLE}"
ensure_fwmark_rule 2 "${NEXT_HOP_2_TABLE}"
Hi,
thanks for your answer.
Could you please update your nftables-rules-dump-putput.txt with your latest rule set? I think you have only updated your .rb script... Does your final rule set change?
I have some services on the firewall/router itself (OpenVPN, for example), and I have to begin my mangle / OUTPUT chain with "ct mark != 0x0 counter meta mark set ct mark" to allow these services to work as expected.
My complete mangle / OUTPUT chain is :
chain OUTPUT {
type route hook output priority mangle; policy accept;
# necessary for local services (ovpn)
ct mark != 0x0 counter meta mark set ct mark
# force reroute-check DHCPC RENEW skgid process via its own iface
udp sport 68 udp dport 67 meta skgid $skgid_wan1 counter jump MWAN1_SL comment "mwan1_dhcpc_skgid"
udp sport 68 udp dport 67 meta skgid $skgid_wan2 counter jump MWAN2_SL comment "mwan2_dhcpc_skgid"
#udp sport 68 udp dport 67 meta skgid $skgid_wan3 counter jump MWAN3_SL comment "mwan3_dhcpc_skgid"
}
Moreover, if I don't have "ct mark != 0x0 counter meta mark set ct mark" at the END of mangle / PREROUTING, the packets from internet clients to services on the router/firewall itself (OpenVPN) are NOT marked as expected (but it works). I think it is not clean... Finally, my complete mangle / PREROUTING is:
chain PREROUTING {
type filter hook prerouting priority mangle; policy accept;
iifname $iface_wan1 ct state new counter jump MWAN1 comment "mwan1_orange1"
iifname $iface_wan2 ct state new counter jump MWAN2 comment "mwan2_orange2"
iifname $iface_wan3 ct state new counter jump MWAN3 comment "mwan3_lte"
iifname $iface_beta ct state new counter jump MWAN5 comment "mwan5_beta"
iifname $iface_vanisher1 ct state new counter jump MWAN6 comment "mwan6_vanisher_orange1"
iifname $iface_vanisher2 ct state new counter jump MWAN7 comment "mwan7_vanisher_orange2"
iifname $iface_vanisher3 ct state new counter jump MWAN9 comment "mwan9_vanisher_lte"
ct state new counter jump MWAN
ct mark != 0x0 counter meta mark set ct mark
}
I do not understand why it is required to begin mangle / PREROUTING with "ct mark != 0x0 counter ct mark set mark"...
Finally, I have a very (too) complicated daemon script to monitor the nft tables; I will simplify and refactor it following your recommendations :)
Do you prefer to continue this on your email ?
thanks.
@cyayon it's up to you if email or not.
I will try later to upgrade the nftables-rules-dump-putput.txt in the repo.
Since netfilter (nftables/iptables) has jumps and gotos, you can partition the tables and update only specific parts of them using an nft script.
Since nft applies changes atomically (unlike iptables), you are guaranteed that changing a vmap or another part of the ruleset will not affect traffic and will not cause a disruption of service.
I don't know how you check or how you do things and feel free to share more if you would like on email.
The main difference can be seen at:
elico/mwan-nft-lb-example@22e6eb2#diff-68054fdcdf4d0108a2b62e83360a742fb3b8334f4bb93b6f285ca27403eca11a
For a simple ruleset with 10-50 +- commands you can use a single nft add rule
or any similar single command action.
But when you have 150 commands it takes a lot of time to run and is also prone to race conditions.
With 1000+ nft commands you must use an nft script for the sake of performance.
Comparing 100 rules/commands with the 1.2k commands I used: it took more than 30 seconds, compared to 1-2 seconds inside an nft script.
Be in touch!
thanks.
@cyayon I updated the rules.
Take a peek at:
https://github.com/elico/mwan-nft-lb-example/blob/main/run-lab.sh
It creates a full lab in Linux Namespaces.
You might need to tune the code a bit to make it work in your environment.
The scripts create multiple routers and a client that simulates a network with 10 GWs.
thanks !
@cyayon Hi,
Thanks. Due to your questions I had to rethink this script to find out whether there is a better way to implement this ruleset.
So I have updated the example repository at: https://github.com/elico/mwan-nft-lb-example
The reason I am using
meta mark set ct mark
and ct mark != 0x0 counter ct mark set mark
is for a counter. The real one that we need at the beginning is only the:
ct mark != 0x0 counter ct mark set mark
For the marking to work persistently we need to first create a CT mark, then stamp a mark on the packet so it would be routed properly.
However we don't want to do this LB again from 0, on the postrouting we save the mark into the CT table.
The above was for the first packet of a "connection" ie in the
new
state. So, let's say, a three-way handshake in nft should be like this: The restore mark should be the first rule in the prerouting chain, or else the connection mark will not be attached to the packet.
In your ruleset you are using:
Which might be good enough in many scenarios to avoid remarking etc.
I don't know why the counters for the
counter packets 1 bytes 108 ct state established,related return
in the MWAN chain is exactly 1, but never mind this. Let me walk through the MWAN rules and see,
let's say, to port 22 and port 80
new connection to port 22 will hit { 4 or 5 or 6 }
new connection to port 80 will hit { 4 or 5 or 6 or 7 or 8 or 9 }
The last hit..
I believe the vmap solution is the best since it guarantees a hit without any misses.
Another good thing would be that in a case of a link down you only need to replace one thing, the vmap content.
You can leave the routing mark rules and tables as is but change the vmap to use only the links/gw which are in UP state.
I don't know what daemon you have but you can simplify it like this:
Run a ping (or other) monitoring daemon per GW/interface; every daemon will have a "state" file like:
/var/run/wan_monitor/up/{wan1,wan2,wan3..}
if the wan interface is down then remove the state file, if it's up touch the file.
and a simple text file at:
/var/run/wan_monitor/status
The main daemon will do something like this in any language you want:
for sites/systems like pcloud which requires the same srcip you would probably want to balance with a:
srcip+dstip or srcip+dstip+port
It will balance out connections however only to the same exact destination.
For example 8.8.8.8 and 8.8.4.4 and 1.1.1.1 will probably be balanced across 3 wan connections for the same client.
However it's possible that a VOD service has a multiple subnets and their redirector and auth service is on x.1.z.1 while the streaming service is on x.2.y.1.
The VOD auth service knows the client IP of wan1, and when the client accesses x.2.y.1 it will be routed via wan2 or up.
The auth token to the streaming service contains information on the client IP of wan1 and might block access since the request is from wan2.
For regular users, keeping a persistent outbound WAN interface for 30 minutes per client srcip is acceptable.
There are other techniques to LB and I cannot answer since it requires actual sys/net admin hours and I cannot just give them freely.. food.. car.. apartment etc.
Anyway my email and mobile are available publicly: [email protected] , +972-5-28704261