Linux IPVS thundering herd mitigation
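
The patch below mitigates the IPVS "thundering herd" problem: when a real server's weight is raised (typically because it has come back into rotation), the least-connection schedulers (lc, wlc, lblc, lblcr) see it as nearly idle and dump every new connection onto it at once. The change adds a per-destination slow-start handicap that temporarily inflates the server's apparent connection count in ip_vs_dest_conn_overhead() and that decays with each scheduling decision.
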
diff --git a/include/linux/ip_vs.h b/include/linux/ip_vs.h
index 4deb383..3d4e7af 100644
--- a/include/linux/ip_vs.h
+++ b/include/linux/ip_vs.h
@@ -27,6 +27,18 @@
#define IP_VS_DEST_F_AVAILABLE 0x0001 /* server is available */
#define IP_VS_DEST_F_OVERLOAD 0x0002 /* server is overloaded */
+ /*
+ * Advisory flags for slow start.
+ *
+ * The absolute value size of the weight change will be stored
+ * in dest->slow_start_data. The flag and slow_start_data may
+ * be used and modified by the scheduler to effect slow start.
+ */
+#define IP_VS_DEST_F_WEIGHT_INC 0x0004 /* Weight has been increased */
+#define IP_VS_DEST_F_WEIGHT_DEC 0x0008 /* Weight has been decreased */
+#define IP_VS_DEST_F_WEIGHT_MASK \
+ (IP_VS_DEST_F_WEIGHT_INC|IP_VS_DEST_F_WEIGHT_DEC)
+
/*
* IPVS sync daemon states
*/
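
(Aside, not part of the patch: a minimal standalone C sketch of the advisory-flag protocol these bits implement, modelled on __ip_vs_update_dest() and ip_vs_conn_slow_start_dest_handicap() further down. The struct and function names in the sketch are illustrative only.)

    /* Standalone model of the advisory-flag protocol: the writer
     * records the direction and magnitude of a weight change, and
     * the scheduler consumes the hint once, then clears the mask. */
    #include <stdio.h>

    #define F_WEIGHT_INC  0x0004
    #define F_WEIGHT_DEC  0x0008
    #define F_WEIGHT_MASK (F_WEIGHT_INC | F_WEIGHT_DEC)

    struct dest { unsigned int flags, weight, slow_start_data; };

    /* Mirrors the hinting added to __ip_vs_update_dest() below. */
    static void set_weight(struct dest *d, unsigned int new_weight)
    {
            d->flags &= ~F_WEIGHT_MASK;
            if (new_weight > d->weight) {
                    d->slow_start_data = new_weight - d->weight;
                    d->flags |= F_WEIGHT_INC;
            } else if (new_weight < d->weight) {
                    d->slow_start_data = d->weight - new_weight;
                    d->flags |= F_WEIGHT_DEC;
            }
            d->weight = new_weight;
    }

    int main(void)
    {
            struct dest d = { 0, 1, 0 };

            set_weight(&d, 100);  /* server brought back up */
            printf("flags=%#x delta=%u\n", d.flags, d.slow_start_data);
            /* prints flags=0x4 delta=99; the scheduler will notice
             * F_WEIGHT_INC, set up slow start, and clear the mask */
            return 0;
    }
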
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index de527d1..894ad55 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -673,6 +673,10 @@ struct ip_vs_dest {
__be16 vport; /* virtual port number */
union nf_inet_addr vaddr; /* virtual IP address */
__u32 vfwmark; /* firewall mark of service */
+
+ /* for slow start */
+ atomic_t slow_start_data;
+ atomic_t slow_start_data2;
};
@@ -1209,6 +1213,9 @@ ip_vs_find_dest(struct net *net, int af, const union nf_inet_addr *daddr,
__be16 dport, const union nf_inet_addr *vaddr, __be16 vport,
__u16 protocol, __u32 fwmark);
extern struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp);
+extern inline unsigned int
+ip_vs_conn_slow_start_dest_handicap(struct ip_vs_dest *dest,
+ struct ip_vs_service *svc);
/*
@@ -1407,7 +1414,7 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
#endif
static inline unsigned int
-ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
+ip_vs_dest_conn_overhead(struct ip_vs_dest *dest, struct ip_vs_service *svc)
{
/*
* We think the overhead of processing active connections is 256
@@ -1416,7 +1423,8 @@ ip_vs_dest_conn_overhead(struct ip_vs_dest *dest)
* use the following formula to estimate the overhead now:
* dest->activeconns*256 + dest->inactconns
*/
- return (atomic_read(&dest->activeconns) << 8) +
+ return ((atomic_read(&dest->activeconns) +
+ ip_vs_conn_slow_start_dest_handicap(dest, svc)) << 8) +
atomic_read(&dest->inactconns);
}
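
(Aside, not part of the patch: a worked example of the modified overhead formula. With the handicap folded into the active count, a freshly re-weighted server with no connections can be made to look about as loaded as its peers, so the least-connection schedulers stop dogpiling it.)

    /* Standalone model of the modified ip_vs_dest_conn_overhead():
     * the slow-start handicap is added to the active-connection
     * count before the usual activeconns*256 + inactconns weighting. */
    #include <stdio.h>

    static unsigned int overhead(unsigned int active, unsigned int inact,
                                 unsigned int handicap)
    {
            return ((active + handicap) << 8) + inact;
    }

    int main(void)
    {
            printf("idle, no handicap: %u\n", overhead(0, 0, 0));    /* 0     */
            printf("idle, handicap 40: %u\n", overhead(0, 0, 40));   /* 10240 */
            printf("busy peer server:  %u\n", overhead(40, 100, 0)); /* 10340 */
            return 0;
    }
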
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 5c318f7..85ecc88 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -52,6 +52,8 @@
#include <net/ip_vs.h>
+EXPORT_SYMBOL_GPL(ip_vs_conn_slow_start_dest_handicap);
+
/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */
static DEFINE_MUTEX(__ip_vs_mutex);
@@ -764,6 +766,21 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
{
struct netns_ipvs *ipvs = net_ipvs(svc->net);
int conn_flags;
+ int old_weight;
+
+ /* set hints for slow start */
+ dest->flags &= ~IP_VS_DEST_F_WEIGHT_MASK;
+
+ old_weight = atomic_read(&dest->weight);
+
+ if (old_weight < udest->weight) {
+ atomic_set(&dest->slow_start_data, udest->weight - old_weight);
+ dest->flags |= IP_VS_DEST_F_WEIGHT_INC;
+ }
+ else if (old_weight > udest->weight) {
+ atomic_set(&dest->slow_start_data, old_weight - udest->weight);
+ dest->flags |= IP_VS_DEST_F_WEIGHT_DEC;
+ }
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
@@ -3472,6 +3489,130 @@ out:
return ret;
}
+static void
+__ip_vs_conn_set_slow_start(struct ip_vs_dest *dest, struct ip_vs_service *svc)
+{
+ __u32 ss_handicap;
+ __u32 ss_shift;
+ __u32 ndest;
+ __u32 w = 0;
+ __u32 dest_w = 0;
+ struct list_head *l, *e;
+ struct ip_vs_dest *d;
+
+ /*
+ * If the weight is zero just set the slow_start hint and data to
+ * zero too as they won't be used.
+ */
+
+ if ((dest->flags & IP_VS_DEST_F_WEIGHT_DEC) ||
+ !(dest_w = atomic_read(&dest->weight))) {
+#ifdef CONFIG_IP_VS_DEBUG
+ IP_VS_DBG(1, "CONN slow_start: null\n");
+#endif
+ atomic_set(&dest->slow_start_data, 0);
+ atomic_set(&dest->slow_start_data2, 0);
+ return;
+ }
+
+ /*
+ * Calculate a weighted number of connections this server would
+ * have if all the currently active connections were redistributed
+ * limited to a maximum of 64k.
+ */
+ l = &svc->destinations;
+
+ ss_handicap = 0;
+ ndest = 0;
+
+ for (e = l->next; e != l; e = e->next) {
+ d = list_entry(e, struct ip_vs_dest, n_list);
+ w = atomic_read(&d->weight);
+
+ if (w < 1 || d == dest) {
+ continue;
+ }
+
+ ndest++;
+
+ /* Try to avoid overflowing ss_handicap */
+ ss_shift = atomic_read(&d->activeconns);
+
+ if (ss_shift & 0xffff0000)
+ ss_shift = 0xffff;
+
+ ss_shift = (ss_shift << 16) / (w & 0xffff);
+
+ if (~0L - ss_handicap < ss_shift) {
+ ss_handicap = ~0L;
+ break;
+ }
+
+ ss_handicap += ss_shift;
+ }
+
+ if (ndest)
+ ss_handicap = (ss_handicap * dest_w / ndest) >> 16;
+
+ /* ss_shift = log_2((ss_handicap & 0xfff) >> 3) */
+ if (ss_handicap) {
+ __u32 i;
+ ss_shift = ss_handicap;
+
+ for (i = 12; i > 0; i--) {
+ if (ss_shift & 0x8000)
+ break;
+
+ ss_shift <<= 1;
+ }
+
+ ss_shift = i;
+ ss_handicap <<= ss_shift;
+ }
+ else
+ ss_shift = 0;
+
+ atomic_set(&dest->slow_start_data, ss_handicap);
+ atomic_set(&dest->slow_start_data2, ss_shift);
+
+#ifdef CONFIG_IP_VS_DEBUG
+ IP_VS_DBG_BUF(1, "CONN slow_start_init: server %s:%u "
+ "handicap=%u (%u) shift=%u ndest=%u\n",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port), ss_handicap,
+ ss_handicap >> ss_shift, ss_shift, ndest);
+#endif
+}
+
+inline unsigned int
+ip_vs_conn_slow_start_dest_handicap(struct ip_vs_dest *dest,
+ struct ip_vs_service *svc)
+{
+ unsigned int handicap;
+
+ /* Set up slow_start if weight has recently changed */
+ if (unlikely(dest->flags & IP_VS_DEST_F_WEIGHT_MASK)) {
+ __ip_vs_conn_set_slow_start(dest, svc);
+ dest->flags &= ~IP_VS_DEST_F_WEIGHT_MASK;
+ }
+
+ handicap = atomic_read(&dest->slow_start_data);
+
+ if (unlikely(!handicap))
+ return 0;
+
+ handicap--;
+ atomic_set(&dest->slow_start_data, handicap);
+
+#ifdef CONFIG_IP_VS_DEBUG
+ if (unlikely(!handicap))
+ IP_VS_DBG_BUF(1, "CONN slow_start_end: server %s:%u ",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
+#endif
+
+ return handicap >> atomic_read(&dest->slow_start_data2);
+}
static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
{
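
(Aside, not part of the patch: a standalone model of the handicap life cycle above. __ip_vs_conn_set_slow_start() estimates the connections the re-weighted server would hold if the peers' load were redistributed, i.e. the average active connections per unit of weight in 16.16 fixed point times the new weight, then scales the result up by a small shift so that even modest handicaps decay smoothly. ip_vs_conn_slow_start_dest_handicap() then burns one raw unit per scheduling decision while reporting raw >> shift to the overhead formula.)

    /* Standalone model of the slow-start handicap decay.  A handicap
     * of h is stored as h << shift and decremented once per scheduling
     * decision, so the effective value (raw >> shift) drops by one
     * every 2^shift decisions. */
    #include <stdio.h>

    int main(void)
    {
            unsigned int handicap = 40;          /* estimated fair share */
            unsigned int probe = handicap, shift, i;

            /* Normalisation loop from __ip_vs_conn_set_slow_start():
             * find how far the handicap can be scaled up, capped at
             * 12 bits. */
            for (i = 12; i > 0; i--) {
                    if (probe & 0x8000)
                            break;
                    probe <<= 1;
            }
            shift = i;
            handicap <<= shift;   /* 40 -> raw 160, shift 2 */

            printf("raw=%u shift=%u effective=%u\n",
                   handicap, shift, handicap >> shift);

            /* One raw unit per scheduling decision: the effective
             * handicap reaches zero only after handicap << shift
             * (here 160) decisions rather than all at once. */
            for (i = 0; i < 8 && handicap; i++)
                    handicap--;
            printf("after 8 decisions: effective=%u\n", handicap >> shift);
            return 0;
    }
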
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 0f16283..bc0927d 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -413,7 +413,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
continue;
if (atomic_read(&dest->weight) > 0) {
least = dest;
- loh = ip_vs_dest_conn_overhead(least);
+ loh = ip_vs_dest_conn_overhead(least, svc);
goto nextstage;
}
}
@@ -427,7 +427,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc)
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = ip_vs_dest_conn_overhead(dest);
+ doh = ip_vs_dest_conn_overhead(dest, svc);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index eec797f..707a32a 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -161,7 +161,8 @@ static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set)
}
/* get weighted least-connection node in the destination set */
-static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
+static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set,
+ struct ip_vs_service *svc)
{
register struct ip_vs_dest_set_elem *e;
struct ip_vs_dest *dest, *least;
@@ -178,7 +179,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
if ((atomic_read(&least->weight) > 0)
&& (least->flags & IP_VS_DEST_F_AVAILABLE)) {
- loh = ip_vs_dest_conn_overhead(least);
+ loh = ip_vs_dest_conn_overhead(least, svc);
goto nextstage;
}
}
@@ -191,7 +192,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = ip_vs_dest_conn_overhead(dest);
+ doh = ip_vs_dest_conn_overhead(dest, svc);
if ((loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight))
&& (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -213,7 +214,8 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set)
/* get weighted most-connection node in the destination set */
-static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
+static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set,
+ struct ip_vs_service *svc)
{
register struct ip_vs_dest_set_elem *e;
struct ip_vs_dest *dest, *most;
@@ -226,7 +228,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
list_for_each_entry(e, &set->list, list) {
most = e->dest;
if (atomic_read(&most->weight) > 0) {
- moh = ip_vs_dest_conn_overhead(most);
+ moh = ip_vs_dest_conn_overhead(most, svc);
goto nextstage;
}
}
@@ -236,7 +238,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set)
nextstage:
list_for_each_entry(e, &set->list, list) {
dest = e->dest;
- doh = ip_vs_dest_conn_overhead(dest);
+ doh = ip_vs_dest_conn_overhead(dest, svc);
/* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */
if ((moh * atomic_read(&dest->weight) <
doh * atomic_read(&most->weight))
@@ -583,7 +585,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
if (atomic_read(&dest->weight) > 0) {
least = dest;
- loh = ip_vs_dest_conn_overhead(least);
+ loh = ip_vs_dest_conn_overhead(least, svc);
goto nextstage;
}
}
@@ -597,7 +599,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc)
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = ip_vs_dest_conn_overhead(dest);
+ doh = ip_vs_dest_conn_overhead(dest, svc);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
@@ -662,7 +664,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
/* Get the least loaded destination */
read_lock(&en->set.lock);
- dest = ip_vs_dest_set_min(&en->set);
+ dest = ip_vs_dest_set_min(&en->set, svc);
read_unlock(&en->set.lock);
/* More than one destination + enough time passed by, cleanup */
@@ -672,7 +674,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
struct ip_vs_dest *m;
write_lock(&en->set.lock);
- m = ip_vs_dest_set_max(&en->set);
+ m = ip_vs_dest_set_max(&en->set, svc);
if (m)
ip_vs_dest_set_erase(&en->set, m);
write_unlock(&en->set.lock);
diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c
index f391819..743d7b7 100644
--- a/net/netfilter/ipvs/ip_vs_lc.c
+++ b/net/netfilter/ipvs/ip_vs_lc.c
@@ -46,7 +46,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
if ((dest->flags & IP_VS_DEST_F_OVERLOAD) ||
atomic_read(&dest->weight) == 0)
continue;
- doh = ip_vs_dest_conn_overhead(dest);
+ doh = ip_vs_dest_conn_overhead(dest, svc);
if (!least || doh < loh) {
least = dest;
loh = doh;
diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c
index bc1bfc4..074e61b 100644
--- a/net/netfilter/ipvs/ip_vs_wlc.c
+++ b/net/netfilter/ipvs/ip_vs_wlc.c
@@ -55,7 +55,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) &&
atomic_read(&dest->weight) > 0) {
least = dest;
- loh = ip_vs_dest_conn_overhead(least);
+ loh = ip_vs_dest_conn_overhead(least, svc);
goto nextstage;
}
}
@@ -69,7 +69,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
list_for_each_entry_continue(dest, &svc->destinations, n_list) {
if (dest->flags & IP_VS_DEST_F_OVERLOAD)
continue;
- doh = ip_vs_dest_conn_overhead(dest);
+ doh = ip_vs_dest_conn_overhead(dest, svc);
if (loh * atomic_read(&dest->weight) >
doh * atomic_read(&least->weight)) {
least = dest;
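
(Usage note, not part of the patch: the new path is exercised by any weight update on an existing destination, since both the sockopt and netlink control interfaces funnel into __ip_vs_update_dest(). For example, raising a recovered real server's weight with something like "ipvsadm -e -t 192.0.2.1:80 -r 198.51.100.7:80 -w 100" should arm the WEIGHT_INC hint, and the next lc/wlc scheduling pass will compute and begin draining the handicap. Addresses here are illustrative.)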