Created
May 26, 2015 22:13
-
-
Save rzezeski/ba4753873d54c3a744d4 to your computer and use it in GitHub Desktop.
usr/src/uts/common/inet diff (gate to joyent)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Only in illumos-joyent/usr/src/uts/common/inet: inet_hash.h | |
diff -ur illumos-gate/usr/src/uts/common/inet/ip/ip_attr.c illumos-joyent/usr/src/uts/common/inet/ip/ip_attr.c | |
--- illumos-gate/usr/src/uts/common/inet/ip/ip_attr.c 2015-05-26 22:07:54.760515401 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/ip/ip_attr.c 2015-05-26 22:08:56.031054703 +0000 | |
@@ -909,6 +909,11 @@ | |
*/ | |
if (ixa->ixa_free_flags & IXA_FREE_CRED) | |
crhold(ixa->ixa_cred); | |
+ | |
+ /* | |
+ * There is no cleanup in progress on this new copy. | |
+ */ | |
+ ixa->ixa_tcpcleanup = IXATC_IDLE; | |
} | |
/* | |
diff -ur illumos-gate/usr/src/uts/common/inet/ip/ip_if.c illumos-joyent/usr/src/uts/common/inet/ip/ip_if.c | |
--- illumos-gate/usr/src/uts/common/inet/ip/ip_if.c 2015-05-26 22:07:54.768694777 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/ip/ip_if.c 2015-05-26 22:08:56.041157550 +0000 | |
@@ -25,6 +25,9 @@ | |
* Copyright 2013 Joyent, Inc. | |
* Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved. | |
*/ | |
+/* | |
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved. | |
+ */ | |
/* | |
* This file contains the interface control functions for IP. | |
diff -ur illumos-gate/usr/src/uts/common/inet/ip/ip_squeue.c illumos-joyent/usr/src/uts/common/inet/ip/ip_squeue.c | |
--- illumos-gate/usr/src/uts/common/inet/ip/ip_squeue.c 2015-05-26 22:07:54.776765157 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/ip/ip_squeue.c 2015-05-26 22:08:56.050809608 +0000 | |
@@ -163,7 +163,7 @@ | |
{ | |
squeue_t *sqp; | |
- sqp = squeue_create(ip_squeue_worker_wait, pri); | |
+ sqp = squeue_create(ip_squeue_worker_wait, pri, B_TRUE); | |
ASSERT(sqp != NULL); | |
if (ip_squeue_create_callback != NULL) | |
ip_squeue_create_callback(sqp); | |
diff -ur illumos-gate/usr/src/uts/common/inet/ip/ip_tunables.c illumos-joyent/usr/src/uts/common/inet/ip/ip_tunables.c | |
--- illumos-gate/usr/src/uts/common/inet/ip/ip_tunables.c 2015-05-26 22:07:54.777275286 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/ip/ip_tunables.c 2015-05-26 22:08:56.051415067 +0000 | |
@@ -20,6 +20,7 @@ | |
*/ | |
/* | |
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. | |
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved. | |
* Copyright (c) 2013 by Delphix. All rights reserved. | |
* Copyright (c) 2012, Joyent, Inc. All rights reserved. | |
*/ | |
diff -ur illumos-gate/usr/src/uts/common/inet/ip/ipsecesp.c illumos-joyent/usr/src/uts/common/inet/ip/ipsecesp.c | |
--- illumos-gate/usr/src/uts/common/inet/ip/ipsecesp.c 2015-05-26 22:07:54.782342290 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/ip/ipsecesp.c 2015-05-26 22:08:56.057572680 +0000 | |
@@ -234,8 +234,7 @@ | |
{ | |
espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat", | |
"net", KSTAT_TYPE_NAMED, | |
- sizeof (esp_kstats_t) / sizeof (kstat_named_t), | |
- KSTAT_FLAG_PERSISTENT, stackid); | |
+ sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid); | |
if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL) | |
return (B_FALSE); | |
diff -ur illumos-gate/usr/src/uts/common/inet/ipf/ip_fil_solaris.c illumos-joyent/usr/src/uts/common/inet/ipf/ip_fil_solaris.c | |
--- illumos-gate/usr/src/uts/common/inet/ipf/ip_fil_solaris.c 2015-05-26 22:07:54.799744331 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/ipf/ip_fil_solaris.c 2015-05-26 22:08:56.078426896 +0000 | |
@@ -83,6 +83,14 @@ | |
static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, | |
void *)); | |
static int ipf_hook6 __P((hook_data_t, int, int, void *)); | |
+static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t, | |
+ void *)); | |
+static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t, | |
+ void *)); | |
+static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t, | |
+ void *)); | |
+static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t, | |
+ void *)); | |
extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *)); | |
extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *)); | |
@@ -152,6 +160,16 @@ | |
char *hook6_loop_out = "ipfilter_hook6_loop_out"; | |
char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; | |
+/* vnd IPv4/v6 hook names */ | |
+char *hook4_vnd_in = "ipfilter_hookvndl3v4_in"; | |
+char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz"; | |
+char *hook6_vnd_in = "ipfilter_hookvndl3v6_in"; | |
+char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz"; | |
+char *hook4_vnd_out = "ipfilter_hookvndl3v4_out"; | |
+char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz"; | |
+char *hook6_vnd_out = "ipfilter_hookvndl3v6_out"; | |
+char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz"; | |
+ | |
/* ------------------------------------------------------------------------ */ | |
/* Function: ipldetach */ | |
/* Returns: int - 0 == success, else error. */ | |
@@ -248,6 +266,31 @@ | |
ifs->ifs_ipf_ipv4 = NULL; | |
} | |
+ /* | |
+ * Remove VND hooks | |
+ */ | |
+ if (ifs->ifs_ipf_vndl3v4 != NULL) { | |
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in, | |
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in); | |
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out, | |
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out); | |
+ | |
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0) | |
+ goto detach_failed; | |
+ ifs->ifs_ipf_vndl3v4 = NULL; | |
+ } | |
+ | |
+ if (ifs->ifs_ipf_vndl3v6 != NULL) { | |
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in, | |
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in); | |
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out, | |
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out); | |
+ | |
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0) | |
+ goto detach_failed; | |
+ ifs->ifs_ipf_vndl3v6 = NULL; | |
+ } | |
+ | |
#undef UNDO_HOOK | |
#ifdef IPFDEBUG | |
@@ -445,6 +488,48 @@ | |
} | |
/* | |
+ * Add VND INET hooks | |
+ */ | |
+ ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET); | |
+ if (ifs->ifs_ipf_vndl3v4 == NULL) | |
+ goto hookup_failed; | |
+ | |
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in, | |
+ hook4_vnd_in, hook4_vnd_in_gz, ifs); | |
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out, | |
+ hook4_vnd_out, hook4_vnd_out_gz, ifs); | |
+ ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4, | |
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0); | |
+ if (!ifs->ifs_hookvndl3v4_physical_in) | |
+ goto hookup_failed; | |
+ | |
+ ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4, | |
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0); | |
+ if (!ifs->ifs_hookvndl3v4_physical_out) | |
+ goto hookup_failed; | |
+ | |
+ | |
+ /* | |
+ * VND INET6 hooks | |
+ */ | |
+ ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6); | |
+ if (ifs->ifs_ipf_vndl3v6 == NULL) | |
+ goto hookup_failed; | |
+ | |
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in, | |
+ hook6_vnd_in, hook6_vnd_in_gz, ifs); | |
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out, | |
+ hook6_vnd_out, hook6_vnd_out_gz, ifs); | |
+ ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6, | |
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0); | |
+ if (!ifs->ifs_hookvndl3v6_physical_in) | |
+ goto hookup_failed; | |
+ | |
+ ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6, | |
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0); | |
+ if (!ifs->ifs_hookvndl3v6_physical_out) | |
+ goto hookup_failed; | |
+ /* | |
* Reacquire ipf_global, now it is safe. | |
*/ | |
WRITE_ENTER(&ifs->ifs_ipf_global); | |
@@ -1011,7 +1096,6 @@ | |
return ENXIO; | |
unit = isp->ipfs_minor; | |
- | |
/* | |
* ipf_find_stack returns with a read lock on ifs_ipf_global | |
*/ | |
@@ -2045,6 +2129,42 @@ | |
} | |
/* ------------------------------------------------------------------------ */ | |
+/* Function: ipf_hookvndl3_in */ | |
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */ | |
+/* Parameters: event(I) - pointer to event */ | |
+/* info(I) - pointer to hook information for firewalling */ | |
+/* */ | |
+/* The vnd hooks are private hooks to ON. They represent a layer 2 */ | |
+/* datapath generally used to implement virtual machines. The driver sends */ | |
+/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */ | |
+/* them is in the upper 16 bits while the remaining bits are the */ | |
+/* traditional packet hook flags. */ | |
+/* */ | |
+/* They end up calling the appropriate traditional ip hooks. */ | |
+/* ------------------------------------------------------------------------ */ | |
+/*ARGSUSED*/ | |
+int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) | |
+{ | |
+ return ipf_hook4_in(token, info, arg); | |
+} | |
+ | |
+int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) | |
+{ | |
+ return ipf_hook6_in(token, info, arg); | |
+} | |
+ | |
+/*ARGSUSED*/ | |
+int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) | |
+{ | |
+ return ipf_hook4_out(token, info, arg); | |
+} | |
+ | |
+int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) | |
+{ | |
+ return ipf_hook6_out(token, info, arg); | |
+} | |
+ | |
+/* ------------------------------------------------------------------------ */ | |
/* Function: ipf_hook4_loop_in */ | |
/* Returns: int - 0 == packet ok, else problem, free packet if not done */ | |
/* Parameters: event(I) - pointer to event */ | |
diff -ur illumos-gate/usr/src/uts/common/inet/ipf/ipf.conf illumos-joyent/usr/src/uts/common/inet/ipf/ipf.conf | |
--- illumos-gate/usr/src/uts/common/inet/ipf/ipf.conf 2015-05-26 22:07:54.805361628 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/ipf/ipf.conf 2015-05-26 22:08:56.085109881 +0000 | |
@@ -1,3 +1,8 @@ | |
# | |
# | |
name="ipf" parent="pseudo" instance=0; | |
+ | |
+# Increase the state table limits. fr_statemax should be ~70% of fr_statesize, | |
+# and both should be prime numbers | |
+fr_statesize=151007; | |
+fr_statemax=113279; | |
diff -ur illumos-gate/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h illumos-joyent/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h | |
--- illumos-gate/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h 2015-05-26 22:07:54.810795313 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h 2015-05-26 22:08:56.091596242 +0000 | |
@@ -125,6 +125,10 @@ | |
hook_t *ifs_ipfhook6_loop_in; | |
hook_t *ifs_ipfhook6_loop_out; | |
hook_t *ifs_ipfhook6_nicevents; | |
+ hook_t *ifs_ipfhookvndl3v4_in; | |
+ hook_t *ifs_ipfhookvndl3v6_in; | |
+ hook_t *ifs_ipfhookvndl3v4_out; | |
+ hook_t *ifs_ipfhookvndl3v6_out; | |
/* flags to indicate whether hooks are registered. */ | |
boolean_t ifs_hook4_physical_in; | |
@@ -137,10 +141,16 @@ | |
boolean_t ifs_hook6_nic_events; | |
boolean_t ifs_hook6_loopback_in; | |
boolean_t ifs_hook6_loopback_out; | |
+ boolean_t ifs_hookvndl3v4_physical_in; | |
+ boolean_t ifs_hookvndl3v6_physical_in; | |
+ boolean_t ifs_hookvndl3v4_physical_out; | |
+ boolean_t ifs_hookvndl3v6_physical_out; | |
int ifs_ipf_loopback; | |
net_handle_t ifs_ipf_ipv4; | |
net_handle_t ifs_ipf_ipv6; | |
+ net_handle_t ifs_ipf_vndl3v4; | |
+ net_handle_t ifs_ipf_vndl3v6; | |
/* ip_auth.c */ | |
int ifs_fr_authsize; | |
diff -ur illumos-gate/usr/src/uts/common/inet/ipf/solaris.c illumos-joyent/usr/src/uts/common/inet/ipf/solaris.c | |
--- illumos-gate/usr/src/uts/common/inet/ipf/solaris.c 2015-05-26 22:07:54.811581187 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/ipf/solaris.c 2015-05-26 22:08:56.092453707 +0000 | |
@@ -625,7 +625,6 @@ | |
/* | |
* Destroy things for ipf for one stack. | |
*/ | |
-/* ARGSUSED */ | |
static void | |
ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs) | |
{ | |
Only in illumos-joyent/usr/src/uts/common/inet/sockmods: datafilt.c | |
diff -ur illumos-gate/usr/src/uts/common/inet/sockmods/socksctp.c illumos-joyent/usr/src/uts/common/inet/sockmods/socksctp.c | |
--- illumos-gate/usr/src/uts/common/inet/sockmods/socksctp.c 2015-05-26 22:07:54.836946288 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/sockmods/socksctp.c 2015-05-26 22:08:56.122541817 +0000 | |
@@ -21,6 +21,7 @@ | |
/* | |
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | |
+ * Copyright (c) 2015 Joyent, Inc. All rights reserved. | |
*/ | |
#include <sys/types.h> | |
@@ -403,7 +404,6 @@ | |
} | |
if (name == NULL || namelen == 0) { | |
- mutex_exit(&so->so_lock); | |
error = EINVAL; | |
eprintsoline(so, error); | |
goto done; | |
diff -ur illumos-gate/usr/src/uts/common/inet/squeue.c illumos-joyent/usr/src/uts/common/inet/squeue.c | |
--- illumos-gate/usr/src/uts/common/inet/squeue.c 2015-05-26 22:07:54.838608054 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/squeue.c 2015-05-26 22:08:56.124547236 +0000 | |
@@ -23,7 +23,7 @@ | |
*/ | |
/* | |
- * Copyright 2012 Joyent, Inc. All rights reserved. | |
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved. | |
*/ | |
/* | |
@@ -61,6 +61,10 @@ | |
* connection are processed on that squeue. The connection ("conn") to | |
* squeue mapping is stored in "conn_t" member "conn_sqp". | |
* | |
+ * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is | |
+ * false and it will not have an associated conn_t, which means many aspects of | |
+ * the system, such as polling and switching squeues, will not be used. | |
+ * | |
* Since the processing of the connection cuts across multiple layers | |
* but still allows packets for different connnection to be processed on | |
* other CPU/squeues, squeues are also termed as "Vertical Perimeter" or | |
@@ -244,7 +248,7 @@ | |
/* ARGSUSED */ | |
squeue_t * | |
-squeue_create(clock_t wait, pri_t pri) | |
+squeue_create(clock_t wait, pri_t pri, boolean_t isip) | |
{ | |
squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP); | |
@@ -260,11 +264,36 @@ | |
sqp->sq_enter = squeue_enter; | |
sqp->sq_drain = squeue_drain; | |
+ sqp->sq_isip = isip; | |
return (sqp); | |
} | |
/* | |
+ * We need to kill the threads and then clean up. We should VERIFY that | |
+ * polling is disabled so we don't have to worry about disassociating from | |
+ * MAC/IP/etc. | |
+ */ | |
+void | |
+squeue_destroy(squeue_t *sqp) | |
+{ | |
+ kt_did_t worker, poll; | |
+ mutex_enter(&sqp->sq_lock); | |
+ VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED | | |
+ SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT))); | |
+ worker = sqp->sq_worker->t_did; | |
+ poll = sqp->sq_poll_thr->t_did; | |
+ sqp->sq_state |= SQS_EXIT; | |
+ cv_signal(&sqp->sq_poll_cv); | |
+ cv_signal(&sqp->sq_worker_cv); | |
+ mutex_exit(&sqp->sq_lock); | |
+ | |
+ thread_join(poll); | |
+ thread_join(worker); | |
+ kmem_cache_free(squeue_cache, sqp); | |
+} | |
+ | |
+/* | |
* Bind squeue worker thread to the specified CPU, given by CPU id. | |
* If the CPU id value is -1, bind the worker thread to the value | |
* specified in sq_bind field. If a thread is already bound to a | |
@@ -475,18 +504,21 @@ | |
* Handle squeue switching. More details in the | |
* block comment at the top of the file | |
*/ | |
- if (connp->conn_sqp == sqp) { | |
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { | |
SQUEUE_DBG_SET(sqp, mp, proc, connp, | |
tag); | |
- connp->conn_on_sqp = B_TRUE; | |
+ if (sqp->sq_isip == B_TRUE) | |
+ connp->conn_on_sqp = B_TRUE; | |
DTRACE_PROBE3(squeue__proc__start, squeue_t *, | |
sqp, mblk_t *, mp, conn_t *, connp); | |
(*proc)(connp, mp, sqp, ira); | |
DTRACE_PROBE2(squeue__proc__end, squeue_t *, | |
sqp, conn_t *, connp); | |
- connp->conn_on_sqp = B_FALSE; | |
+ if (sqp->sq_isip == B_TRUE) { | |
+ connp->conn_on_sqp = B_FALSE; | |
+ CONN_DEC_REF(connp); | |
+ } | |
SQUEUE_DBG_CLEAR(sqp); | |
- CONN_DEC_REF(connp); | |
} else { | |
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, | |
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); | |
@@ -513,7 +545,7 @@ | |
return; | |
} | |
} else { | |
- if (ira != NULL) { | |
+ if (sqp->sq_isip == B_TRUE && ira != NULL) { | |
mblk_t *attrmp; | |
ASSERT(cnt == 1); | |
@@ -587,7 +619,8 @@ | |
if (!(sqp->sq_state & SQS_REENTER) && | |
(process_flag != SQ_FILL) && (sqp->sq_first == NULL) && | |
(sqp->sq_run == curthread) && (cnt == 1) && | |
- (connp->conn_on_sqp == B_FALSE)) { | |
+ (sqp->sq_isip == B_FALSE || | |
+ connp->conn_on_sqp == B_FALSE)) { | |
sqp->sq_state |= SQS_REENTER; | |
mutex_exit(&sqp->sq_lock); | |
@@ -602,15 +635,21 @@ | |
* Handle squeue switching. More details in the | |
* block comment at the top of the file | |
*/ | |
- if (connp->conn_sqp == sqp) { | |
- connp->conn_on_sqp = B_TRUE; | |
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { | |
+ SQUEUE_DBG_SET(sqp, mp, proc, connp, | |
+ tag); | |
+ if (sqp->sq_isip == B_TRUE) | |
+ connp->conn_on_sqp = B_TRUE; | |
DTRACE_PROBE3(squeue__proc__start, squeue_t *, | |
sqp, mblk_t *, mp, conn_t *, connp); | |
(*proc)(connp, mp, sqp, ira); | |
DTRACE_PROBE2(squeue__proc__end, squeue_t *, | |
sqp, conn_t *, connp); | |
- connp->conn_on_sqp = B_FALSE; | |
- CONN_DEC_REF(connp); | |
+ if (sqp->sq_isip == B_TRUE) { | |
+ connp->conn_on_sqp = B_FALSE; | |
+ CONN_DEC_REF(connp); | |
+ } | |
+ SQUEUE_DBG_CLEAR(sqp); | |
} else { | |
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, | |
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); | |
@@ -631,7 +670,7 @@ | |
#ifdef DEBUG | |
mp->b_tag = tag; | |
#endif | |
- if (ira != NULL) { | |
+ if (sqp->sq_isip && ira != NULL) { | |
mblk_t *attrmp; | |
ASSERT(cnt == 1); | |
@@ -779,7 +818,7 @@ | |
mp->b_prev = NULL; | |
/* Is there an ip_recv_attr_t to handle? */ | |
- if (ip_recv_attr_is_mblk(mp)) { | |
+ if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) { | |
mblk_t *attrmp = mp; | |
ASSERT(attrmp->b_cont != NULL); | |
@@ -804,20 +843,25 @@ | |
/* | |
- * Handle squeue switching. More details in the | |
- * block comment at the top of the file | |
+ * Handle squeue switching. More details in the block comment at | |
+ * the top of the file. non-IP squeues cannot switch, as there | |
+ * is no conn_t. | |
*/ | |
- if (connp->conn_sqp == sqp) { | |
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { | |
SQUEUE_DBG_SET(sqp, mp, proc, connp, | |
mp->b_tag); | |
- connp->conn_on_sqp = B_TRUE; | |
+ if (sqp->sq_isip == B_TRUE) | |
+ connp->conn_on_sqp = B_TRUE; | |
DTRACE_PROBE3(squeue__proc__start, squeue_t *, | |
sqp, mblk_t *, mp, conn_t *, connp); | |
(*proc)(connp, mp, sqp, ira); | |
DTRACE_PROBE2(squeue__proc__end, squeue_t *, | |
sqp, conn_t *, connp); | |
- connp->conn_on_sqp = B_FALSE; | |
- CONN_DEC_REF(connp); | |
+ if (sqp->sq_isip == B_TRUE) { | |
+ connp->conn_on_sqp = B_FALSE; | |
+ CONN_DEC_REF(connp); | |
+ } | |
+ SQUEUE_DBG_CLEAR(sqp); | |
} else { | |
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, | |
SQ_FILL, SQTAG_SQUEUE_CHANGE); | |
@@ -1051,6 +1095,11 @@ | |
cv_wait(async, lock); | |
CALLB_CPR_SAFE_END(&cprinfo, lock); | |
+ if (sqp->sq_state & SQS_EXIT) { | |
+ mutex_exit(lock); | |
+ thread_exit(); | |
+ } | |
+ | |
ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL | | |
SQS_POLL_THR_QUIESCED); | |
if (ctl_state != 0) { | |
@@ -1076,6 +1125,9 @@ | |
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == | |
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); | |
+ /* Only IP related squeues should reach this point */ | |
+ VERIFY(sqp->sq_isip == B_TRUE); | |
+ | |
poll_again: | |
sq_rx_ring = sqp->sq_rx_ring; | |
sq_get_pkts = sq_rx_ring->rr_rx; | |
@@ -1205,6 +1257,7 @@ | |
ill_rx_ring_t *rx_ring; | |
ASSERT(MUTEX_HELD(&sqp->sq_lock)); | |
+ VERIFY(sqp->sq_isip == B_TRUE); | |
if (sqp->sq_state & SQS_POLL_RESTART) { | |
/* Restart implies a previous quiesce. */ | |
@@ -1316,6 +1369,11 @@ | |
for (;;) { | |
for (;;) { | |
+ if (sqp->sq_state & SQS_EXIT) { | |
+ mutex_exit(lock); | |
+ thread_exit(); | |
+ } | |
+ | |
/* | |
* If the poll thread has handed control to us | |
* we need to break out of the wait. | |
@@ -1412,6 +1470,7 @@ | |
again: | |
sqp = connp->conn_sqp; | |
+ VERIFY(sqp->sq_isip == B_TRUE); | |
mutex_enter(&sqp->sq_lock); | |
if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) { | |
@@ -1487,6 +1546,7 @@ | |
squeue_synch_exit(conn_t *connp) | |
{ | |
squeue_t *sqp = connp->conn_sqp; | |
+ VERIFY(sqp->sq_isip == B_TRUE); | |
mutex_enter(&sqp->sq_lock); | |
if (sqp->sq_run == curthread) { | |
diff -ur illumos-gate/usr/src/uts/common/inet/tcp_impl.h illumos-joyent/usr/src/uts/common/inet/tcp_impl.h | |
--- illumos-gate/usr/src/uts/common/inet/tcp_impl.h 2015-05-26 22:07:54.851683245 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/tcp_impl.h 2015-05-26 22:08:56.140696474 +0000 | |
@@ -20,7 +20,7 @@ | |
*/ | |
/* | |
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. | |
- * Copyright (c) 2011, Joyent Inc. All rights reserved. | |
+ * Copyright (c) 2013, Joyent Inc. All rights reserved. | |
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. | |
* Copyright (c) 2013, 2014 by Delphix. All rights reserved. | |
*/ | |
@@ -61,9 +61,9 @@ | |
* by setting it to 0. | |
*/ | |
#define TCP_XMIT_LOWATER 4096 | |
-#define TCP_XMIT_HIWATER 49152 | |
+#define TCP_XMIT_HIWATER 128000 | |
#define TCP_RECV_LOWATER 2048 | |
-#define TCP_RECV_HIWATER 128000 | |
+#define TCP_RECV_HIWATER 1048576 | |
/* | |
* Bind hash list size and has function. It has to be a power of 2 for | |
diff -ur illumos-gate/usr/src/uts/common/inet/udp/udp_opt_data.c illumos-joyent/usr/src/uts/common/inet/udp/udp_opt_data.c | |
--- illumos-gate/usr/src/uts/common/inet/udp/udp_opt_data.c 2015-05-26 22:07:54.855570156 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/udp/udp_opt_data.c 2015-05-26 22:08:56.145346851 +0000 | |
@@ -21,6 +21,7 @@ | |
/* | |
* Copyright 2009 Sun Microsystems, Inc. All rights reserved. | |
* Use is subject to license terms. | |
+ * Copyright 2015, Joyent, Inc. | |
*/ | |
#include <sys/types.h> | |
@@ -292,6 +293,7 @@ | |
}, | |
{ UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int), | |
0 }, | |
+{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 } | |
}; | |
/* | |
diff -ur illumos-gate/usr/src/uts/common/inet/udp/udp.c illumos-joyent/usr/src/uts/common/inet/udp/udp.c | |
--- illumos-gate/usr/src/uts/common/inet/udp/udp.c 2015-05-26 22:07:54.855029314 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/udp/udp.c 2015-05-26 22:08:56.144824894 +0000 | |
@@ -22,6 +22,7 @@ | |
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. | |
* Copyright 2013 Nexenta Systems, Inc. All rights reserved. | |
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. | |
+ * Copyright 2015, Joyent, Inc. | |
*/ | |
/* Copyright (c) 1990 Mentat Inc. */ | |
@@ -76,7 +77,8 @@ | |
#include <inet/ipclassifier.h> | |
#include <sys/squeue_impl.h> | |
#include <inet/ipnet.h> | |
-#include <sys/ethernet.h> | |
+#include <sys/vxlan.h> | |
+#include <inet/inet_hash.h> | |
#include <sys/tsol/label.h> | |
#include <sys/tsol/tnet.h> | |
@@ -346,6 +348,89 @@ | |
typedef union T_primitives *t_primp_t; | |
/* | |
+ * Various protocols that encapsulate UDP have no real use for the source port. | |
+ * Instead, they want to vary the source port to provide better equal-cost | |
+ * multipathing and other systems that use fanout. Consider something like | |
+ * VXLAN. If you're actually sending multiple different streams to a single | |
+ * host, if you don't vary the source port, then the tuple of ( SRC IP, DST IP, | |
+ * SRC Port, DST Port) will always be the same. | |
+ * | |
+ * Here, we return a port to hash this to, if we know how to hash it. If for | |
+ * some reason we can't perform an L4 hash, then we just return the default | |
+ * value, usually the default port. After we determine the hash we transform it | |
+ * so that it's in the range of [ min, max ]. | |
+ * | |
+ * We'd like to avoid a pull up for the sake of performing the hash. If the | |
+ * first mblk_t doesn't have the full protocol header, then we just send it to | |
+ * the default. If for some reason we have an encapsulated packet that has its | |
+ * protocol header in different parts of an mblk_t, then we'll go with the | |
+ * default port. This means that if a driver isn't consistent about how it | |
+ * generates the frames for a given flow, it will not always be consistently | |
+ * hashed. That should be an uncommon event. | |
+ */ | |
+uint16_t | |
+udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max, | |
+ uint16_t def) | |
+{ | |
+ size_t szused = 0; | |
+ struct ether_header *ether; | |
+ struct ether_vlan_header *vether; | |
+ ip6_t *ip6h; | |
+ ipha_t *ipha; | |
+ uint16_t sap; | |
+ uint64_t hash; | |
+ uint32_t mod; | |
+ | |
+ ASSERT(min <= max); | |
+ | |
+ if (type != UDP_HASH_VXLAN) | |
+ return (def); | |
+ | |
+ if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))) | |
+ return (def); | |
+ | |
+ /* | |
+ * The following logic is VXLAN-specific to get at the header; if we | |
+ * have other formats, e.g. GENEVE, then we should ignore this. | |
+ * | |
+ * The kernel overlay device often puts a first mblk_t for the data | |
+ * which is just the encap. If so, then we're going to use that and try | |
+ * to avoid a pull up. | |
+ */ | |
+ if (MBLKL(mp) == VXLAN_HDR_LEN) { | |
+ if (mp->b_cont == NULL) | |
+ return (def); | |
+ mp = mp->b_cont; | |
+ ether = (struct ether_header *)mp->b_rptr; | |
+ } else if (MBLKL(mp) < VXLAN_HDR_LEN) { | |
+ return (def); | |
+ } else { | |
+ szused = VXLAN_HDR_LEN; | |
+ ether = (struct ether_header *)((uintptr_t)mp->b_rptr + szused); | |
+ } | |
+ | |
+ /* Can we hold a MAC header? */ | |
+ if (MBLKL(mp) + szused < sizeof (struct ether_header)) | |
+ return (def); | |
+ | |
+ /* | |
+ * We need to lie about the starting offset into the message block for | |
+ * convenience. Undo it at the end. We know that inet_pkt_hash() won't | |
+ * modify the mblk_t. | |
+ */ | |
+ mp->b_rptr += szused; | |
+ hash = inet_pkt_hash(DL_ETHER, mp, INET_PKT_HASH_L2 | | |
+ INET_PKT_HASH_L3 | INET_PKT_HASH_L4); | |
+ mp->b_rptr -= szused; | |
+ | |
+ if (hash == 0) | |
+ return (def); | |
+ | |
+ mod = max - min + 1; | |
+ return ((hash % mod) + min); | |
+} | |
+ | |
+/* | |
* Return the next anonymous port in the privileged port range for | |
* bind checking. | |
* | |
@@ -1583,6 +1668,11 @@ | |
*i1 = udp->udp_rcvhdr ? 1 : 0; | |
mutex_exit(&connp->conn_lock); | |
return (sizeof (int)); | |
+ case UDP_SRCPORT_HASH: | |
+ mutex_enter(&connp->conn_lock); | |
+ *i1 = udp->udp_vxlanhash; | |
+ mutex_exit(&connp->conn_lock); | |
+ return (sizeof (int)); | |
} | |
} | |
mutex_enter(&connp->conn_lock); | |
@@ -1718,6 +1808,26 @@ | |
udp->udp_rcvhdr = onoff; | |
mutex_exit(&connp->conn_lock); | |
return (0); | |
+ case UDP_SRCPORT_HASH: | |
+ /* | |
+ * This should have already been verified, but double | |
+ * check. | |
+ */ | |
+ if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { | |
+ return (error); | |
+ } | |
+ | |
+ /* First see if the val is something we understand */ | |
+ if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN) | |
+ return (EINVAL); | |
+ | |
+ if (!checkonly) { | |
+ mutex_enter(&connp->conn_lock); | |
+ udp->udp_vxlanhash = *i1; | |
+ mutex_exit(&connp->conn_lock); | |
+ } | |
+ /* Fully handled this option. */ | |
+ return (0); | |
} | |
break; | |
} | |
@@ -2001,13 +2111,25 @@ | |
uint32_t cksum; | |
udp_t *udp = connp->conn_udp; | |
boolean_t insert_spi = udp->udp_nat_t_endpoint; | |
+ boolean_t hash_srcport = udp->udp_vxlanhash; | |
uint_t ulp_hdr_len; | |
+ uint16_t srcport; | |
data_len = msgdsize(data_mp); | |
ulp_hdr_len = UDPH_SIZE; | |
if (insert_spi) | |
ulp_hdr_len += sizeof (uint32_t); | |
+ /* | |
+ * If we have source port hashing going on, determine the hash before | |
+ * we modify the mblk_t. | |
+ */ | |
+ if (hash_srcport == B_TRUE) { | |
+ srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, | |
+ IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, | |
+ ntohs(connp->conn_lport)); | |
+ } | |
+ | |
mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo, | |
ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp); | |
if (mp == NULL) { | |
@@ -2019,7 +2141,11 @@ | |
ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; | |
udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length); | |
- udpha->uha_src_port = connp->conn_lport; | |
+ if (hash_srcport == B_TRUE) { | |
+ udpha->uha_src_port = htons(srcport); | |
+ } else { | |
+ udpha->uha_src_port = connp->conn_lport; | |
+ } | |
udpha->uha_dst_port = dstport; | |
udpha->uha_checksum = 0; | |
udpha->uha_length = htons(data_len); | |
@@ -3194,6 +3320,7 @@ | |
udp_t *udp = connp->conn_udp; | |
udp_stack_t *us = udp->udp_us; | |
boolean_t insert_spi = udp->udp_nat_t_endpoint; | |
+ boolean_t hash_srcport = udp->udp_vxlanhash; | |
uint_t pktlen; | |
uint_t alloclen; | |
uint_t copylen; | |
@@ -3202,10 +3329,21 @@ | |
udpha_t *udpha; | |
uint32_t cksum; | |
ip_pkt_t *ipp; | |
+ uint16_t srcport; | |
ASSERT(MUTEX_HELD(&connp->conn_lock)); | |
/* | |
+ * If we have source port hashing going on, determine the hash before | |
+ * we modify the mblk_t. | |
+ */ | |
+ if (hash_srcport == B_TRUE) { | |
+ srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, | |
+ IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, | |
+ ntohs(connp->conn_lport)); | |
+ } | |
+ | |
+ /* | |
* Copy the header template and leave space for an SPI | |
*/ | |
copylen = connp->conn_ht_iphc_len; | |
@@ -3303,6 +3441,9 @@ | |
*((uint32_t *)(udpha + 1)) = 0; | |
udpha->uha_dst_port = dstport; | |
+ if (hash_srcport == B_TRUE) | |
+ udpha->uha_src_port = htons(srcport); | |
+ | |
return (mp); | |
} | |
diff -ur illumos-gate/usr/src/uts/common/inet/udp_impl.h illumos-joyent/usr/src/uts/common/inet/udp_impl.h | |
--- illumos-gate/usr/src/uts/common/inet/udp_impl.h 2015-05-26 22:07:54.856236567 +0000 | |
+++ illumos-joyent/usr/src/uts/common/inet/udp_impl.h 2015-05-26 22:08:56.146163362 +0000 | |
@@ -20,6 +20,7 @@ | |
*/ | |
/* | |
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. | |
+ * Copyright 2015, Joyent, Inc. | |
*/ | |
#ifndef _UDP_IMPL_H | |
@@ -178,8 +179,11 @@ | |
udp_issocket : 1, /* socket mode; sockfs is on top */ | |
udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ | |
udp_rcvhdr : 1, /* UDP_RCVHDR option */ | |
+ udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */ | |
+ /* Because there's only VXLAN, cheat */ | |
+ /* and only use a single bit */ | |
- udp_pad_to_bit_31 : 29; | |
+ udp_pad_to_bit_31 : 28; | |
/* Following 2 fields protected by the uf_lock */ | |
struct udp_s *udp_bind_hash; /* Bind hash chain */ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment