Skip to content

Instantly share code, notes, and snippets.

@userid
Created September 17, 2016 15:03
Show Gist options
  • Select an option

  • Save userid/49737fb237dac1026d13932fcb218e9b to your computer and use it in GitHub Desktop.

Select an option

Save userid/49737fb237dac1026d13932fcb218e9b to your computer and use it in GitHub Desktop.
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 89bcfe8..9298207 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -533,8 +533,10 @@ struct sk_buff {
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
+ *
+ * Increased the CB to hold pointer to an FEC structure.
*/
- char cb[48] __aligned(8);
+ char cb[56] __aligned(8);
unsigned long _skb_refdst;
#ifdef CONFIG_XFRM
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 4345d49..ccc0e91 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -79,6 +79,24 @@ struct tcp_sack_block {
#define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/
#define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/
+/* Flags transmitted in the first FEC option byte after magic bytes
+ * (except if option is used for negotiation) */
+#define TCP_FEC_RECOVERY_CWR 0x80 /* Recovery triggered CWR */
+#define TCP_FEC_RECOVERY_SUCCESSFUL 0x40 /* Local recovery done */
+#define TCP_FEC_RECOVERY_FAILED 0x20 /* Local recovery failed */
+#define TCP_FEC_ENCODED 0x10 /* Packet is FEC-encoded */
+
+/* Parsed / to-be-encoded contents of one TCP FEC option */
+struct tcp_fec {
+ u8 type; /* Requested FEC type (negotiation only,
+ * see net/tcp_fec.h for type defs) */
+ u32 enc_seq; /* Sequence number of first encoded byte */
+ u32 enc_len; /* Encoding length in sequence space
+ * (carried as 24 bits in the option) */
+ u32 lost_seq; /* Sequence number of first lost byte */
+ u32 lost_len; /* Loss length in sequence space
+ * (carried as 24 bits in the option) */
+ u8 flags; /* See flag definitions above */
+ bool saw_fec; /* FEC option was retrieved from packet */
+};
+
struct tcp_options_received {
/* PAWS/RTTM data */
long ts_recent_stamp;/* Time we stored ts_recent (for aging) */
@@ -95,12 +113,14 @@ struct tcp_options_received {
u8 num_sacks; /* Number of SACK blocks */
u16 user_mss; /* mss requested by user in ioctl */
u16 mss_clamp; /* Maximal mss, negotiated at connection setup */
+ struct tcp_fec fec; /* FEC-related parameters */
};
static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
{
rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
+ /* Reset all FEC option state; size follows the member's own type */
+ memset(&rx_opt->fec, 0, sizeof(rx_opt->fec));
}
/* This is the max number of SACKS that we'll generate and process. It's safe
@@ -327,6 +347,24 @@ struct tcp_sock {
* socket. Used to retransmit SYNACKs etc.
*/
struct request_sock *fastopen_rsk;
+
+/* TCP FEC parameters
+ * type - negotiated FEC type to be used
+ * next_seq - next sequence which was not FEC-encoded before
+ * lost_len - bytes after rcv_nxt considered lost
+ * flags - see TCP_FEC_* flag definitions above
+ * bytes_rcv_queue - number of bytes stored in queued SKBs
+ * rcv_queue - copies from the socket's receive queue kept for
+ * FEC recovery
+ */
+ struct {
+ u8 type;
+ u32 next_seq;
+ u32 lost_len;
+ u8 flags;
+ u32 bytes_rcv_queue;
+ struct sk_buff_head rcv_queue;
+ } fec;
};
enum tsq_flags {
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 06d0d0f..063aa59 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -138,6 +138,7 @@ struct inet_connection_sock {
#define ICSK_TIME_PROBE0 3 /* Zero window probe timer */
#define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */
#define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */
+#define ICSK_TIME_FEC 6 /* FEC delayed send timer */
static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
{
@@ -228,7 +229,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
}
if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
- what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) {
+ what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE ||
+ what == ICSK_TIME_FEC) {
icsk->icsk_pending = what;
icsk->icsk_timeout = jiffies + when;
sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
diff --git a/include/net/request_sock.h b/include/net/request_sock.h
index 610fa9e..1c1b4ba 100644
--- a/include/net/request_sock.h
+++ b/include/net/request_sock.h
@@ -63,6 +63,8 @@ struct request_sock {
struct sock *sk;
u32 secid;
u32 peer_secid;
+ u8 fec_type; /* Encoding type (see
+ * net/tcp_fec.h) */
};
static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index d59f206..f894889 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -182,6 +182,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
* experimental options. See draft-ietf-tcpm-experimental-options-00.txt
*/
#define TCPOPT_FASTOPEN_MAGIC 0xF989
+#define TCPOPT_FEC_MAGIC 0xDC60
/*
* TCP option lengths
@@ -195,6 +196,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCPOLEN_FASTOPEN_BASE 2
#define TCPOLEN_EXP_FASTOPEN_BASE 4
+/*
+ * !!! TCP FEC patch !!!
+ */
+#define TCPOLEN_EXP_FEC_BASE 4
+
/* But this is what stacks really send out. */
#define TCPOLEN_TSTAMP_ALIGNED 12
#define TCPOLEN_WSCALE_ALIGNED 4
@@ -204,6 +210,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TCPOLEN_SACK_PERBLOCK 8
#define TCPOLEN_MD5SIG_ALIGNED 20
#define TCPOLEN_MSS_ALIGNED 4
+#define TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED 8
/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
@@ -230,6 +237,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
#define TFO_SERVER_WO_SOCKOPT1 0x400
#define TFO_SERVER_WO_SOCKOPT2 0x800
+/*
+ * !!! TCP FEC patch !!!
+ */
+
+/* Maximum number of in-order bytes kept in the receiver's buffer for FEC
+ * recoveries. The sender will never send more than this in a single FEC
+ * packet. */
+#define FEC_RCV_QUEUE_LIMIT 16000
+
extern struct inet_timewait_death_row tcp_death_row;
/* sysctl variables for tcp */
@@ -274,6 +290,12 @@ extern int sysctl_tcp_thin_dupack;
extern int sysctl_tcp_early_retrans;
extern int sysctl_tcp_limit_output_bytes;
extern int sysctl_tcp_challenge_ack_limit;
+
+/*
+ * !!! TCP FEC patch !!!
+ */
+extern int sysctl_tcp_fec;
+
extern unsigned int sysctl_tcp_notsent_lowat;
extern int sysctl_tcp_min_tso_segs;
extern int sysctl_tcp_autocorking;
@@ -725,6 +747,7 @@ struct tcp_skb_cb {
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
/* 1 byte hole */
__u32 ack_seq; /* Sequence number ACK'd */
+ struct tcp_fec *fec; /* FEC parameters */
};
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
@@ -1131,6 +1154,11 @@ static inline void tcp_openreq_init(struct request_sock *req,
ireq->ecn_ok = 0;
ireq->ir_rmt_port = tcp_hdr(skb)->source;
ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
+
+ /*
+ * !!! TCP FEC patch !!!
+ */
+ req->fec_type = rx_opt->fec.type;
}
extern void tcp_openreq_init_rwin(struct request_sock *req,
diff --git a/include/net/tcp_fec.h b/include/net/tcp_fec.h
new file mode 100644
index 0000000..38f2c40
--- /dev/null
+++ b/include/net/tcp_fec.h
@@ -0,0 +1,106 @@
+#ifndef _TCP_FEC_H
+#define _TCP_FEC_H
+
+#include <net/tcp.h>
+#include <asm/unaligned.h>
+
+/* FEC-encoding types (8 bits, internal) */
+#define TCP_FEC_TYPE_NONE 0 /* FEC disabled */
+#define TCP_FEC_TYPE_XOR_ALL 1 /* XOR every MSS length segment */
+#define TCP_FEC_TYPE_XOR_SKIP_1 2 /* XOR every other MSS length
+ * segment */
+
+#define TCP_FEC_NUM_TYPES 3
+
+/* Delay transmission of FEC packets (delay defined in tcp_fec_arm_timer()) */
+#define TCP_FEC_DELAYED_SEND 1
+
+/* Tells whether an FEC type other than "none" is set on the socket.
+ * The enabled case is hinted to the compiler as the unlikely one.
+ */
+static inline bool tcp_fec_is_enabled(const struct tcp_sock *tp)
+{
+	return unlikely(tp->fec.type != TCP_FEC_TYPE_NONE);
+}
+
+/* True when an FEC option was seen on the packet whose options are
+ * currently stored in rx_opt and that option carries the ENCODED flag.
+ */
+static inline bool tcp_fec_is_encoded(const struct tcp_sock *tp)
+{
+	return unlikely(tp->rx_opt.fec.saw_fec &&
+			(tp->rx_opt.fec.flags & TCP_FEC_ENCODED));
+}
+
+/*
+ * Decodes FEC parameters and stores them in the FEC struct
+ * @seq - sequence number of the packet
+ * @ack_seq - ACKed sequence number
+ * @is_syn - true, if option was attached to a packet with a SYN flag
+ * @ptr - points to the first byte of the FEC option after kind, length,
+ * and possible magic bytes
+ * @len - option length (without kind, length, magic bytes)
+ */
+int tcp_fec_decode_option(struct tcp_fec *fec, u32 seq, u32 ack_seq,
+ bool is_syn, const unsigned char *ptr,
+ unsigned int len);
+
+/*
+ * Encodes FEC parameters to wire format, including the option kind, length
+ * and magic bytes. The pointer points to the first byte of the option and
+ * is moved to the first unoccupied byte. Returns the encoded length.
+ */
+int tcp_fec_encode_option(struct tcp_sock *tp, struct tcp_fec *fec,
+ __be32 **ptr);
+
+/*
+ * Processes the current packet in the buffer (treated as FEC packet)
+ */
+int tcp_fec_process(struct sock *sk, struct sk_buff *skb);
+
+/*
+ * Checks the received options for loss indicators and acts upon them.
+ * In particular, the function handles window reduction requests and processes
+ * tail loss indicators.
+ * Returns: 1, if the ACK carries a loss indicator - 0, otherwise
+ */
+int tcp_fec_check_ack(struct sock *sk, u32 ack_seq);
+
+/*
+ * Since data in the socket's receive queue can get consumed by other parties
+ * we need to keep extra references to these SKBs until they are no longer
+ * required for possible future recoveries.
+ * @skb - buffer which is moved to the receive queue
+ */
+int tcp_fec_update_queue(struct sock *sk, struct sk_buff *skb);
+
+/*
+ * Disables FEC for this connection (includes clearing references
+ * to buffers in receive queue)
+ */
+void tcp_fec_disable(struct sock *sk);
+
+/* Arms the timer for a delayed FEC transmission if there is
+ * no earlier timeout defined (i.e. retransmission timeout)
+ */
+void tcp_fec_arm_timer(struct sock *sk);
+
+/* The FEC timer fired. Force an FEC transmission for the
+ * last unencoded burst. Rearm the RTO timer (which was switched
+ * out when setting the FEC timer). Set a new FEC timer if there
+ * is pending unencoded data.
+ */
+void tcp_fec_timer(struct sock *sk);
+
+/* If FEC packets transmissions are delayed set a timer
+ * (if not already set), otherwise invoke the FEC mechanism
+ * immediately
+ */
+int tcp_fec_invoke(struct sock *sk);
+
+/* Invoke the FEC mechanism set for the connection;
+ * Create and sends out FEC packets
+ */
+int tcp_fec_invoke_nodelay(struct sock *sk);
+
+#endif
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 3b97183..d679733 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -113,6 +113,11 @@ enum {
#define TCP_TIMESTAMP 24
#define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */
+/*
+ * !!! TCP FEC patch !!!
+ */
+#define TCP_FEC 79 /* Forward error correction */
+
struct tcp_repair_opt {
__u32 opt_code;
__u32 opt_val;
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 518c04e..6aa32ca 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -6,7 +6,7 @@ obj-y := route.o inetpeer.o protocol.o \
ip_input.o ip_fragment.o ip_forward.o ip_options.o \
ip_output.o ip_sockglue.o inet_hashtables.o \
inet_timewait_sock.o inet_connection_sock.o \
- tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
+ tcp.o tcp_fec.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 0d438fb..9cfa3d3 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -183,7 +183,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE ||
+ icsk->icsk_pending == ICSK_TIME_FEC) {
r->idiag_timer = 1;
r->idiag_retrans = icsk->icsk_retransmits;
r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index eeb17b3..9c58530 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -28,6 +28,7 @@
static int zero;
static int one = 1;
+static int two = 2;
static int four = 4;
static int gso_max_segs = GSO_MAX_SEGS;
static int tcp_retr1_max = 255;
@@ -810,6 +811,15 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = &one
},
+ {
+ .procname = "tcp_fec",
+ .data = &sysctl_tcp_fec,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax, /* plain proc_dointvec ignores extra1/extra2 */
+ .extra1 = &zero,
+ .extra2 = &two,
+ },
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b8ff562..1a2dab5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -276,6 +276,8 @@
#include <net/ip.h>
#include <net/sock.h>
+#include <net/tcp_fec.h>
+
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <net/busy_poll.h>
@@ -2565,6 +2567,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
else
tp->tsoffset = val - tcp_time_stamp;
break;
+ case TCP_FEC:
+ if (sysctl_tcp_fec && val >= 0 && val < TCP_FEC_NUM_TYPES)
+ tp->fec.type = val;
+ else
+ err = -EINVAL;
+ break;
case TCP_NOTSENT_LOWAT:
tp->notsent_lowat = val;
sk->sk_write_space(sk);
@@ -2792,6 +2800,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
case TCP_TIMESTAMP:
val = tcp_time_stamp + tp->tsoffset;
break;
+ case TCP_FEC:
+ val = tp->fec.type;
+ break;
case TCP_NOTSENT_LOWAT:
val = tp->notsent_lowat;
break;
diff --git a/net/ipv4/tcp_fec.c b/net/ipv4/tcp_fec.c
new file mode 100644
index 0000000..53711cb
--- /dev/null
+++ b/net/ipv4/tcp_fec.c
@@ -0,0 +1,1253 @@
+#include <net/tcp_fec.h>
+
+/* Codes for incoming FEC packet processing */
+#define FEC_NO_LOSS 1
+#define FEC_LOSS_UNRECOVERED 2
+#define FEC_LOSS_RECOVERED 3
+
+/* Receiver routines */
+static int tcp_fec_process_xor(struct sock *sk, const struct sk_buff *skb,
+ unsigned int block_skip);
+static int tcp_fec_recover(struct sock *sk, const struct sk_buff *skb,
+ unsigned char *data, u32 seq, int len);
+static void tcp_fec_send_ack(struct sock *sk, const struct sk_buff *skb,
+ int recovery_status);
+static void tcp_fec_reduce_window(struct sock *sk);
+static void tcp_fec_mark_skbs_lost(struct sock *sk);
+static bool tcp_fec_update_decoded_option(struct sk_buff *skb);
+static struct sk_buff *tcp_fec_make_decoded_pkt(struct sock *sk,
+ const struct sk_buff *skb, unsigned char *dec_data,
+ u32 seq, unsigned int len);
+
+/* Sender routines */
+static int tcp_fec_create(struct sock *sk, struct sk_buff_head *list);
+static int tcp_fec_create_xor(struct sock *sk, struct sk_buff_head *list,
+ unsigned int first_seq, unsigned int block_len,
+ unsigned int block_skip,
+ unsigned int max_encoded_per_pkt);
+static struct sk_buff *tcp_fec_make_encoded_pkt(struct sock *sk,
+ struct tcp_fec *fec, unsigned char *enc_data,
+ u32 seq);
+static int tcp_fec_xmit_all(struct sock *sk, struct sk_buff_head *list);
+static int tcp_fec_xmit(struct sock *sk, struct sk_buff *skb);
+
+/* Buffer access routine */
+static unsigned int tcp_fec_get_next_block(struct sock *sk,
+ struct sk_buff **skb, struct sk_buff_head *queue,
+ u32 seq, unsigned int block_len,
+ unsigned char *block);
+
+/* Have to define this signature here since the actual function was static
+ * and tcp_output.c has no corresponding header file
+ */
+extern int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ gfp_t gfp_mask);
+
+/* Parses the value part of a received FEC option into @fec.
+ * @fec     - destination; flags and saw_fec are re-initialised on every call
+ * @seq     - sequence number of the packet carrying the option
+ * @ack_seq - sequence number ACKed by that packet
+ * @is_syn  - true, if the packet has the SYN flag set
+ * @ptr     - first byte of the option after kind, length and magic bytes
+ * @len     - number of bytes at @ptr (1 = short form, 4 = long form)
+ * Returns 0 on success, -EINVAL for an unknown length or a long option
+ * that carries neither the ENCODED nor the RECOVERY_FAILED flag.
+ */
+int tcp_fec_decode_option(struct tcp_fec *fec, u32 seq, u32 ack_seq,
+			  bool is_syn, const unsigned char *ptr,
+			  unsigned int len)
+{
+	u32 word;
+
+	/* These two are per-packet state and are set for EVERY option seen */
+	fec->flags = 0;
+	fec->saw_fec = 1;
+
+	switch (len) {
+	case 1:
+		/* Short form: FEC type during negotiation, flags otherwise */
+		if (is_syn)
+			fec->type = *ptr;
+		else
+			fec->flags = *ptr;
+		return 0;
+
+	case 4:
+		/* Long form: <flags:8, length:24> */
+		word = get_unaligned_be32(ptr);
+		fec->flags = word >> 24;
+
+		if (fec->flags & TCP_FEC_ENCODED) {
+			/* Encoded range starts at the packet's sequence */
+			fec->enc_seq = seq;
+			fec->enc_len = word & 0xFFFFFF;
+			return 0;
+		}
+
+		if (fec->flags & TCP_FEC_RECOVERY_FAILED) {
+			/* Lost range starts at the ACKed sequence */
+			fec->lost_seq = ack_seq;
+			fec->lost_len = word & 0xFFFFFF;
+			return 0;
+		}
+
+		return -EINVAL;
+
+	default:
+		/* Invalid option length */
+		return -EINVAL;
+	}
+}
+
+/* Encodes the FEC option of an outgoing packet in wire format.
+ * @tp  - socket; its pending flags are merged into @fec, after which the
+ *        RECOVERY_CWR and RECOVERY_FAILED flags are cleared on the socket
+ * @fec - option parameters; flags and lost_len are updated in place
+ * @ptr - points to the first byte of the option (the kind byte); advanced
+ *        by two 32-bit words to the next unoccupied, 4-byte aligned byte
+ * Returns the length of the encoded option including alignment, which is
+ * 8 for every form (4 bytes kind/length/magic + 4 bytes value or value
+ * padded with NOPs).
+ */
+int tcp_fec_encode_option(struct tcp_sock *tp, struct tcp_fec *fec,
+			  __be32 **ptr)
+{
+	u32 value;
+	int len;
+
+	fec->flags |= tp->fec.flags;
+	fec->lost_len = tp->fec.lost_len;
+	tp->fec.flags &= ~(TCP_FEC_RECOVERY_CWR | TCP_FEC_RECOVERY_FAILED);
+
+	/* Encode fixed option part (option kind, length, and magic bytes) */
+	if (fec->flags & (TCP_FEC_ENCODED | TCP_FEC_RECOVERY_FAILED))
+		len = 4 + TCPOLEN_EXP_FEC_BASE;	/* Long option */
+	else
+		len = 1 + TCPOLEN_EXP_FEC_BASE;	/* Short option */
+
+	**ptr = htonl((TCPOPT_EXP << 24) | (len << 16) | TCPOPT_FEC_MAGIC);
+	(*ptr)++;
+
+	if ((fec->flags & TCP_FEC_ENCODED) &&
+	    (fec->flags & TCP_FEC_RECOVERY_FAILED)) {
+		/* TODO Special case: need to separate loss indication
+		 * from encoding or make option 12 bytes long
+		 * This can only happen if a node receives and sends FEC
+		 * data
+		 */
+		fec->flags &= ~TCP_FEC_RECOVERY_FAILED;
+	}
+
+	/* The flags/type byte is widened to u32 before shifting so that bit
+	 * 0x80 does not end up in the sign bit of a promoted int. Lengths
+	 * are limited to the 24 bits the decoder reads, so an oversized
+	 * value cannot overwrite the flags byte.
+	 */
+	if (fec->flags & TCP_FEC_ENCODED) {
+		/* FEC-encoded packets carry:
+		 * <Flags:8, Encoding length:24>
+		 */
+		value = ((u32)fec->flags << 24) | (fec->enc_len & 0xFFFFFF);
+	} else if (fec->flags & TCP_FEC_RECOVERY_FAILED) {
+		/* Packets with failed recovery indication carry:
+		 * <Flags:8, Bytes after ACKed seq lost:24>
+		 */
+		value = ((u32)fec->flags << 24) | (fec->lost_len & 0xFFFFFF);
+	} else {
+		/* Negotiation packets carry: <Encoding type:8>
+		 * All other packets carry:  <Flags:8>
+		 * Both are padded with three NOPs.
+		 */
+		u8 first = fec->type ? fec->type : fec->flags;
+
+		value = ((u32)first << 24) |
+			(TCPOPT_NOP << 16) |
+			(TCPOPT_NOP << 8) |
+			TCPOPT_NOP;
+	}
+
+	**ptr = htonl(value);
+	(*ptr)++;
+
+	return 8;
+}
+
+/* Processes the current packet in the buffer, treated as an FEC packet
+ * (assumes that options were already processed into tp->rx_opt.fec)
+ * @sk  - receiving socket
+ * @skb - the FEC packet; it is linearized here if recovery is attempted
+ * Returns: 0, if the packet was handled;
+ *          -1, if the packet is not flagged as encoded or the socket's
+ *          FEC type is TCP_FEC_TYPE_NONE;
+ *          otherwise the error returned by skb_linearize() or by the
+ *          type-specific recovery routine
+ */
+int tcp_fec_process(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	int recovery_status = 0;
+	u32 end_seq;
+	int err;
+
+	/* drop packet if packet is not encoded */
+	if (!(tp->rx_opt.fec.flags & TCP_FEC_ENCODED))
+		return -1;
+
+	/* check if all encoded packets were already received */
+	end_seq = tp->rx_opt.fec.enc_seq + tp->rx_opt.fec.enc_len;
+	if (!after(end_seq, tp->rcv_nxt)) {
+		tcp_fec_send_ack(sk, skb, FEC_NO_LOSS);
+		return 0;
+	}
+
+	/* linearize the SKB (for easier payload access) */
+	err = skb_linearize(skb);
+	if (err)
+		return err;
+
+	/* data recovery */
+	switch (tp->fec.type) {
+	case TCP_FEC_TYPE_NONE:
+		return -1;
+	case TCP_FEC_TYPE_XOR_ALL:
+		recovery_status = tcp_fec_process_xor(sk, skb, 0);
+		break;
+	case TCP_FEC_TYPE_XOR_SKIP_1:
+		recovery_status = tcp_fec_process_xor(sk, skb, 1);
+		break;
+	default:
+		/* Unknown type: nothing is decoded, status stays 0 */
+		break;
+	}
+
+	/* TODO error handling; -ENOMEM, etc. - disable FEC? */
+	if (recovery_status < 0)
+		return recovery_status;
+
+	/* Send an explicit ACK if recovery failed */
+	if (recovery_status == FEC_LOSS_UNRECOVERED)
+		tcp_fec_send_ack(sk, skb, recovery_status);
+
+	return 0;
+}
+
+/* Evaluates the FEC flags received with an ACK and reacts to them:
+ * acknowledged CWR, failed recovery (tail loss) and successful recovery.
+ * @sk      - socket the ACK arrived on
+ * @ack_seq - sequence number ACKed by the packet
+ * Returns: 1, if the ACK contains a loss indicator (recovery failed or
+ *          recovery successful) - 0, otherwise
+ */
+int tcp_fec_check_ack(struct sock *sk, u32 ack_seq)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const u8 peer_flags = tp->rx_opt.fec.flags;
+
+	/* The other node confirmed our recovery indication: stop
+	 * announcing it and drop the ECN CWR demand
+	 */
+	if (peer_flags & TCP_FEC_RECOVERY_CWR) {
+		tp->fec.flags &= ~TCP_FEC_RECOVERY_SUCCESSFUL;
+		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+	}
+
+	/* Tail loss indicator: FEC could not recover the lost data and
+	 * the peer only reported the loss range. Everything in it that is
+	 * not ACKed/SACKed is considered lost now.
+	 */
+	if (peer_flags & TCP_FEC_RECOVERY_FAILED) {
+		tcp_fec_mark_skbs_lost(sk);
+		return 1;
+	}
+
+	if (!(peer_flags & TCP_FEC_RECOVERY_SUCCESSFUL))
+		return 0;
+
+	/* The remote endpoint recovered data, which calls for a window
+	 * reduction - unless the window was already reduced for the
+	 * current loss episode or the previous reduction has not been
+	 * signaled yet (no outgoing packets)
+	 */
+	if (!(tp->fec.flags & TCP_FEC_RECOVERY_CWR) &&
+	    after(ack_seq, tp->high_seq)) {
+		tcp_fec_reduce_window(sk);
+		tp->fec.flags |= TCP_FEC_RECOVERY_CWR;
+	}
+
+	return 1;
+}
+
+/* Since data in the socket's receive queue can get consumed by other parties
+ * we need to clone these SKBs until they are no longer required for possible
+ * future recoveries. This function is called after the TCP header has been
+ * removed from the SKB already. All parameters required for recovery are
+ * stored in the SKB's control buffer.
+ * @sk  - socket owning the FEC receive queue
+ * @skb - buffer which is moved to the receive queue
+ * Returns: 0 on success (including SKBs without payload, which are not
+ *          queued); -ENOMEM if the SKB could not be cloned; the error of
+ *          skb_linearize() if the clone could not be linearized; -EINVAL
+ *          if the byte counter exceeds the limit while the queue is empty
+ */
+int tcp_fec_update_queue(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *cskb;
+	u32 data_len;
+	int extra_bytes, err;
+
+	/* SKBs without payload are never queued, so do not clone them */
+	data_len = skb->len;
+	if (!data_len)
+		return 0;
+
+	/* clone the SKB and add it to the FEC receive queue
+	 * (a simple extra reference to the SKB is not sufficient
+	 * since SKBs can only be queued on one list at a time)
+	 */
+	cskb = skb_clone(skb, GFP_ATOMIC);
+	if (cskb == NULL)
+		return -ENOMEM;
+
+	/* linearize the SKB (for easier payload access) */
+	err = skb_linearize(cskb);
+	if (err) {
+		/* the clone is not queued anywhere yet; free it here */
+		kfree_skb(cskb);
+		return err;
+	}
+
+	skb_queue_tail(&tp->fec.rcv_queue, cskb);
+	tp->fec.bytes_rcv_queue += data_len;
+
+	/* check if we can dereference old SKBs (as long as we have enough
+	 * data for future recoveries)
+	 * NOTE(review): bytes are added as skb->len but removed as
+	 * end_seq - seq; confirm both always agree for queued SKBs
+	 */
+	extra_bytes = tp->fec.bytes_rcv_queue - FEC_RCV_QUEUE_LIMIT;
+	while (extra_bytes > 0) {
+		cskb = skb_peek(&tp->fec.rcv_queue);
+		if (cskb == NULL)
+			return -EINVAL;
+
+		/* keep the oldest SKB if dropping it would leave less than
+		 * FEC_RCV_QUEUE_LIMIT bytes queued
+		 */
+		data_len = TCP_SKB_CB(cskb)->end_seq - TCP_SKB_CB(cskb)->seq;
+		if (data_len > extra_bytes)
+			break;
+
+		extra_bytes -= data_len;
+		tp->fec.bytes_rcv_queue -= data_len;
+		skb_unlink(cskb, &tp->fec.rcv_queue);
+		kfree_skb(cskb);
+	}
+
+	return 0;
+}
+
+/* Turns FEC off for the connection: resets the FEC type and the byte
+ * counter and frees every buffer held in the FEC receive queue.
+ * Does nothing if FEC is not enabled on the socket.
+ */
+void tcp_fec_disable(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (tcp_fec_is_enabled(tp)) {
+		tp->fec.type = TCP_FEC_TYPE_NONE;
+		tp->fec.bytes_rcv_queue = 0;
+		skb_queue_purge(&tp->fec.rcv_queue);
+	}
+}
+
+/* Processes the current packet in the buffer, treated as an FEC packet
+ * with XOR-encoded payload (assumes that options were already processed)
+ * The FEC payload is XORed with the blocks of the encoded range that were
+ * already received; what remains is handed to tcp_fec_recover() as the
+ * missing block.
+ * Returns: negative code, if an error occurred (-ENOMEM if the work buffer
+ * cannot be allocated, or a negative code from tcp_fec_recover());
+ * positive code, otherwise (recovery status: FEC_NO_LOSS,
+ * FEC_LOSS_RECOVERED, FEC_LOSS_UNRECOVERED, or a positive code
+ * passed through from tcp_fec_recover())
+ * @sk - receiving socket
+ * @skb - the FEC packet; its payload is read at skb->data + tcp_hdrlen(skb)
+ * @block_skip - Number of unencoded blocks between two encoded blocks
+ */
+static int tcp_fec_process_xor(struct sock *sk, const struct sk_buff *skb,
+ unsigned int block_skip)
+{
+ struct sk_buff *pskb;
+ struct tcp_sock *tp;
+ struct tcphdr *th;
+ u32 next_seq, end_seq, rec_seq;
+ unsigned char *data, *block;
+ unsigned int i, offset, data_len, block_len, rec_len;
+ bool seen_loss;
+ int ret;
+
+ /* NOTE(review): th and seen_loss are assigned below but never read
+ * in this function
+ */
+ pskb = NULL;
+ tp = tcp_sk(sk);
+ th = tcp_hdr(skb);
+ next_seq = tp->rx_opt.fec.enc_seq;
+ end_seq = next_seq + tp->rx_opt.fec.enc_len;
+ /* the FEC payload length defines the block size used for decoding */
+ block_len = skb->len - tcp_hdrlen(skb);
+ seen_loss = false;
+ offset = 0;
+
+ /* memory allocation for decoding / recovered SKB data:
+ * first half (data) accumulates the XOR result, second half (block)
+ * is scratch space for the block read from a queue
+ */
+ data = kmalloc(2 * block_len, GFP_ATOMIC);
+ if (data == NULL)
+ return -ENOMEM;
+
+ block = data + block_len;
+
+ /* copy FEC payload (skip TCP header) */
+ memcpy(data, skb->data + tcp_hdrlen(skb), block_len);
+
+ /* process in-sequence data (read from the FEC receive queue);
+ * pskb is handed to tcp_fec_get_next_block() by reference -
+ * presumably a cursor into the queue, TODO confirm
+ */
+ while ((data_len = tcp_fec_get_next_block(sk, &pskb,
+ &tp->fec.rcv_queue, next_seq,
+ min(block_len, end_seq - next_seq),
+ block))) {
+ next_seq += data_len;
+
+ /* XOR with existing payload */
+ for (i = 0; i < data_len; i++)
+ data[i] ^= block[i];
+
+ /* we could not read a whole MSS block, which means we
+ * reached the end of the queue or end of range which the
+ * FEC packet covers
+ */
+ if (data_len < block_len)
+ break;
+
+ /* skip unencoded blocks if there is more data encoded
+ * NOTE(review): both operands are u32, so this test is true
+ * whenever next_seq != end_seq
+ */
+ if (end_seq - next_seq > 0)
+ next_seq += block_len * block_skip;
+ }
+
+ /* check if all encoded bytes were already received */
+ if (next_seq == end_seq) {
+ kfree(data);
+ return FEC_NO_LOSS;
+ }
+
+ /* we always recover one whole MSS block (otherwise slicing
+ * would introduce a lot of additional complexity here) and handle
+ * cut out already received sequences later
+ */
+ rec_seq = next_seq;
+ rec_len = min(block_len, end_seq - rec_seq);
+ /* offset = size of the last (partial or zero-length) in-sequence read,
+ * i.e. the position in data where the missing block starts
+ */
+ offset = data_len;
+ /* the missing block is the last one of the encoded range */
+ if ((rec_seq + rec_len) == end_seq)
+ goto recover;
+
+ /* continue behind the missing block, now reading from the
+ * out-of-order queue with a fresh pskb
+ */
+ next_seq += block_len * (block_skip + 1);
+ pskb = NULL;
+
+ /* read a possibly partial (smaller than MSS) block to fill up the
+ * previously unfilled block and achieve alignment again
+ */
+ data_len = tcp_fec_get_next_block(sk, &pskb, &tp->out_of_order_queue,
+ next_seq, block_len - offset, block);
+
+ next_seq += data_len;
+
+ /* check if we could not read as much data as requested */
+ if ((next_seq != end_seq) && (data_len < (block_len - offset)))
+ goto clean;
+
+ /* XOR with existing payload */
+ for (i = 0; i < data_len; i++)
+ data[i+offset] ^= block[i];
+
+ /* skip unencoded blocks if there is more data encoded
+ * (same u32 comparison as in the first loop)
+ */
+ if (end_seq - next_seq > 0)
+ next_seq += block_len * block_skip;
+
+ /* read all necessary blocks to finish decoding */
+ while ((data_len = tcp_fec_get_next_block(sk, &pskb,
+ &tp->out_of_order_queue, next_seq,
+ min(block_len, end_seq - next_seq),
+ block))) {
+ next_seq += data_len;
+
+ /* XOR with existing payload */
+ for (i = 0; i < data_len; i++)
+ data[i] ^= block[i];
+
+ /* we could not read a whole MSS block, which means we reached
+ * the end of the queue or end of range which the FEC packet
+ * covers
+ */
+ if (data_len < block_len)
+ break;
+
+ /* skip unencoded blocks if there is more data encoded */
+ if (end_seq - next_seq > 0)
+ next_seq += block_len * block_skip;
+ }
+
+ /* check if additional losses were observed (cannot recover) */
+ if (next_seq != end_seq)
+ goto clean;
+
+recover:
+ /* create and process recovered packets: rotate the XOR result so
+ * that the recovered bytes start at block[0]
+ */
+ for (i = 0; i < rec_len; i++)
+ block[i] = data[(offset + i) % block_len];
+
+ if (block_skip && ((block_len - offset) < rec_len)) {
+ /* recover non-consecutive sequence ranges (only when
+ * slicing is used)
+ */
+ u32 second_seq;
+ unsigned int second_seq_len, first_seq_len;
+
+ first_seq_len = block_len - offset;
+ second_seq = rec_seq + first_seq_len + block_len * block_skip;
+ second_seq_len = rec_len - first_seq_len;
+
+ ret = tcp_fec_recover(sk, skb, block, rec_seq, first_seq_len);
+ if (ret >= 0) {
+ int second_ret = tcp_fec_recover(sk, skb,
+ block + first_seq_len,
+ second_seq, second_seq_len);
+ /* an error in the second part wins; otherwise the
+ * second status is only used if the first was 0
+ */
+ if (second_ret < 0 || !ret)
+ ret = second_ret;
+ }
+ } else {
+ ret = tcp_fec_recover(sk, skb, block, rec_seq, rec_len);
+ }
+
+ kfree(data);
+ /* 0 from tcp_fec_recover() means a packet was recovered */
+ return ret ? ret : FEC_LOSS_RECOVERED;
+
+clean:
+ kfree(data);
+ return FEC_LOSS_UNRECOVERED;
+}
+
+/* Builds a packet from recovered payload and feeds it to the regular
+ * reception routine.
+ * @sk   - receiving socket
+ * @skb  - FEC packet the payload was recovered from
+ * @data - recovered payload
+ * @seq  - sequence number of the first recovered byte
+ * @len  - number of recovered bytes; shortened here if a SACK block
+ *         already covers the tail of the range
+ * Returns: 0, if a recovered packet was processed; FEC_NO_LOSS, if nothing
+ *          is left to recover; -EINVAL, if the packet could not be created
+ */
+static int tcp_fec_recover(struct sock *sk, const struct sk_buff *skb,
+			   unsigned char *data, u32 seq, int len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *rskb;
+	int i;
+
+	/* We will notify the remote node that recovery was successful */
+	tp->fec.flags |= TCP_FEC_RECOVERY_SUCCESSFUL;
+
+	/* If the tail of the recovered sequence is already covered by one
+	 * of the current SACK blocks, cut the recovery short in front of
+	 * that block (recovering more than necessary would cause DSACKs)
+	 */
+	if (tcp_is_sack(tp)) {
+		const u32 rec_end = seq + len;
+
+		for (i = 0; i < tp->rx_opt.num_sacks; i++) {
+			/* skip blocks which do not contain the range's end */
+			if (!before(tp->selective_acks[i].start_seq, rec_end) ||
+			    before(tp->selective_acks[i].end_seq, rec_end))
+				continue;
+
+			len = tp->selective_acks[i].start_seq - seq;
+			break;
+		}
+	}
+
+	/* We might have prematurely asked for a recovery in the case where
+	 * the whole recovery sequence is already covered by SACKs
+	 */
+	if (len <= 0)
+		return FEC_NO_LOSS;
+
+	/* Create decoded packet and forward to reception routine */
+	rskb = tcp_fec_make_decoded_pkt(sk, skb, data, seq, len);
+	if (rskb == NULL)
+		return -EINVAL;
+
+	tcp_rcv_established(sk, rskb, tcp_hdr(rskb), rskb->len);
+
+	return 0;
+}
+
+/* Sends an ACK for the FEC packet which carries the recovery information.
+ * Right now an explicit ACK is only needed if FEC recovery failed; for
+ * every other status the function does nothing (ACKs are generated
+ * implicitly).
+ * @sk              - receiving socket
+ * @skb             - the FEC packet (currently not used)
+ * @recovery_status - one of the FEC_* processing codes
+ */
+static void tcp_fec_send_ack(struct sock *sk, const struct sk_buff *skb,
+			     int recovery_status)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	u32 enc_end;
+
+	if (recovery_status != FEC_LOSS_UNRECOVERED)
+		return;
+
+	/* Report everything between rcv_nxt and the end of the encoded
+	 * range as lost
+	 */
+	enc_end = tp->rx_opt.fec.enc_seq + tp->rx_opt.fec.enc_len;
+	tp->fec.flags |= TCP_FEC_RECOVERY_FAILED;
+	tp->fec.lost_len = enc_end - tp->rcv_nxt;
+	tcp_send_ack(sk);
+}
+
+/* Reduces the congestion window (similar to completed fast recovery).
+ * Outside of any congestion state the slow start threshold is recomputed
+ * and the window is clamped to it right away. If the socket already is in
+ * some congestion state, only undo is disabled so that the window
+ * reduction is enforced once recovery is finished.
+ */
+static void tcp_fec_reduce_window(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	if (icsk->icsk_ca_state < TCP_CA_CWR) {
+		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+		if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
+			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
+			tp->snd_cwnd_stamp = tcp_time_stamp;
+		}
+
+		/* Any future window reduction requests are ignored until
+		 * snd_nxt is ACKed
+		 */
+		tp->high_seq = tp->snd_nxt;
+	}
+
+	/* Disable undo in both cases */
+	tp->undo_marker = 0;
+}
+
+/* The incoming ACK indicates a failed recovery.
+ * Mark all unacked SKBs which lie completely in the reported loss range
+ * as lost. The walk starts at lost_skb_hint if set, otherwise at the head
+ * of the write queue, and stops at the send head.
+ * TODO With interleaved coding, we have the additional constraint
+ * that the SKBs in the loss range also must have been encoded by the
+ * triggering FEC packet, and for that we need to keep some info
+ * about FEC packets on the sender side
+ */
+static void tcp_fec_mark_skbs_lost(struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	const u32 range_start = tp->rx_opt.fec.lost_seq;
+	const u32 range_end = range_start + tp->rx_opt.fec.lost_len;
+	struct sk_buff *skb;
+
+	skb = tp->lost_skb_hint;
+	if (!skb)
+		skb = tcp_write_queue_head(sk);
+
+	tcp_for_write_queue_from(skb, sk) {
+		struct tcp_skb_cb *cb;
+
+		if (skb == tcp_send_head(sk))
+			break;
+
+		cb = TCP_SKB_CB(skb);
+
+		/* Past loss range */
+		if (!before(cb->seq, range_end))
+			break;
+
+		/* SKB not (fully) within range */
+		if (before(cb->seq, range_start) ||
+		    after(cb->end_seq, range_end))
+			continue;
+
+		/* SKB already marked lost or SACKed */
+		if (cb->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))
+			continue;
+
+		/* Verify retransmit hint before marking
+		 * (see tcp_verify_retransmit_hint(),
+		 * copied since method defined static in tcp_input.c)
+		 */
+		if ((tp->retransmit_skb_hint == NULL) ||
+		    before(cb->seq, TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+			tp->retransmit_skb_hint = skb;
+
+		if (!tp->lost_out || after(cb->end_seq, tp->retransmit_high))
+			tp->retransmit_high = cb->end_seq;
+
+		/* Mark SKB as lost (see tcp_skb_mark_lost()) */
+		tp->lost_out += tcp_skb_pcount(skb);
+		cb->sacked |= TCPCB_LOST;
+	}
+
+	tcp_verify_left_out(tp);
+}
+
+/* Searches for the FEC option in the packet header and replaces
+ * the long option with a short one padded by NOPs.
+ * This is done to convert the option used by an encoded packet
+ * to the option used by a recovered packet.
+ *
+ * The rewrite touches the first 8 bytes of the option: the length byte
+ * becomes 5, the ENCODED flag is cleared in the flags byte and the three
+ * bytes that follow are overwritten with NOPs.
+ *
+ * Returns true if the FEC option was found and rewritten, false if the
+ * header carries no (well-formed) FEC option.
+ */
+static bool tcp_fec_update_decoded_option(struct sk_buff *skb)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+	unsigned char *ptr = (unsigned char *) (th + 1);
+	int length = (th->doff * 4) - sizeof(struct tcphdr);
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return false;
+		case TCPOPT_NOP:
+			length--;
+			continue;
+		default:
+			/* A size byte must still fit into the option area */
+			if (length < 2)
+				return false;
+
+			opsize = *ptr++;
+			if (opsize < 2 || opsize > length)
+				return false;
+
+			/* The magic is only present if the option holds at
+			 * least kind, length and two magic bytes
+			 */
+			if (opcode == TCPOPT_EXP && opsize >= 4 &&
+			    get_unaligned_be16(ptr) == TCPOPT_FEC_MAGIC) {
+				unsigned char *opt = ptr - 2;
+				u32 word;
+
+				/* The rewrite covers 8 bytes; never write
+				 * past the end of the option area
+				 */
+				if (length < 8)
+					return false;
+
+				/* Options are not guaranteed to be 4-byte
+				 * aligned, so use the unaligned accessors
+				 * for the stores as well as for the loads.
+				 *
+				 * 1. Convert long option into short option
+				 *    (length byte := 5)
+				 */
+				word = get_unaligned_be32(opt);
+				put_unaligned_be32((word & 0xFF00FFFF) |
+						   0x00050000, opt);
+
+				/* 2. Clear ENCODED flag (keep other flags)
+				 * 3. Replace the remaining bytes by NOPs
+				 */
+				word = get_unaligned_be32(opt + 4);
+				put_unaligned_be32((word & 0xEF000000) |
+						   0x00010101, opt + 4);
+
+				return true;
+			}
+
+			ptr += opsize - 2;
+			length -= opsize;
+		}
+	}
+
+	return false;
+}
+
+/* Allocates an SKB for data we want to forward to reception routines
+ * (recovered data) by making a copy of the FEC SKB and replacing the data
+ * part, all other segments (options, etc.) are preserved
+ * @skb - FEC packet that triggered the recovery (left untouched)
+ * @dec_data - recovered payload bytes
+ * @seq - sequence number of the first recovered byte
+ * @len - number of recovered bytes
+ *
+ * Returns the new SKB, or NULL if the copy could not be allocated or the
+ * copy carries no FEC option. The caller owns the returned SKB.
+ */
+static struct sk_buff *tcp_fec_make_decoded_pkt(struct sock *sk,
+				const struct sk_buff *skb,
+				unsigned char *dec_data,
+				u32 seq, unsigned int len)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *nskb;
+
+	nskb = skb_copy(skb, GFP_ATOMIC);
+	if (nskb == NULL)
+		return NULL;
+
+	/* Update FEC option for the new packet */
+	if (!tcp_fec_update_decoded_option(nskb)) {
+		/* TODO Do we need this catch? Technically we don't reach this
+		 * method if there is no FEC option in the header.
+		 * The copy is ours, so it has to be released here; returning
+		 * without freeing it would leak the SKB.
+		 */
+		kfree_skb(nskb);
+		return NULL;
+	}
+
+	/* check if we received some tail of the recovered sequence already
+	 * by looking at the current SACK blocks (we don't want to recover
+	 * more data than necessary to prevent DSACKS)
+	 */
+	if (tcp_is_sack(tp)) {
+		int i;
+
+		for (i = 0; i < tp->rx_opt.num_sacks; i++) {
+			if (before(tp->selective_acks[i].start_seq,
+				   seq + len) &&
+			    !before(tp->selective_acks[i].end_seq,
+				    seq + len)) {
+				/* NOTE(review): assumes the SACK block starts
+				 * at or after seq; confirm against callers
+				 */
+				len = tp->selective_acks[i].start_seq - seq;
+				break;
+			}
+		}
+	}
+
+	/* trim data section to fit recovered sequence if necessary */
+	if (len < (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq))
+		skb_trim(nskb, len + tcp_hdrlen(nskb));
+
+	/* fix the sequence numbers */
+	tcp_hdr(nskb)->seq = htonl(seq);
+	tcp_hdr(nskb)->ack_seq = htonl(tp->snd_una);
+	TCP_SKB_CB(nskb)->seq = seq;
+	TCP_SKB_CB(nskb)->end_seq = seq + len;
+
+	/* replace SKB payload with recovered data
+	 * NOTE(review): presumably len never exceeds the payload size of the
+	 * FEC packet; verify against the decoder
+	 */
+	memcpy(nskb->data + tcp_hdrlen(nskb), dec_data, len);
+
+	/* packets used for recovery had their checksums checked already */
+	nskb->ip_summed = CHECKSUM_UNNECESSARY;
+
+	return nskb;
+}
+
+/* Gets the next byte block from an SKB queue (any SKB which is touched
+ * in this procedure will be linearized to simplify payload access)
+ * @skb - Points to SKB from which previous block was extracted (useful
+ *	for successive calls to this function, which avoids moving through
+ *	the whole queue again); updated to the SKB the copy stopped at
+ * @queue - SKB queue to read from (SKB has to point to an element on this
+ *	queue)
+ * @seq - Sequence number of first byte in the block
+ * @block_len - Maximum number of bytes to copy
+ * @block - Destination buffer, at least @block_len bytes
+ *
+ * Returns the bytes written to the block memory. 0 is returned when no
+ * data is available for @seq, when an RST/SYN is encountered, or when an
+ * SKB cannot be linearized. An errno is deliberately never returned: the
+ * return type is unsigned and callers use the value as a byte count.
+ */
+static unsigned int tcp_fec_get_next_block(struct sock *sk,
+				struct sk_buff **skb,
+				struct sk_buff_head *queue, u32 seq,
+				unsigned int block_len, unsigned char *block)
+{
+	unsigned int cur_len = 0;
+
+	/* Start at the head of the queue if the caller has no position yet */
+	if (*skb == NULL) {
+		*skb = skb_peek(queue);
+		if (*skb == NULL)
+			return 0;
+	}
+
+	/* move to SKB which stores the next sequence to encode */
+	while (*skb) {
+		/* If we observe an RST/SYN, we stop here to avoid
+		 * handling corner cases
+		 */
+		if (TCP_SKB_CB(*skb)->tcp_flags &
+		    (TCPHDR_RST | TCPHDR_SYN))
+			return 0;
+		if (!before(seq, TCP_SKB_CB(*skb)->seq) &&
+		    before(seq, TCP_SKB_CB(*skb)->end_seq))
+			break;
+		if (*skb == skb_peek_tail(queue)) {
+			*skb = NULL;
+			break;
+		}
+
+		*skb = skb_queue_next(queue, *skb);
+	}
+
+	if (*skb == NULL)
+		return 0;
+
+	/* copy bytes from SKBs (connected sequences) */
+	while (*skb && (cur_len < block_len)) {
+		unsigned int offset, num_bytes;
+		u32 end_seq;
+
+		/* A negative errno must not leak out through the unsigned
+		 * byte count, so report "no data" instead
+		 */
+		if (skb_linearize(*skb))
+			return 0;
+
+		/* Deal with the end seq number being incremented by
+		 * one if the FIN flag is set (we don't want to encode this)
+		 */
+		end_seq = TCP_SKB_CB(*skb)->end_seq;
+		if (TCP_SKB_CB(*skb)->tcp_flags & TCPHDR_FIN)
+			end_seq--;
+
+		/* Wrap-safe range check, same helpers as the search above */
+		if (!before(seq, TCP_SKB_CB(*skb)->seq) &&
+		    before(seq, end_seq)) {
+			/* Copy data depending on:
+			 * - remaining space in the block
+			 * - remaining data in the SKB
+			 */
+			offset = seq - TCP_SKB_CB(*skb)->seq;
+			num_bytes = min(block_len - cur_len,
+					end_seq - seq);
+
+			memcpy(block + cur_len, (*skb)->data + offset,
+			       num_bytes);
+			cur_len += num_bytes;
+			seq += num_bytes;
+		}
+
+		if (*skb == skb_peek_tail(queue) || cur_len >= block_len)
+			break;
+
+		*skb = skb_queue_next(queue, *skb);
+	}
+
+	return cur_len;
+}
+
+/* Arms the timer for a delayed FEC transmission if there is
+ * no earlier timeout defined (i.e. retransmission timeout)
+ *
+ * Nothing is armed unless the connection is established and there is
+ * sent data that has not been encoded yet.
+ */
+void tcp_fec_arm_timer(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned long timeout;
+	u32 rtt_us;
+
+	/* Only arm a timer if connection is established */
+	if (sk->sk_state != TCP_ESTABLISHED)
+		return;
+
+	/* Forward next sequence to be encoded if unencoded data was acked */
+	if (after(tp->snd_una, tp->fec.next_seq))
+		tp->fec.next_seq = tp->snd_una;
+
+	/* Don't arm the timer if there is no unencoded data left */
+	if (!before(tp->fec.next_seq, tp->snd_nxt))
+		return;
+
+	/* TODO handle other timers which might be armed;
+	 * EARLY_RETRANS? LOSS_PROBE?
+	 */
+
+	/* Compute timeout (currently 0.25 * RTT).
+	 * srtt_us holds the smoothed RTT in microseconds, scaled by 8,
+	 * while the timer API works in jiffies, so convert explicitly.
+	 */
+	rtt_us = tp->srtt_us >> 3;
+	timeout = usecs_to_jiffies(rtt_us >> 2);
+
+	/* TODO subtract the time elapsed since the original packet was
+	 * transmitted from the timeout (and send immediately if the timeout
+	 * has already passed); the elapsed time is not tracked yet
+	 */
+
+	/* Do not replace a timeout occurring earlier
+	 * (jiffies wrap around, hence the time_*() comparison)
+	 */
+	if (time_after_eq(jiffies + timeout, icsk->icsk_timeout))
+		return;
+
+	inet_csk_reset_xmit_timer(sk, ICSK_TIME_FEC, timeout, TCP_RTO_MAX);
+}
+
+/* Handler for an expired FEC timer.
+ * Sends FEC packet(s) for the last unencoded burst, then puts the RTO
+ * timer back in place (it was switched out when the FEC timer was set)
+ * and finally schedules another FEC timer in case unencoded data is
+ * still pending.
+ */
+void tcp_fec_timer(struct sock *sk)
+{
+	tcp_fec_invoke_nodelay(sk);
+
+	/* Mark no timer as pending so that tcp_rearm_rto() does not bail
+	 * out on the (just expired) FEC timer
+	 */
+	inet_csk(sk)->icsk_pending = 0;
+	tcp_rearm_rto(sk);
+
+	tcp_fec_arm_timer(sk);
+}
+
+/* Entry point for triggering FEC after data was sent.
+ * With TCP_FEC_DELAYED_SEND defined the transmission is deferred: a timer
+ * is set unless an FEC timer is pending already, and 0 is returned.
+ * Otherwise the FEC mechanism is invoked immediately and its result is
+ * returned.
+ */
+int tcp_fec_invoke(struct sock *sk)
+{
+#ifdef TCP_FEC_DELAYED_SEND
+	const struct inet_connection_sock *icsk = inet_csk(sk);
+
+	/* Set the timer for sending an FEC packet if no FEC
+	 * timer is active yet
+	 */
+	if (!(icsk->icsk_pending && icsk->icsk_pending == ICSK_TIME_FEC))
+		tcp_fec_arm_timer(sk);
+
+	return 0;
+#else
+	return tcp_fec_invoke_nodelay(sk);
+#endif
+}
+
+/* Invokes the FEC mechanism set for the connection;
+ * Creates and sends out FEC packets
+ *
+ * The FEC packets only live for the duration of this call: they are
+ * collected on a local queue, handed to the transmit routine, and then
+ * released together with the FEC struct attached to each of them.
+ *
+ * Returns 0 on success or the error reported by packet creation or
+ * transmission.
+ */
+int tcp_fec_invoke_nodelay(struct sock *sk)
+{
+	/* The queue head never outlives this function, so it lives on the
+	 * stack; no atomic allocation (and no extra failure path) is needed
+	 */
+	struct sk_buff_head list;
+	struct sk_buff *skb;
+	int err;
+
+	skb_queue_head_init(&list);
+
+	err = tcp_fec_create(sk, &list);
+	if (!err)
+		err = tcp_fec_xmit_all(sk, &list);
+
+	/* Purge all SKBs (purge FEC structs first); on error the queue may
+	 * be partially filled and is cleaned up the same way
+	 */
+	skb = (struct sk_buff *) &list;
+	while (!skb_queue_is_last(&list, skb)) {
+		skb = skb_queue_next(&list, skb);
+		kfree(TCP_SKB_CB(skb)->fec);
+		TCP_SKB_CB(skb)->fec = NULL;
+	}
+
+	skb_queue_purge(&list);
+
+	/* TODO error handling; -ENOMEM, etc. - disable FEC? */
+
+	return err;
+}
+
+/* Builds the FEC packet(s) for the connection's negotiated FEC type and
+ * appends them to a queue
+ * @list: queue head receiving the created packets
+ *
+ * Returns 0 on success (also when the FEC type creates no packets) or the
+ * error of the encoder.
+ */
+static int tcp_fec_create(struct sock *sk, struct sk_buff_head *list)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	unsigned int first_seq, block_len, max_encoded;
+
+	/* Skip data that was ACKed before it got encoded */
+	if (after(tp->snd_una, tp->fec.next_seq))
+		tp->fec.next_seq = tp->snd_una;
+
+	first_seq = tp->fec.next_seq;
+	block_len = tcp_current_mss(sk);
+
+	/* NOTE(review): this value is passed as the per-packet block count
+	 * limit of the encoder; confirm the intended unit
+	 */
+	max_encoded = FEC_RCV_QUEUE_LIMIT - block_len;
+
+	/* All blocks are XORed into the same packet sequence */
+	if (tp->fec.type == TCP_FEC_TYPE_XOR_ALL)
+		return tcp_fec_create_xor(sk, list, first_seq, block_len, 0,
+					  max_encoded);
+
+	/* Two interleaved encodings, each skipping every other block;
+	 * the second one starts one block later than the first
+	 */
+	if (tp->fec.type == TCP_FEC_TYPE_XOR_SKIP_1) {
+		int err;
+
+		err = tcp_fec_create_xor(sk, list, first_seq, block_len, 1,
+					 max_encoded);
+		if (err)
+			return err;
+
+		return tcp_fec_create_xor(sk, list, first_seq + block_len,
+					  block_len, 1, max_encoded);
+	}
+
+	/* TCP_FEC_TYPE_NONE and unknown types: nothing to create */
+	return 0;
+}
+
+/* Creates FEC packet(s) using XOR encoding
+ * (allocates memory for the FEC structs)
+ * @list - Queue the created FEC packets are appended to
+ * @first_seq - Sequence number of first byte to be encoded
+ * @block_len - Block length (typically MSS)
+ * @block_skip - Number of unencoded blocks between two encoded blocks
+ * @max_encoded_per_pkt - maximum number of blocks encoded per packet
+ *	(0, if unlimited)
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails and -EINVAL if an
+ * FEC packet could not be created. Packets queued before a failure stay
+ * on @list.
+ */
+static int tcp_fec_create_xor(struct sock *sk, struct sk_buff_head *list,
+				unsigned int first_seq, unsigned int block_len,
+				unsigned int block_skip,
+				unsigned int max_encoded_per_pkt)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct sk_buff *skb = NULL;
+	struct sk_buff *fskb;
+	struct tcp_fec *fec;
+	unsigned int pending = 0;	/* blocks XORed but not yet put
+					 * into an FEC packet */
+	unsigned int next_seq = first_seq;	/* next byte to encode */
+	unsigned int i;
+	unsigned char *data, *block;
+	u16 data_len;
+	int err;
+
+	/* data holds two blocks: the first accumulates the XOR of all blocks
+	 * not yet put into a packet, the second receives the block that is
+	 * folded in next. It is freed before returning.
+	 */
+	data = kmalloc(2 * block_len, GFP_ATOMIC);
+	if (data == NULL)
+		return -ENOMEM;
+
+	/* fec stores the FEC parameters of the packet being built; it is
+	 * freed after the packet was forwarded to the transmission routine
+	 */
+	fec = kmalloc(sizeof(*fec), GFP_ATOMIC);
+	if (fec == NULL) {
+		err = -ENOMEM;
+		goto free_data;
+	}
+
+	memset(data, 0, 2 * block_len);
+	memset(fec, 0, sizeof(*fec));
+
+	block = data + block_len;
+	fec->enc_seq = next_seq;
+
+	/* encode data blocks
+	 * XXX atomicity check?
+	 */
+	for (;;) {
+		data_len = tcp_fec_get_next_block(sk, &skb,
+				&sk->sk_write_queue, next_seq,
+				min(block_len, tp->snd_nxt - next_seq),
+				block);
+		if (!data_len)
+			break;
+
+		/* Encoding limit reached: emit a packet with the current
+		 * payload and start over with a fresh FEC struct
+		 */
+		if (max_encoded_per_pkt > 0 &&
+		    pending >= max_encoded_per_pkt) {
+			fskb = tcp_fec_make_encoded_pkt(sk, fec, data,
+							block_len);
+			if (fskb == NULL) {
+				err = -EINVAL;
+				goto free_fec;
+			}
+
+			skb_queue_tail(list, fskb);
+			memset(data, 0, block_len);
+			pending = 0;
+
+			/* the previous struct now belongs to the queued
+			 * packet
+			 */
+			fec = kmalloc(sizeof(*fec), GFP_ATOMIC);
+			if (fec == NULL) {
+				err = -ENOMEM;
+				goto free_data;
+			}
+
+			memset(fec, 0, sizeof(*fec));
+			fec->enc_seq = next_seq;
+		}
+
+		next_seq += data_len;
+		fec->enc_len = next_seq - fec->enc_seq;
+
+		/* fold the block into the accumulated payload (XOR) */
+		for (i = 0; i < data_len; i++)
+			data[i] ^= block[i];
+
+		pending++;
+
+		/* skip over blocks which are not requested for encoding */
+		next_seq += block_len * block_skip;
+	}
+
+	/* create final packet if some data was selected for encoding */
+	if (pending > 0) {
+		fskb = tcp_fec_make_encoded_pkt(sk, fec, data, block_len);
+		if (fskb == NULL) {
+			err = -EINVAL;
+			goto free_fec;
+		}
+
+		skb_queue_tail(list, fskb);
+	} else {
+		kfree(fec);
+	}
+
+	tp->fec.next_seq = next_seq;
+	kfree(data);
+
+	return 0;
+
+free_fec:
+	kfree(fec);
+free_data:
+	kfree(data);
+	return err;
+}
+
+/* Builds the SKB for an FEC packet: allocates it, attaches the FEC
+ * parameters and copies the encoded payload
+ * @fec - FEC parameters of the packet; the pointer is stored in the SKB
+ *	control buffer and the ENCODED flag is set on it
+ * @enc_data - encoded payload
+ * @len - payload length, capped at the encoding length in @fec
+ *
+ * Returns the new SKB or NULL if the allocation fails.
+ */
+static struct sk_buff *tcp_fec_make_encoded_pkt(struct sock *sk,
+				struct tcp_fec *fec,
+				unsigned char *enc_data,
+				unsigned int len)
+{
+	struct sk_buff *skb;
+	struct tcp_skb_cb *tcb;
+	struct skb_shared_info *shinfo;
+
+	len = min(len, fec->enc_len);
+
+	/* See tcp_make_synack(); 15 probably for tail pointer etc.? */
+	skb = alloc_skb(MAX_TCP_HEADER + 15 + len, GFP_ATOMIC);
+	if (skb == NULL)
+		return NULL;
+
+	/* Leave room for the headers in front of the payload */
+	skb_reserve(skb, MAX_TCP_HEADER);
+
+	fec->flags |= TCP_FEC_ENCODED;
+
+	/* Control buffer: sequence number, FEC struct address and the ACK
+	 * flag (required for all data packets)
+	 */
+	tcb = TCP_SKB_CB(skb);
+	tcb->seq = fec->enc_seq;
+	tcb->fec = fec;
+	tcb->tcp_flags = TCPHDR_ACK;
+
+	/* GSO parameters: a single, non-GSO segment */
+	shinfo = skb_shinfo(skb);
+	shinfo->gso_segs = 1;
+	shinfo->gso_size = 0;
+	shinfo->gso_type = 0;
+
+	/* Append payload to SKB */
+	memcpy(skb_put(skb, len), enc_data, len);
+
+	skb->ip_summed = CHECKSUM_PARTIAL;
+
+	return skb;
+}
+
+/* Transmits every FEC packet on a list, in queue order.
+ * Stops at the first packet that fails to transmit and returns that
+ * error; returns 0 if all packets were handed over or the list is
+ * NULL/empty. The packets stay on the list.
+ */
+static int tcp_fec_xmit_all(struct sock *sk, struct sk_buff_head *list)
+{
+	struct sk_buff *skb;
+	int err = 0;
+
+	if (list == NULL || skb_queue_empty(list))
+		return 0;
+
+	for (skb = (struct sk_buff *) list; !skb_queue_is_last(list, skb); ) {
+		skb = skb_queue_next(list, skb);
+		err = tcp_fec_xmit(sk, skb);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+/* Transmits an FEC packet
+ * Hands the SKB to tcp_transmit_skb() with clone_it set to 1 and
+ * GFP_ATOMIC as allocation mask, and returns its result.
+ */
+static int tcp_fec_xmit(struct sock *sk, struct sk_buff *skb)
+{
+ /* TODO timers? no retransmissions, but want to deactivate FEC
+ * if we never get any FEC ACKs back
+ */
+ return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
+}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f83ddf9..b640461 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -70,6 +70,7 @@
#include <linux/kernel.h>
#include <net/dst.h>
#include <net/tcp.h>
+#include <net/tcp_fec.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
@@ -106,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */
#define FLAG_DATA_SACKED 0x20 /* New SACK. */
#define FLAG_ECE 0x40 /* ECE in this ACK */
+#define FLAG_FEC_CWR_REQUESTED 0x80 /* cwnd reduction requested */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */
#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
@@ -115,8 +117,9 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
-#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE)
+#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_FEC_CWR_REQUESTED)
#define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED)
+#define FLAG_CONGESTION (FLAG_ECE|FLAG_FEC_CWR_REQUESTED)
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
@@ -2546,7 +2549,11 @@ void tcp_enter_cwr(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tp->prior_ssthresh = 0;
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+
+ /*
+ * !!! TCP FEC patch !!!
+ */
+ if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR && after(tp->snd_una, tp->high_seq)) {
tp->undo_marker = 0;
tcp_init_cwnd_reduction(sk);
tcp_set_ca_state(sk, TCP_CA_CWR);
@@ -2968,6 +2975,12 @@ void tcp_rearm_rto(struct sock *sk)
if (tp->fastopen_rsk)
return;
+ /* Don't rearm the timer if an FEC timer is active.
+ * The FEC handler will rearm the timer once the event is handled.
+ */
+ if (icsk->icsk_pending == ICSK_TIME_FEC)
+ return;
+
if (!tp->packets_out) {
inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
} else {
@@ -3228,16 +3241,23 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
/* Decide wheather to run the increase function of congestion control. */
static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
+ const struct tcp_sock *tp = tcp_sk(sk);
if (tcp_in_cwnd_reduction(sk))
return false;
+ /*
+ * !!! TCP FEC patch !!!
+ */
+ if ((flag & FLAG_CONGESTION) && !(tp->snd_cwnd < tp->snd_ssthresh))
+ return false;
+
/* If reordering is high then always grow cwnd whenever data is
* delivered regardless of its ordering. Otherwise stay conservative
* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
* new SACK or ECE mark may first advance cwnd here and later reduce
* cwnd in tcp_fastretrans_alert() based on more states.
*/
- if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+ if (tp->reordering > sysctl_tcp_reordering)
return flag & FLAG_FORWARD_PROGRESS;
return flag & FLAG_DATA_ACKED;
@@ -3425,6 +3445,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
icsk->icsk_retransmits = 0;
}
+ /* Check if FEC expects and executes a window reduction */
+ if (tcp_fec_is_enabled(tp) && tcp_fec_check_ack(sk, ack))
+ flag |= FLAG_FEC_CWR_REQUESTED;
+
prior_fackets = tp->fackets_out;
/* ts_recent update must be made after we are sure that the packet
@@ -3656,6 +3680,20 @@ void tcp_parse_options(const struct sk_buff *skb,
break;
case TCPOPT_EXP:
+ /*
+ * !!! TCP FEC patch !!!
+ */
+ if (sysctl_tcp_fec &&
+ get_unaligned_be16(ptr) ==
+ TCPOPT_FEC_MAGIC) {
+ tcp_fec_decode_option(&(opt_rx->fec),
+ ntohl(th->seq),
+ ntohl(th->ack_seq), th->syn,
+ ptr + 2,
+ opsize - TCPOLEN_EXP_FEC_BASE);
+ break;
+ }
+
/* Fast Open option shares code 254 using a
* 16 bits magic number.
*/
@@ -4173,6 +4211,12 @@ static void tcp_ofo_queue(struct sock *sk)
tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
TCP_SKB_CB(skb)->end_seq);
+ /*
+ * !!! TCP FEC patch !!!
+ */
+ if (tcp_fec_is_enabled(tp))
+ tcp_fec_update_queue(sk, skb);
+
tail = skb_peek_tail(&sk->sk_receive_queue);
eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
@@ -4410,6 +4454,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
goto out_of_window;
/* Ok. In sequence. In window. */
+ if (tcp_fec_is_enabled(tp))
+ tcp_fec_update_queue(sk, skb);
+
if (tp->ucopy.task == current &&
tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
sock_owned_by_user(sk) && !tp->urg_data) {
@@ -4715,6 +4762,12 @@ static int tcp_prune_queue(struct sock *sk)
tp->copied_seq, tp->rcv_nxt);
sk_mem_reclaim(sk);
+ /* Disable FEC if it was enabled to prevent keeping data
+ * in the receive queue longer than necessary
+ */
+ if (tcp_fec_is_enabled(tp))
+ tcp_fec_disable(sk);
+
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
return 0;
@@ -4998,6 +5051,21 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
/* Reset is accepted even if it did not pass PAWS. */
}
+ /* Special processing if FEC is enabled */
+ if (tcp_fec_is_enabled(tp)) {
+ if (tcp_fec_is_encoded(tp)) {
+ tcp_fec_process(sk, skb);
+ goto discard;
+ } else if (!tp->rx_opt.fec.saw_fec && th->ack &&
+ sk->sk_state == TCP_LAST_ACK) {
+ /* TODO Sometimes the FEC option is not appended to the
+ * FIN-ACK packet; socket options cleared?
+ */
+ tcp_ack(sk, skb, FLAG_SLOWPATH);
+ goto discard;
+ }
+ }
+
/* Step 1: check sequence number */
if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
/* RFC793, page 37: "In all states except SYN-SENT, all reset
@@ -5099,6 +5167,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
*/
tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.fec.saw_fec = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
@@ -5461,6 +5530,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (tcp_is_sack(tp) && sysctl_tcp_fack)
tcp_enable_fack(tp);
+ /*
+ * FEC negotiation
+ * Disable FEC if both ends do not agree on the FEC type used
+ */
+ if (tp->fec.type != tp->rx_opt.fec.type) {
+ tp->fec.type = 0;
+ tp->rx_opt.fec.type = 0;
+ }
+
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
tcp_initialize_rcv_mss(sk);
@@ -5735,6 +5813,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
tcp_initialize_rcv_mss(sk);
tcp_fast_path_on(tp);
+
+ /* SYN requested FEC usage */
+ if (tp->rx_opt.fec.type > 0)
+ tp->fec.type = tp->rx_opt.fec.type;
+
break;
case TCP_FIN_WAIT1: {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index d886b60..5efbc2e 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -73,6 +73,9 @@
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
+
+#include <net/tcp_fec.h>
+
#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>
@@ -212,6 +215,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
+ memset(&(tp->rx_opt.fec), 0, sizeof(struct tcp_fec));
+
/* Socket identity is still unknown (sport may be zero).
* However we set state to SYN-SENT and not releasing socket
* lock select source port, enter ourselves into the hash tables and
@@ -2270,7 +2275,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
- icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE ||
+ icsk->icsk_pending == ICSK_TIME_FEC) {
timer_active = 1;
timer_expires = icsk->icsk_timeout;
} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 927586e..f59faf9 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -552,6 +552,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
newtp->fastopen_rsk = NULL;
newtp->syn_data_acked = 0;
+ newtp->high_seq = newtp->snd_nxt;
+
+ /* TCP FEC option */
+ newtp->rx_opt.fec.type = sysctl_tcp_fec ? req->fec_type : 0;
+ newtp->fec.type = newtp->fec.flags = 0;
+ newtp->fec.next_seq = newtp->snd_nxt;
+ newtp->fec.bytes_rcv_queue = 0;
+ skb_queue_head_init(&newtp->fec.rcv_queue);
+
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
}
return newsk;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ddd2a6f..7791899 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -37,6 +37,7 @@
#define pr_fmt(fmt) "TCP: " fmt
#include <net/tcp.h>
+#include <net/tcp_fec.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
@@ -65,6 +66,12 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+/*
+ * !!! TCP FEC patch !!!
+ */
+int sysctl_tcp_fec __read_mostly;
+
+
unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
@@ -422,6 +429,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_MD5 (1 << 2)
#define OPTION_WSCALE (1 << 3)
#define OPTION_FAST_OPEN_COOKIE (1 << 8)
+#define OPTION_FEC (1 << 9)
struct tcp_out_options {
u16 options; /* bit field of OPTION_* */
@@ -432,6 +440,7 @@ struct tcp_out_options {
__u8 *hash_location; /* temporary pointer, overloaded */
__u32 tsval, tsecr; /* need to include OPTION_TS */
struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */
+ struct tcp_fec fec; /* FEC parameters */
};
/* Write previously computed TCP options to the packet.
@@ -540,6 +549,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
}
ptr += (len + 3) >> 2;
}
+
+ if (unlikely(OPTION_FEC & options))
+ tcp_fec_encode_option(tp, &(opts->fec), &ptr);
}
/* Compute TCP options for SYN packets. This is not the final
@@ -607,6 +619,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
+ /* Prepare for FEC negotiation if requested */
+ if (unlikely(tcp_fec_is_enabled(tp)) &&
+ remaining >= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED) {
+ opts->options |= OPTION_FEC;
+ opts->fec.type = tp->fec.type;
+ remaining -= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED;
+ }
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -671,6 +691,16 @@ static unsigned int tcp_synack_options(struct sock *sk,
}
}
+ /* Handle request for FEC support from other side
+ * (respond with same FEC option if FEC is locally supported)
+ */
+ if (sysctl_tcp_fec && unlikely(req->fec_type) &&
+ remaining >= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED) {
+ opts->options |= OPTION_FEC;
+ opts->fec.type = req->fec_type;
+ remaining -= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED;
+ }
+
return MAX_TCP_OPTION_SPACE - remaining;
}
@@ -681,6 +711,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5)
{
+ struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
struct tcp_sock *tp = tcp_sk(sk);
unsigned int size = 0;
unsigned int eff_sacks;
@@ -715,6 +746,19 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
}
+ /* Prepare option if connection has FEC enabled */
+ if (tcp_fec_is_enabled(tp)) {
+ opts->options |= OPTION_FEC;
+ if (tcb && tcb->fec)
+ opts->fec = *(tcb->fec);
+
+ /* regardless of packet type we need 4 more bytes
+ * including alignment
+ */
+ size += 4;
+ size += TCPOLEN_EXP_FEC_BASE;
+ }
+
return size;
}
@@ -895,7 +939,7 @@ void tcp_wfree(struct sk_buff *skb)
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
gfp_t gfp_mask)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2055,6 +2099,9 @@ repair:
break;
}
+ if (tcp_fec_is_enabled(tp))
+ tcp_fec_invoke(sk);
+
if (likely(sent_pkts)) {
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += sent_pkts;
@@ -3153,6 +3200,12 @@ int tcp_connect(struct sock *sk)
*/
tp->snd_nxt = tp->write_seq;
tp->pushed_seq = tp->write_seq;
+
+ /* Initialize FEC members */
+ tp->fec.next_seq = tp->snd_nxt;
+ tp->fec.bytes_rcv_queue = 0;
+ skb_queue_head_init(&tp->fec.rcv_queue);
+
TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
/* Timer for repeating the SYN until an answer. */
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index dceaacc..b78ea8f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -21,6 +21,7 @@
#include <linux/module.h>
#include <linux/gfp.h>
#include <net/tcp.h>
+#include <net/tcp_fec.h>
int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
@@ -478,7 +479,15 @@ out_reset_timer:
if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
__sk_dst_reset(sk);
-out:;
+out:
+ /* FEC will switch out the RTO timer if a delayed FEC transmission
+ * should happen earlier than this. RTO timer will be switched in
+ * once the FEC timer fired.
+ * FEC transmissions during a loss episode require that the sysctl
+ * value is >= 2.
+ */
+ if (tcp_fec_is_enabled(tp) && sysctl_tcp_fec >= 2)
+ tcp_fec_arm_timer(sk);
}
void tcp_write_timer_handler(struct sock *sk)
@@ -503,6 +512,9 @@ void tcp_write_timer_handler(struct sock *sk)
case ICSK_TIME_LOSS_PROBE:
tcp_send_loss_probe(sk);
break;
+ case ICSK_TIME_FEC:
+ tcp_fec_timer(sk);
+ break;
case ICSK_TIME_RETRANS:
icsk->icsk_pending = 0;
tcp_retransmit_timer(sk);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index c5078c5..d5205c6 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -288,6 +288,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
+ memset(&(tp->rx_opt.fec), 0, sizeof(struct tcp_fec));
+
inet->inet_dport = usin->sin6_port;
tcp_set_state(sk, TCP_SYN_SENT);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment