userid · September 17, 2016 15:03
diff --git a/TCP FEC patch modified to compile.patch b/TCP FEC patch modified to compile.patch
 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
 index 89bcfe8..9298207 100644
 --- a/include/linux/skbuff.h
 +++ b/include/linux/skbuff.h
 @@ -533,8 +533,10 @@ struct sk_buff {
 	 * layer. Please put your private variables there. If you
 	 * want to keep them across layers you have to do a skb_clone()
 	 * first. This is owned by whoever has the skb queued ATM.
 +	 *
 +	 * Increased the CB to hold pointer to an FEC structure.
 	 */
 -	char			cb[48] __aligned(8);
 +	char			cb[56] __aligned(8);
 
 	unsigned long		_skb_refdst;
 #ifdef CONFIG_XFRM
 diff --git a/include/linux/tcp.h b/include/linux/tcp.h
 index 4345d49..ccc0e91 100644
 --- a/include/linux/tcp.h
 +++ b/include/linux/tcp.h
 @@ -79,6 +79,24 @@ struct tcp_sack_block {
 #define TCP_FACK_ENABLED  (1 << 1)   /*1 = FACK is enabled locally*/
 #define TCP_DSACK_SEEN    (1 << 2)   /*1 = DSACK was received from peer*/
 
 +/* Flags transmitted in the first FEC option byte after magic bytes
 + * (except if option is used for negotiation) */
 +#define TCP_FEC_RECOVERY_CWR		0x80	/* Recovery triggered CWR */
 +#define TCP_FEC_RECOVERY_SUCCESSFUL	0x40	/* Local recovery done	  */
 +#define TCP_FEC_RECOVERY_FAILED		0x20	/* Local recovery failed  */
 +#define TCP_FEC_ENCODED			0x10	/* Packet is FEC-encoded  */
 +
 +struct tcp_fec {
 +	u8	type;		/* Requested FEC type (negotiation only,
 +				 * see net/tcp_fec.h for type defs)	 */
 +	u32	enc_seq;	/* Sequence number of first encoded byte */
 +	u32	enc_len;	/* Encoding length (24 bits on the wire) */
 +	u32	lost_seq;	/* Sequence number of first lost byte	 */
 +	u32	lost_len;	/* Loss length (24 bits on the wire)	 */
 +	u8	flags;		/* TCP_FEC_* flags defined above	 */
 +	bool	saw_fec;	/* FEC option was retrieved from packet	 */
 +};
 +
 struct tcp_options_received {
 /*	PAWS/RTTM data	*/
 	long	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
 @@ -95,12 +113,14 @@ struct tcp_options_received {
 	u8	num_sacks;	/* Number of SACK blocks		*/
 	u16	user_mss;	/* mss requested by user in ioctl	*/
 	u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
 +	struct tcp_fec fec;	/* FEC-related parameters		*/
 };
 
 static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
 {
 	rx_opt->tstamp_ok = rx_opt->sack_ok = 0;
 	rx_opt->wscale_ok = rx_opt->snd_wscale = 0;
 +	memset(&(rx_opt->fec), 0, sizeof(struct tcp_fec));
 }
 
 /* This is the max number of SACKS that we'll generate and process. It's safe
 @@ -327,6 +347,24 @@ struct tcp_sock {
 	 * socket. Used to retransmit SYNACKs etc.
 	 */
 	struct request_sock *fastopen_rsk;
 +
 +/* TCP FEC parameters
 + *	type - negotiated FEC type to be used
 + *	next_seq - next sequence which was not FEC-encoded before
 + *	lost_len - bytes after rcv_nxt considered lost
 + *	flags - see TCP_FEC_* flag definitions above
 + *	bytes_rcv_queue - number of bytes stored in queued SKBs
 + *	rcv_queue - copies from the socket's receive queue kept for
 + *		FEC recovery
 + */
 +	struct {
 +		u8 type;
 +		u32 next_seq;
 +		u32 lost_len;
 +		u8 flags;
 +		u32 bytes_rcv_queue;
 +		struct sk_buff_head rcv_queue;
 +	} fec;
 };
 
 enum tsq_flags {
 diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
 index 06d0d0f..063aa59 100644
 --- a/include/net/inet_connection_sock.h
 +++ b/include/net/inet_connection_sock.h
 @@ -138,6 +138,7 @@ struct inet_connection_sock {
 #define ICSK_TIME_PROBE0	3	/* Zero window probe timer */
 #define ICSK_TIME_EARLY_RETRANS 4	/* Early retransmit timer */
 #define ICSK_TIME_LOSS_PROBE	5	/* Tail loss probe timer */
 +#define ICSK_TIME_FEC		6	/* FEC delayed send timer */
 
 static inline struct inet_connection_sock *inet_csk(const struct sock *sk)
 {
 @@ -228,7 +229,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what,
 	}
 
 	if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 ||
 -	    what == ICSK_TIME_EARLY_RETRANS || what ==  ICSK_TIME_LOSS_PROBE) {
 +	    what == ICSK_TIME_EARLY_RETRANS || what ==  ICSK_TIME_LOSS_PROBE ||
 +	    what == ICSK_TIME_FEC) {
 		icsk->icsk_pending = what;
 		icsk->icsk_timeout = jiffies + when;
 		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
 diff --git a/include/net/request_sock.h b/include/net/request_sock.h
 index 610fa9e..1c1b4ba 100644
 --- a/include/net/request_sock.h
 +++ b/include/net/request_sock.h
 @@ -63,6 +63,8 @@ struct request_sock {
 	struct sock			*sk;
 	u32				secid;
 	u32				peer_secid;
 +	u8				fec_type; /* Encoding type (see
 +						   * net/tcp_fec.h) */
 };
 
 static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops)
 diff --git a/include/net/tcp.h b/include/net/tcp.h
 index d59f206..f894889 100644
 --- a/include/net/tcp.h
 +++ b/include/net/tcp.h
 @@ -182,6 +182,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
  * experimental options. See draft-ietf-tcpm-experimental-options-00.txt
  */
 #define TCPOPT_FASTOPEN_MAGIC	0xF989
 +#define TCPOPT_FEC_MAGIC	0xDC60
 
 /*
  *     TCP option lengths
 @@ -195,6 +196,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_FASTOPEN_BASE  2
 #define TCPOLEN_EXP_FASTOPEN_BASE  4
 
 +/*
 + *	!!! TCP FEC patch !!!
 + */
 +#define TCPOLEN_EXP_FEC_BASE   4
 +
 /* But this is what stacks really send out. */
 #define TCPOLEN_TSTAMP_ALIGNED		12
 #define TCPOLEN_WSCALE_ALIGNED		4
 @@ -204,6 +210,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_SACK_PERBLOCK		8
 #define TCPOLEN_MD5SIG_ALIGNED		20
 #define TCPOLEN_MSS_ALIGNED		4
 +#define TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED	8
 
 /* Flags in tp->nonagle */
 #define TCP_NAGLE_OFF		1	/* Nagle's algo is disabled */
 @@ -230,6 +237,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define	TFO_SERVER_WO_SOCKOPT1	0x400
 #define	TFO_SERVER_WO_SOCKOPT2	0x800
 
 +/*
 + *      !!! TCP FEC patch !!!
 + */
 +
 +/* Maximum number of in-order bytes kept in the receiver's buffer for FEC
 + * recoveries. The sender will never send more than this in a single FEC
 + * packet. */
 +#define FEC_RCV_QUEUE_LIMIT    16000
 +
 extern struct inet_timewait_death_row tcp_death_row;
 
 /* sysctl variables for tcp */
 @@ -274,6 +290,12 @@ extern int sysctl_tcp_thin_dupack;
 extern int sysctl_tcp_early_retrans;
 extern int sysctl_tcp_limit_output_bytes;
 extern int sysctl_tcp_challenge_ack_limit;
 +
 +/*
 + *      !!! TCP FEC patch !!!
 + */
 +extern int sysctl_tcp_fec;
 +
 extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
 @@ -725,6 +747,7 @@ struct tcp_skb_cb {
 	__u8		ip_dsfield;	/* IPv4 tos or IPv6 dsfield	*/
 	/* 1 byte hole */
 	__u32		ack_seq;	/* Sequence number ACK'd	*/
 +	struct tcp_fec	*fec;		/* FEC parameters		*/
 };
 
 #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
 @@ -1131,6 +1154,11 @@ static inline void tcp_openreq_init(struct request_sock *req,
 	ireq->ecn_ok = 0;
 	ireq->ir_rmt_port = tcp_hdr(skb)->source;
 	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
 +	
 +	/*
 +	 *      !!! TCP FEC patch !!!
 +	 */
 +	req->fec_type = rx_opt->fec.type;
 }
 
 extern void tcp_openreq_init_rwin(struct request_sock *req,
 diff --git a/include/net/tcp_fec.h b/include/net/tcp_fec.h
 new file mode 100644
 index 0000000..38f2c40
 --- /dev/null
 +++ b/include/net/tcp_fec.h
 @@ -0,0 +1,106 @@
 +#ifndef _TCP_FEC_H
 +#define _TCP_FEC_H
 +
 +#include <net/tcp.h>
 +#include <asm/unaligned.h>
 +
 +/* FEC-encoding types (8 bits, internal) */
 +#define TCP_FEC_TYPE_NONE		0 /* FEC disabled */
 +#define TCP_FEC_TYPE_XOR_ALL		1 /* XOR every MSS length segment */
 +#define TCP_FEC_TYPE_XOR_SKIP_1		2 /* XOR every other MSS length
 +					   * segment */
 +
 +#define TCP_FEC_NUM_TYPES		3
 +
 +/* Delay transmission of FEC packets (delay defined in tcp_fec_arm_timer()) */
 +#define TCP_FEC_DELAYED_SEND		1
 +
 +/*
 + * Tells whether an FEC type other than TCP_FEC_TYPE_NONE is set on the socket
 + */
 +static inline bool tcp_fec_is_enabled(const struct tcp_sock *tp)
 +{
 +	return unlikely(tp->fec.type != TCP_FEC_TYPE_NONE);
 +}
 +
 +/*
 + * True if an FEC option was seen on the current packet and flags it encoded
 + */
 +static inline bool tcp_fec_is_encoded(const struct tcp_sock *tp)
 +{
 +	const struct tcp_fec *opt = &tp->rx_opt.fec;
 +	return unlikely(opt->saw_fec && (opt->flags & TCP_FEC_ENCODED));
 +}
 +
 +/*
 + * Decodes FEC parameters and stores them in the FEC struct
 + * @seq - sequence number of the packet
 + * @ack_seq - ACKed sequence number
 + * @is_syn - true, if option was attached to a packet with a SYN flag
 + * @ptr - points to the first byte of the FEC option after kind, length,
 + *	  and possible magic bytes
 + * @len - option length (without kind, length, magic bytes)
 + */
 +int tcp_fec_decode_option(struct tcp_fec *fec, u32 seq, u32 ack_seq,
 +			bool is_syn, const unsigned char *ptr,
 +			unsigned int len);
 +
 +/*
 + * Encodes FEC parameters to wire format
 + * Pointer points to the first byte of the FEC option after kind, length,
 + * and possible magic bytes (pointer will be moved to first unoccupied byte)
 + */
 +int tcp_fec_encode_option(struct tcp_sock *tp, struct tcp_fec *fec,
 +			__be32 **ptr);
 +
 +/*
 + * Processes the current packet in the buffer (treated as FEC packet)
 + */
 +int tcp_fec_process(struct sock *sk, struct sk_buff *skb);
 +
 +/*
 + * Checks the received options for loss indicators and acts upon them.
 + * In particular, the function handles window reduction requests and processes
 + * tail loss indicators.
 + * Returns: 1, if the ACK carries a recovery/loss indicator - 0, otherwise
 + */
 +int tcp_fec_check_ack(struct sock *sk, u32 ack_seq);
 +
 +/*
 + * Since data in the socket's receive queue can get consumed by other parties
 + * we need to keep extra references to these SKBs until they are no longer
 + * required for possible future recoveries.
 + * @skb - buffer which is moved to the receive queue
 + */
 +int tcp_fec_update_queue(struct sock *sk, struct sk_buff *skb);
 +
 +/*
 + * Disables FEC for this connection (includes clearing references
 + * to buffers in receive queue)
 + */
 +void tcp_fec_disable(struct sock *sk);
 +
 +/* Arms the timer for a delayed FEC transmission if there is
 + * no earlier timeout defined (i.e. retransmission timeout)
 + */
 +void tcp_fec_arm_timer(struct sock *sk);
 +
 +/* The FEC timer fired. Force an FEC transmission for the
 + * last unencoded burst. Rearm the RTO timer (which was switched
 + * out when setting the FEC timer). Set a new FEC timer if there
 + * is pending unencoded data.
 + */
 +void tcp_fec_timer(struct sock *sk);
 +
 +/* If FEC packet transmissions are delayed, set a timer
 + * (if not already set), otherwise invoke the FEC mechanism
 + * immediately
 + */
 +int tcp_fec_invoke(struct sock *sk);
 +
 +/* Invoke the FEC mechanism set for the connection;
 + * Creates and sends out FEC packets
 + */
 +int tcp_fec_invoke_nodelay(struct sock *sk);
 +
 +#endif
 diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
 index 3b97183..d679733 100644
 --- a/include/uapi/linux/tcp.h
 +++ b/include/uapi/linux/tcp.h
 @@ -113,6 +113,11 @@ enum {
 #define TCP_TIMESTAMP		24
 #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
 
 +/*
 + *      !!! TCP FEC patch !!!
 + */
 +#define TCP_FEC                        79      /* Forward error correction */
 +
 struct tcp_repair_opt {
 	__u32	opt_code;
 	__u32	opt_val;
 diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
 index 518c04e..6aa32ca 100644
 --- a/net/ipv4/Makefile
 +++ b/net/ipv4/Makefile
 @@ -6,7 +6,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 	     ip_input.o ip_fragment.o ip_forward.o ip_options.o \
 	     ip_output.o ip_sockglue.o inet_hashtables.o \
 	     inet_timewait_sock.o inet_connection_sock.o \
 -	     tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 +	     tcp.o tcp_fec.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
 	     tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
 	     tcp_offload.o datagram.o raw.o udp.o udplite.o \
 	     udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
 diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
 index 0d438fb..9cfa3d3 100644
 --- a/net/ipv4/inet_diag.c
 +++ b/net/ipv4/inet_diag.c
 @@ -183,7 +183,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 -	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 +	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE ||
 +	    icsk->icsk_pending == ICSK_TIME_FEC) {
 		r->idiag_timer = 1;
 		r->idiag_retrans = icsk->icsk_retransmits;
 		r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout);
 diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
 index eeb17b3..9c58530 100644
 --- a/net/ipv4/sysctl_net_ipv4.c
 +++ b/net/ipv4/sysctl_net_ipv4.c
 @@ -28,6 +28,7 @@
 
 static int zero;
 static int one = 1;
 +static int two = 2;
 static int four = 4;
 static int gso_max_segs = GSO_MAX_SEGS;
 static int tcp_retr1_max = 255;
 @@ -810,6 +811,15 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one
 	},
 +	{
 +		.procname	= "tcp_fec",
 +		.data		= &sysctl_tcp_fec,
 +		.maxlen		= sizeof(int),
 +		.mode		= 0644,
 +		.proc_handler	= proc_dointvec_minmax, /* honours extra1/2 */
 +		.extra1		= &zero,
 +		.extra2		= &two,
 +	},
 	{ }
 };
 
 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
 index b8ff562..1a2dab5 100644
 --- a/net/ipv4/tcp.c
 +++ b/net/ipv4/tcp.c
 @@ -276,6 +276,8 @@
 #include <net/ip.h>
 #include <net/sock.h>
 
 +#include <net/tcp_fec.h>
 +
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
 @@ -2565,6 +2567,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			tp->tsoffset = val - tcp_time_stamp;
 		break;
 +	case TCP_FEC:
 +		if (sysctl_tcp_fec && val >= 0 && val < TCP_FEC_NUM_TYPES)
 +			tp->fec.type = val;
 +		else
 +			err = -EINVAL;
 +		break;
 	case TCP_NOTSENT_LOWAT:
 		tp->notsent_lowat = val;
 		sk->sk_write_space(sk);
 @@ -2792,6 +2800,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
 +	case TCP_FEC:
 +		val = tp->fec.type;
 +		break;
 	case TCP_NOTSENT_LOWAT:
 		val = tp->notsent_lowat;
 		break;
 diff --git a/net/ipv4/tcp_fec.c b/net/ipv4/tcp_fec.c
 new file mode 100644
 index 0000000..53711cb
 --- /dev/null
 +++ b/net/ipv4/tcp_fec.c
 @@ -0,0 +1,1253 @@
 +#include <net/tcp_fec.h>
 +
 +/* Codes for incoming FEC packet processing */
 +#define FEC_NO_LOSS		1
 +#define FEC_LOSS_UNRECOVERED	2
 +#define FEC_LOSS_RECOVERED	3
 +
 +/* Receiver routines */
 +static int tcp_fec_process_xor(struct sock *sk, const struct sk_buff *skb,
 +			unsigned int block_skip);
 +static int tcp_fec_recover(struct sock *sk, const struct sk_buff *skb,
 +			unsigned char *data, u32 seq, int len);
 +static void tcp_fec_send_ack(struct sock *sk, const struct sk_buff *skb,
 +			int recovery_status);
 +static void tcp_fec_reduce_window(struct sock *sk);
 +static void tcp_fec_mark_skbs_lost(struct sock *sk);
 +static bool tcp_fec_update_decoded_option(struct sk_buff *skb);
 +static struct sk_buff *tcp_fec_make_decoded_pkt(struct sock *sk,
 +			const struct sk_buff *skb, unsigned char *dec_data,
 +			u32 seq, unsigned int len);
 +
 +/* Sender routines */
 +static int tcp_fec_create(struct sock *sk, struct sk_buff_head *list);
 +static int tcp_fec_create_xor(struct sock *sk, struct sk_buff_head *list,
 +			unsigned int first_seq, unsigned int block_len,
 +			unsigned int block_skip,
 +			unsigned int max_encoded_per_pkt);
 +static struct sk_buff *tcp_fec_make_encoded_pkt(struct sock *sk,
 +			struct tcp_fec *fec, unsigned char *enc_data,
 +			u32 seq);
 +static int tcp_fec_xmit_all(struct sock *sk, struct sk_buff_head *list);
 +static int tcp_fec_xmit(struct sock *sk, struct sk_buff *skb);
 +
 +/* Buffer access routine */
 +static unsigned int tcp_fec_get_next_block(struct sock *sk,
 +			struct sk_buff **skb, struct sk_buff_head *queue,
 +			u32 seq, unsigned int block_len,
 +			unsigned char *block);
 +
 +/* Have to define this signature here since the actual function was static
 + * and tcp_output.c has no corresponding header file
 + */
 +extern int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 +			gfp_t gfp_mask);
 +
 +/* Decodes FEC parameters and stores them in the FEC struct
 + * @seq - sequence number of the packet
 + * @ack_seq - ACKed sequence number
 + * @is_syn - true, if option was attached to a packet with a SYN flag
 + * @ptr - points to the first byte of the FEC option after kind, length,
 + *	  and possible magic bytes
 + * @len - option length (without kind, length, magic bytes)
 + * Returns 0 on success; -EINVAL (flags and saw_fec cleared) if malformed
 + */
 +int tcp_fec_decode_option(struct tcp_fec *fec, u32 seq, u32 ack_seq,
 +				bool is_syn, const unsigned char *ptr,
 +				unsigned int len)
 +{
 +	u32 val;
 +
 +	/* reset / initialize option values which should be evaluated
 +	 * with EVERY incoming packet
 +	 */
 +	fec->flags = 0;
 +	fec->saw_fec = true;
 +
 +	if (len == 1) {
 +		/* Short option: FEC type on a SYN, flags otherwise */
 +		if (is_syn)
 +			fec->type = *ptr;
 +		else
 +			fec->flags = *ptr;
 +		return 0;
 +	}
 +
 +	if (len == 4) {
 +		/* Long option: <Flags:8, Length:24> */
 +		val = get_unaligned_be32(ptr);
 +		fec->flags = val >> 24;
 +		if (fec->flags & TCP_FEC_ENCODED) {
 +			fec->enc_seq = seq;
 +			fec->enc_len = val & 0xFFFFFF;
 +			return 0;
 +		}
 +		if (fec->flags & TCP_FEC_RECOVERY_FAILED) {
 +			fec->lost_seq = ack_seq;
 +			fec->lost_len = val & 0xFFFFFF;
 +			return 0;
 +		}
 +	}
 +
 +	/* Malformed option: leave no peer-controlled flags behind for
 +	 * callers which do not check the return value
 +	 */
 +	fec->flags = 0;
 +	fec->saw_fec = false;
 +	return -EINVAL;
 +}
 +
 +/* Encodes FEC parameters to wire format
 + * @tp - socket whose pending FEC flags and loss length are merged into @fec
 + *       (pending CWR and FAILED indications are consumed by this call)
 + * @fec - option content; flags and lost_len are updated as a side effect
 + * @ptr - Encoded option is written to this memory location (and the pointer
 + *        is advanced to the next unoccupied byte, 4-byte aligned)
 + * Returns the length of the encoded option (including alignment)
 + */
 +int tcp_fec_encode_option(struct tcp_sock *tp, struct tcp_fec *fec,
 +			__be32 **ptr)
 +{
 +	const u32 nop_pad = (TCPOPT_NOP << 16) | (TCPOPT_NOP << 8) |
 +			    TCPOPT_NOP;
 +	u32 body;
 +	int len;
 +
 +	fec->flags |= tp->fec.flags;
 +	fec->lost_len = tp->fec.lost_len;
 +	tp->fec.flags &= ~(TCP_FEC_RECOVERY_CWR | TCP_FEC_RECOVERY_FAILED);
 +
 +	/* Encode fixed option part (option kind, length, and magic bytes) */
 +	if (fec->flags & (TCP_FEC_ENCODED | TCP_FEC_RECOVERY_FAILED))
 +		len = 4 + TCPOLEN_EXP_FEC_BASE; /* Long option */
 +	else
 +		len = 1 + TCPOLEN_EXP_FEC_BASE; /* Short option */
 +
 +	**ptr = htonl((TCPOPT_EXP << 24) | (len << 16) | TCPOPT_FEC_MAGIC);
 +	(*ptr)++;
 +
 +	if ((fec->flags & TCP_FEC_ENCODED) &&
 +	    (fec->flags & TCP_FEC_RECOVERY_FAILED)) {
 +		/* TODO Special case: need to separate loss indication
 +		 * from encoding or make option 12 bytes long
 +		 * This can only happen if a node receives and sends FEC
 +		 * data
 +		 */
 +		fec->flags &= ~TCP_FEC_RECOVERY_FAILED;
 +	}
 +
 +	/* Variable option part. The flag/type byte is widened to u32 before
 +	 * shifting so that TCP_FEC_RECOVERY_CWR (0x80) is not shifted into
 +	 * the sign bit of an int, and lengths are masked to the 24 bits
 +	 * available on the wire so that an oversized value cannot overwrite
 +	 * the flag byte.
 +	 */
 +	if (fec->flags & TCP_FEC_ENCODED) {
 +		/* FEC-encoded packets carry:
 +		 * <Flags:8, Encoding length:24>
 +		 */
 +		body = ((u32)fec->flags << 24) | (fec->enc_len & 0xFFFFFF);
 +	} else if (fec->flags & TCP_FEC_RECOVERY_FAILED) {
 +		/* Packets with failed recovery indication carry:
 +		 * <Flags:8, Bytes after ACKed seq lost:24>
 +		 */
 +		body = ((u32)fec->flags << 24) | (fec->lost_len & 0xFFFFFF);
 +	} else if (fec->type) {
 +		/* Negotiation packets carry: <Encoding type:8> */
 +		body = ((u32)fec->type << 24) | nop_pad;
 +	} else {
 +		/* All other packets carry: <Flags:8> */
 +		body = ((u32)fec->flags << 24) | nop_pad;
 +	}
 +
 +	**ptr = htonl(body);
 +	(*ptr)++;
 +
 +	return 8;	/* both option forms fill two 32-bit words */
 +}
 +
 +/* Processes the current packet in the buffer, treated as an FEC packet
 + * (assumes that options were already processed into tp->rx_opt.fec)
 + * Returns 0 if the packet was handled, a negative value otherwise
 + */
 +int tcp_fec_process(struct sock *sk, struct sk_buff *skb)
 +{
 +	struct tcp_sock *tp = tcp_sk(sk);
 +	int recovery_status, err;
 +	u32 end_seq;
 +
 +	/* drop packet if packet is not encoded */
 +	if (!(tp->rx_opt.fec.flags & TCP_FEC_ENCODED))
 +		return -1;
 +
 +	/* check if all encoded packets were already received */
 +	end_seq = tp->rx_opt.fec.enc_seq + tp->rx_opt.fec.enc_len;
 +	if (!after(end_seq, tp->rcv_nxt)) {
 +		tcp_fec_send_ack(sk, skb, FEC_NO_LOSS);
 +		return 0;
 +	}
 +
 +	/* linearize the SKB (for easier payload access) */
 +	err = skb_linearize(skb);
 +	if (err)
 +		return err;
 +
 +	/* data recovery */
 +	switch (tp->fec.type) {
 +	case TCP_FEC_TYPE_XOR_ALL:
 +		recovery_status = tcp_fec_process_xor(sk, skb, 0);
 +		break;
 +	case TCP_FEC_TYPE_XOR_SKIP_1:
 +		recovery_status = tcp_fec_process_xor(sk, skb, 1);
 +		break;
 +	case TCP_FEC_TYPE_NONE:
 +	default:
 +		/* FEC off or unknown type: nothing to decode with,
 +		 * so do not report the packet as handled
 +		 */
 +		return -1;
 +	}
 +
 +	/* TODO error handling; -ENOMEM, etc. - disable FEC? */
 +	if (recovery_status < 0)
 +		return recovery_status;
 +
 +	/* Send an explicit ACK if recovery failed */
 +	if (recovery_status == FEC_LOSS_UNRECOVERED)
 +		tcp_fec_send_ack(sk, skb, recovery_status);
 +
 +	return 0;
 +}
 +
 +/* Evaluates the FEC flags received with the current ACK and acts upon
 + * them: acknowledged recovery indications, failed recoveries (tail
 + * losses) and successful recoveries.
 + * @ack_seq - ACKed sequence number of the incoming packet
 + * Returns: 1, if ACK contains a loss indicator (failed or successful
 + *	remote recovery); 0, otherwise
 + */
 +int tcp_fec_check_ack(struct sock *sk, u32 ack_seq)
 +{
 +	struct tcp_sock *tp = tcp_sk(sk);
 +	/* flags of the FEC option stored for the packet being processed */
 +	const u8 rx_flags = tp->rx_opt.fec.flags;
 +
 +	/* The other node ACKed our local recovery indication: stop
 +	 * announcing it and clear the ECN CWR demand as well
 +	 */
 +	if (rx_flags & TCP_FEC_RECOVERY_CWR) {
 +		tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
 +		tp->fec.flags &= ~TCP_FEC_RECOVERY_SUCCESSFUL;
 +	}
 +
 +	/* Tail loss indicator
 +	 * FEC was unable to recover the lost data on the remote side, which
 +	 * therefore only sent an ACK with the loss range back. Everything
 +	 * not ACKed/SACKed now, is considered lost now.
 +	 */
 +	if (rx_flags & TCP_FEC_RECOVERY_FAILED) {
 +		tcp_fec_mark_skbs_lost(sk);
 +		return 1;
 +	}
 +
 +	/* Neither a failed nor a successful recovery was reported */
 +	if (!(rx_flags & TCP_FEC_RECOVERY_SUCCESSFUL))
 +		return 0;
 +
 +	/* The remote endpoint successfully recovered data, which triggers
 +	 * a window reduction here - unless the window was already reduced
 +	 * for the current loss episode or the previous reduction was not
 +	 * signaled yet (no outgoing packets)
 +	 */
 +	if (!(tp->fec.flags & TCP_FEC_RECOVERY_CWR) &&
 +	    after(ack_seq, tp->high_seq)) {
 +		tcp_fec_reduce_window(sk);
 +		tp->fec.flags |= TCP_FEC_RECOVERY_CWR;
 +	}
 +
 +	return 1;
 +}
 +
 +/* Since data in the socket's receive queue can get consumed by other parties
 + * we need to clone these SKBs until they are no longer required for possible
 + * future recoveries. This function is called after the TCP header has been
 + * removed from the SKB already. All parameters required for recovery are
 + * stored in the SKB's control buffer.
 + * @skb - buffer which is moved to the receive queue
 + * Returns 0 on success, a negative error code otherwise
 + */
 +int tcp_fec_update_queue(struct sock *sk, struct sk_buff *skb)
 +{
 +	struct tcp_sock *tp = tcp_sk(sk);
 +	struct sk_buff *cskb;
 +	u32 data_len;
 +	int extra_bytes, err;
 +
 +	/* SKBs without payload are not queued (checked before cloning) */
 +	data_len = skb->len;
 +	if (!data_len)
 +		return 0;
 +
 +	/* clone the SKB and add it to the FEC receive queue
 +	 * (a simple extra reference to the SKB is not sufficient
 +	 * since SKBs can only be queued on one list at a time)
 +	 */
 +	cskb = skb_clone(skb, GFP_ATOMIC);
 +	if (cskb == NULL)
 +		return -ENOMEM;
 +
 +	/* linearize the clone; not queued yet, so free it on failure */
 +	err = skb_linearize(cskb);
 +	if (err) {
 +		kfree_skb(cskb);
 +		return err;
 +	}
 +
 +	skb_queue_tail(&tp->fec.rcv_queue, cskb);
 +	tp->fec.bytes_rcv_queue += data_len;
 +
 +	/* check if we can dereference old SKBs (as long as we have enough
 +	 * data for future recoveries)
 +	 */
 +	extra_bytes = tp->fec.bytes_rcv_queue - FEC_RCV_QUEUE_LIMIT;
 +	while (extra_bytes > 0) {
 +		cskb = skb_peek(&tp->fec.rcv_queue);
 +		if (cskb == NULL)
 +			return -EINVAL;
 +
 +		data_len = TCP_SKB_CB(cskb)->end_seq - TCP_SKB_CB(cskb)->seq;
 +		if (data_len > extra_bytes)
 +			break;
 +
 +		extra_bytes -= data_len;
 +		tp->fec.bytes_rcv_queue -= data_len;
 +		skb_unlink(cskb, &tp->fec.rcv_queue);
 +		kfree_skb(cskb);
 +	}
 +
 +	return 0;
 +}
 +
 +/* Switches FEC off for this connection, which includes dropping the
 + * references to the buffers kept in the FEC receive queue
 + */
 +void tcp_fec_disable(struct sock *sk)
 +{
 +	struct tcp_sock *tp = tcp_sk(sk);
 +
 +	/* nothing to tear down unless an FEC type is set */
 +	if (tcp_fec_is_enabled(tp)) {
 +		tp->fec.type = TCP_FEC_TYPE_NONE;
 +		tp->fec.bytes_rcv_queue = 0;
 +		skb_queue_purge(&tp->fec.rcv_queue);
 +	}
 +}
 +
 +/* Processes the current packet in the buffer, treated as an FEC packet
 + * with XOR-encoded payload (assumes that options were already processed)
 + * Returns: negative code, if an error occurred;
 + *	positive FEC_* code, otherwise (recovery status)
 + * @block_skip - Number of unencoded blocks between two encoded blocks
 + */
 +static int tcp_fec_process_xor(struct sock *sk, const struct sk_buff *skb,
 +			unsigned int block_skip)
 +{
 +	struct sk_buff *pskb;
 +	struct tcp_sock *tp;
 +	struct tcphdr *th;
 +	u32 next_seq, end_seq, rec_seq;
 +	unsigned char *data, *block;
 +	unsigned int i, offset, data_len, block_len, rec_len;
 +	bool seen_loss;
 +	int ret;
 +
 +	pskb = NULL;
 +	tp = tcp_sk(sk);
 +	th = tcp_hdr(skb);
 +	next_seq = tp->rx_opt.fec.enc_seq;
 +	end_seq = next_seq + tp->rx_opt.fec.enc_len;
 +	block_len = skb->len - tcp_hdrlen(skb);
 +	seen_loss = false;
 +	offset = 0;
 +
 +	/* one buffer: [0, block_len) XOR accumulator, rest scratch block */
 +	data = kmalloc(2 * block_len, GFP_ATOMIC);
 +	if (data == NULL)
 +		return -ENOMEM;
 +
 +	block = data + block_len;
 +
 +	/* copy FEC payload (skip TCP header) */
 +	memcpy(data, skb->data + tcp_hdrlen(skb), block_len);
 +
 +	/* process in-sequence data */
 +	while ((data_len = tcp_fec_get_next_block(sk, &pskb,
 +				&tp->fec.rcv_queue, next_seq,
 +				min(block_len, end_seq - next_seq),
 +				block))) {
 +		next_seq += data_len;
 +
 +		/* XOR with existing payload */
 +		for (i = 0; i < data_len; i++)
 +			data[i] ^= block[i];
 +
 +		/* we could not read a whole MSS block, which means we
 +		 * reached the end of the queue or end of range which the
 +		 * FEC packet covers
 +		 */
 +		if (data_len < block_len)
 +			break;
 +
 +		/* skip unencoded blocks if more is encoded (u32 diff != 0) */
 +		if (end_seq - next_seq > 0)
 +			next_seq += block_len * block_skip;
 +	}
 +
 +	/* check if all encoded bytes were already received */
 +	if (next_seq == end_seq) {
 +		kfree(data);
 +		return FEC_NO_LOSS;
 +	}
 +
 +	/* we always recover one whole MSS block (otherwise slicing
 +	 * would introduce a lot of additional complexity here) and
 +	 * cut out already received sequences later
 +	 */
 +	rec_seq = next_seq;
 +	rec_len = min(block_len, end_seq - rec_seq);
 +	offset  = data_len;
 +	if ((rec_seq + rec_len) == end_seq)
 +		goto recover;
 +
 +	next_seq += block_len * (block_skip + 1);
 +	pskb = NULL;
 +
 +	/* read a possibly partial (smaller than MSS) block to fill up the
 +	 * previously unfilled block and achieve alignment again
 +	 */
 +	data_len = tcp_fec_get_next_block(sk, &pskb, &tp->out_of_order_queue,
 +				next_seq, block_len - offset, block);
 +
 +	next_seq += data_len;
 +
 +	/* check if we could not read as much data as requested */
 +	if ((next_seq != end_seq) && (data_len < (block_len - offset)))
 +		goto clean;
 +
 +	/* XOR with existing payload */
 +	for (i = 0; i < data_len; i++)
 +		data[i+offset] ^= block[i];
 +
 +	/* skip unencoded blocks if more is encoded (u32 diff != 0) */
 +	if (end_seq - next_seq > 0)
 +		next_seq += block_len * block_skip;
 +
 +	/* read all necessary blocks to finish decoding */
 +	while ((data_len = tcp_fec_get_next_block(sk, &pskb,
 +				&tp->out_of_order_queue, next_seq,
 +				min(block_len, end_seq - next_seq),
 +				block))) {
 +		next_seq += data_len;
 +
 +		/* XOR with existing payload */
 +		for (i = 0; i < data_len; i++)
 +			data[i] ^= block[i];
 +
 +		/* we could not read a whole MSS block, which means we reached
 +		 * the end of the queue or end of range which the FEC packet
 +		 * covers
 +		 */
 +		if (data_len < block_len)
 +			break;
 +
 +		/* skip unencoded blocks if more is encoded (u32 diff != 0) */
 +		if (end_seq - next_seq > 0)
 +			next_seq += block_len * block_skip;
 +	}
 +
 +	/* check if additional losses were observed (cannot recover) */
 +	if (next_seq != end_seq)
 +		goto clean;
 +
 +recover:
 +	/* rotate the decoded bytes by offset into the scratch block */
 +	for (i = 0; i < rec_len; i++)
 +		block[i] = data[(offset + i) % block_len];
 +
 +	if (block_skip && ((block_len - offset) < rec_len)) {
 +		/* recover non-consecutive sequence ranges (only when
 +		 * slicing is used)
 +		 */
 +		u32 second_seq;
 +		unsigned int second_seq_len, first_seq_len;
 +
 +		first_seq_len = block_len - offset;
 +		second_seq = rec_seq + first_seq_len + block_len * block_skip;
 +		second_seq_len = rec_len - first_seq_len;
 +
 +		ret = tcp_fec_recover(sk, skb, block, rec_seq, first_seq_len);
 +		if (ret >= 0) {
 +			int second_ret = tcp_fec_recover(sk, skb,
 +						block + first_seq_len,
 +						second_seq, second_seq_len);
 +			if (second_ret < 0 || !ret)
 +				ret = second_ret;
 +		}
 +	} else {
 +		ret = tcp_fec_recover(sk, skb, block, rec_seq, rec_len);
 +	}
 +
 +	kfree(data);
 +	return ret ? ret : FEC_LOSS_RECOVERED;
 +
 +clean:
 +        kfree(data);
 +        return FEC_LOSS_UNRECOVERED;
 +}
 +
 +/* Create a recovered packet and forward it to the reception routine
 + * Returns: 0, if a packet was rebuilt and processed; FEC_NO_LOSS, if SACKs
 + *	already cover the range; -EINVAL, if the packet could not be built
 + */
 +static int tcp_fec_recover(struct sock *sk, const struct sk_buff *skb,
 +		unsigned char *data, u32 seq, int len)
 +{
 +	struct tcp_sock *tp = tcp_sk(sk);
 +	struct sk_buff *rskb;
 +
 +	/* Check if we received some tail of the recovered sequence already
 +	 * by looking at the current SACK blocks (we don't want to recover
 +	 * more data than necessary to prevent DSACKS)
 +	 */
 +	if (tcp_is_sack(tp)) {
 +		int i;
 +		for (i = 0; i < tp->rx_opt.num_sacks; i++) {
 +			if (before(tp->selective_acks[i].start_seq,
 +				   seq + len) &&
 +			   !before(tp->selective_acks[i].end_seq,
 +				   seq + len)) {
 +				len = tp->selective_acks[i].start_seq - seq;
 +				break;
 +			}
 +		}
 +	}
 +
 +	/* We might have prematurely asked for a recovery in the case where the
 +	 * whole recovery sequence is already covered by SACKs
 +	 */
 +	if (len <= 0)
 +		return FEC_NO_LOSS;
 +
 +	/* Create decoded packet and forward to reception routine */
 +	rskb = tcp_fec_make_decoded_pkt(sk, skb, data, seq, len);
 +	if (rskb == NULL)
 +		return -EINVAL;
 +
 +	/* Tell the remote node only now that data was actually rebuilt */
 +	tp->fec.flags |= TCP_FEC_RECOVERY_SUCCESSFUL;
 +	tcp_rcv_established(sk, rskb, tcp_hdr(rskb), rskb->len);
 +	return 0;
 +}
 +
 +/* Sends an explicit ACK in response to an FEC packet, carrying the failed
 + * recovery indication and the length of the range considered lost
 + */
 +static void tcp_fec_send_ack(struct sock *sk, const struct sk_buff *skb,
 +				int recovery_status)
 +{
 +	struct tcp_sock *tp = tcp_sk(sk);
 +	u32 covered_end;
 +
 +	/* Only a failed recovery needs an outgoing ACK right now; in
 +	 * all other cases ACKs are implicitly generated
 +	 */
 +	if (recovery_status != FEC_LOSS_UNRECOVERED)
 +		return;
 +
 +	/* Report everything between rcv_nxt and the end of the range
 +	 * covered by the FEC packet as lost
 +	 */
 +	covered_end = tp->rx_opt.fec.enc_seq + tp->rx_opt.fec.enc_len;
 +	tp->fec.flags |= TCP_FEC_RECOVERY_FAILED;
 +	tp->fec.lost_len = covered_end - tp->rcv_nxt;
 +	tcp_send_ack(sk);
 +}
 +
 +/* Reduces the congestion window (similar to completed fast recovery)
 + * If the node is already in recovery mode, only undo is disabled to
 + * enforce the window reduction upon completion
 + */
 +static void tcp_fec_reduce_window(struct sock *sk)
 +{
 +	const struct inet_connection_sock *icsk = inet_csk(sk);
 +	struct tcp_sock *tp = tcp_sk(sk);
 +
 +	/* CA state below TCP_CA_CWR: apply the reduction right away
 +	 * using the ssthresh of the congestion control in use
 +	 */
 +	if (icsk->icsk_ca_state < TCP_CA_CWR) {
 +		tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
 +		if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) {
 +			tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
 +			tp->snd_cwnd_stamp = tcp_time_stamp;
 +		}
 +
 +		/* Any future window reduction requests are ignored until
 +		 * snd_nxt is ACKed
 +		 */
 +		tp->high_seq = tp->snd_nxt;
 +	}
 +
 +	/* Undo is switched off in both cases: either the reduction was
 +	 * just made, or the socket is in some congestion mode already
 +	 * and we only need to make sure that window reduction is
 +	 * executed when recovery is finished
 +	 */
 +	tp->undo_marker = 0;
 +}
 +
 +/* The incoming ACK indicates a failed recovery.
 + * Marks every not yet lost/SACKed SKB that lies completely inside the
 + * loss range announced by the peer (rx_opt.fec.lost_seq/lost_len) as lost
 + * and keeps the retransmit hint, retransmit_high and lost_out in sync.
 + * The walk starts at lost_skb_hint if set, otherwise at the head of the
 + * write queue, and stops at the send head or past the loss range.
 + * TODO With interleaved coding, we have the additional constraint
 + * that the SKBs in the loss range also must have been encoded by the
 + * triggering FEC packet, and for that we need to keep some info
 + * about FEC packets on the sender side
 + */
 +static void tcp_fec_mark_skbs_lost(struct sock *sk)
 +{
 +	struct tcp_sock *tp = tcp_sk(sk);
 +	const u32 lost_start = tp->rx_opt.fec.lost_seq;
 +	const u32 lost_end = lost_start + tp->rx_opt.fec.lost_len;
 +	struct sk_buff *skb;
 +
 +	skb = tp->lost_skb_hint;
 +	if (!skb)
 +		skb = tcp_write_queue_head(sk);
 +
 +	tcp_for_write_queue_from(skb, sk) {
 +		struct tcp_skb_cb *scb;
 +
 +		/* Nothing beyond the send head has been transmitted */
 +		if (skb == tcp_send_head(sk))
 +			break;
 +
 +		scb = TCP_SKB_CB(skb);
 +
 +		/* Past loss range */
 +		if (!before(scb->seq, lost_end))
 +			break;
 +
 +		/* SKB not (fully) within range */
 +		if (before(scb->seq, lost_start) ||
 +		    after(scb->end_seq, lost_end))
 +			continue;
 +
 +		/* SKB already marked lost or SACKed */
 +		if (scb->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))
 +			continue;
 +
 +		/* Verify retransmit hint before marking
 +		 * (see tcp_verify_retransmit_hint(),
 +		 * copied since method defined static in tcp_input.c)
 +		 */
 +		if (!tp->retransmit_skb_hint ||
 +		    before(scb->seq,
 +			   TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
 +			tp->retransmit_skb_hint = skb;
 +
 +		if (!tp->lost_out ||
 +		    after(scb->end_seq, tp->retransmit_high))
 +			tp->retransmit_high = scb->end_seq;
 +
 +		/* Mark SKB as lost (see tcp_skb_mark_lost()) */
 +		tp->lost_out += tcp_skb_pcount(skb);
 +		scb->sacked |= TCPCB_LOST;
 +	}
 +
 +	tcp_verify_left_out(tp);
 +}
 +
 +/* Searches for the FEC option in the packet header and replaces
 + * the long option with a short one padded by NOPs.
 + * This is done to convert the option used by an encoded packet
 + * to the option used by a recovered packet.
 + *
 + * Returns true if the FEC option was found and rewritten, false if the
 + * option list ends, is malformed, or holds no FEC option whose rewritten
 + * 8 bytes fit inside the TCP header.
 + */
 +static bool tcp_fec_update_decoded_option(struct sk_buff *skb)
 +{
 +	struct tcphdr *th = tcp_hdr(skb);
 +	unsigned char *ptr = (unsigned char *) (th + 1);
 +	int length = (th->doff * 4) - sizeof(struct tcphdr);
 +
 +	while (length > 0) {
 +		int opcode = *ptr++;
 +		int opsize;
 +
 +		switch (opcode) {
 +		case TCPOPT_EOL:
 +			return false;
 +		case TCPOPT_NOP:
 +			length--;
 +			continue;
 +		default:
 +			/* The length byte itself must still be in the header */
 +			if (length < 2)
 +				return false;
 +
 +			opsize = *ptr++;
 +			if (opsize < 2 || opsize > length)
 +				return false;
 +
 +			/* Only look at the magic if the option is long enough
 +			 * to carry it (kind, length, 16 bit magic)
 +			 */
 +			if (opcode == TCPOPT_EXP && opsize >= 4 &&
 +			    get_unaligned_be16(ptr) == TCPOPT_FEC_MAGIC) {
 +				/* Update FEC option:
 +				 * 1. Convert long option into short option
 +				 * 2. Clear ENCODED flag (keep other flags)
 +				 * 3. Replace option value (long option) by NOPs
 +				 */
 +				unsigned char *opt = ptr - 2;
 +				u32 word;
 +
 +				/* Two 32 bit words starting at the option
 +				 * kind are rewritten; they must not reach
 +				 * beyond the end of the header
 +				 */
 +				if (length < 8)
 +					return false;
 +
 +				/* Options are not guaranteed to be 4 byte
 +				 * aligned, so use unaligned accessors for the
 +				 * stores as well as for the loads
 +				 */
 +				word = get_unaligned_be32(opt);
 +				put_unaligned_be32((word & 0xFF00FFFF) |
 +						   0x00050000, opt);
 +				word = get_unaligned_be32(opt + 4);
 +				put_unaligned_be32((word & 0xEF000000) |
 +						   0x00010101, opt + 4);
 +
 +				return true;
 +			}
 +
 +			ptr += opsize - 2;
 +			length -= opsize;
 +		}
 +	}
 +
 +	return false;
 +}
 +
 +/* Allocates an SKB for data we want to forward to reception routines
 + * (recovered data) by making a copy of the FEC SKB and replacing the data
 + * part, all other segments (options, etc.) are preserved
 + * @skb - FEC packet used as template for the new packet
 + * @dec_data - Recovered payload bytes
 + * @seq - Sequence number of the first recovered byte
 + * @len - Number of recovered bytes
 + *
 + * Returns the new SKB, or NULL if the copy could not be allocated, the
 + * FEC option could not be rewritten, or the recovered data does not fit
 + * into the payload area of the copy. The copy is freed on every failure.
 + */
 +static struct sk_buff *tcp_fec_make_decoded_pkt(struct sock *sk,
 +				const struct sk_buff *skb,
 +				unsigned char *dec_data,
 +				u32 seq, unsigned int len)
 +{
 +	struct tcp_sock *tp = tcp_sk(sk);
 +	struct sk_buff *nskb;
 +
 +	nskb = skb_copy(skb, GFP_ATOMIC);
 +	if (nskb == NULL)
 +		return NULL;
 +
 +	/* Update FEC option for the new packet */
 +	if (!tcp_fec_update_decoded_option(nskb)) {
 +		/* TODO Do we need this catch? Technically we don't reach this
 +		 * method if there is no FEC option in the header.
 +		 */
 +		goto drop;
 +	}
 +
 +	/* check if we received some tail of the recovered sequence already
 +	 * by looking at the current SACK blocks (we don't want to recover
 +	 * more data than necessary to prevent DSACKS)
 +	 */
 +	if (tcp_is_sack(tp)) {
 +		int i;
 +		for (i = 0; i < tp->rx_opt.num_sacks; i++) {
 +			if (before(tp->selective_acks[i].start_seq,
 +				   seq + len) &&
 +			    !before(tp->selective_acks[i].end_seq,
 +				    seq + len)) {
 +				len = tp->selective_acks[i].start_seq - seq;
 +				break;
 +			}
 +		}
 +	}
 +
 +	/* The payload is overwritten in place below, so the recovered data
 +	 * must fit into the payload area of the copied packet (this also
 +	 * rejects a length that wrapped around in the SACK adjustment)
 +	 */
 +	if (nskb->len < tcp_hdrlen(nskb) ||
 +	    len > nskb->len - tcp_hdrlen(nskb))
 +		goto drop;
 +
 +	/* trim data section to fit recovered sequence if necessary */
 +	if (len < (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq))
 +		skb_trim(nskb, len + tcp_hdrlen(nskb));
 +
 +	/* fix the sequence numbers */
 +	tcp_hdr(nskb)->seq = htonl(seq);
 +	tcp_hdr(nskb)->ack_seq = htonl(tp->snd_una);
 +	TCP_SKB_CB(nskb)->seq = seq;
 +	TCP_SKB_CB(nskb)->end_seq = seq + len;
 +
 +	/* replace SKB payload with recovered data */
 +	memcpy(nskb->data + tcp_hdrlen(nskb), dec_data, len);
 +
 +	/* packets used for recovery had their checksums checked already */
 +	nskb->ip_summed = CHECKSUM_UNNECESSARY;
 +
 +	return nskb;
 +
 +drop:
 +	/* The copy is owned by this function until it is returned */
 +	kfree_skb(nskb);
 +	return NULL;
 +}
 +
 +/* Gets the next byte block from an SKB queue (any SKB which is touched
 + * in this procedure will be linearized to simplify payload access)
 + * @skb - Points to SKB from which previous block was extracted (useful
 + *	  for successive calls to this function, which avoids moving through
 + *	  the whole queue again); if it points to NULL the search starts at
 + *	  the head of the queue
 + * @queue - SKB queue to read from (SKB has to point to an element on this
 + *	  queue)
 + * @seq - Sequence number of first byte in the block
 + * @block_len - Maximum number of bytes to copy
 + * @block - Destination buffer (at least block_len bytes)
 + *
 + * Returns the bytes written to the block memory. 0 is returned if the
 + * queue is empty, an RST/SYN SKB is encountered while searching, or no
 + * SKB contains @seq. If an SKB cannot be linearized, copying stops and
 + * the number of bytes written so far is returned.
 + */
 +static unsigned int tcp_fec_get_next_block(struct sock *sk,
 +				struct sk_buff **skb,
 +				struct sk_buff_head *queue, u32 seq,
 +				unsigned int block_len, unsigned char *block)
 +{
 +	unsigned int cur_len, offset, num_bytes;
 +	u32 end_seq;
 +
 +	cur_len = 0;
 +
 +	/* Get first SKB of the write queue and specify next sequence to
 +	 * encode
 +	 */
 +	if (*skb == NULL) {
 +		*skb = skb_peek(queue);
 +		if (*skb == NULL)
 +			return 0;
 +	}
 +
 +	/* move to SKB which stores the next sequence to encode */
 +	while (*skb) {
 +		/* If we observe an RST/SYN, we stop here to avoid
 +		 * handling corner cases
 +		 */
 +		if (TCP_SKB_CB(*skb)->tcp_flags &
 +					(TCPHDR_RST |
 +					 TCPHDR_SYN))
 +			return 0;
 +		if (!before(seq, TCP_SKB_CB(*skb)->seq) &&
 +					before(seq, TCP_SKB_CB(*skb)->end_seq))
 +			break;
 +		if (*skb == skb_peek_tail(queue)) {
 +			*skb = NULL;
 +			break;
 +		}
 +
 +		*skb = skb_queue_next(queue, *skb);
 +	}
 +
 +	if (*skb == NULL)
 +		return 0;
 +
 +	/* copy bytes from SKBs (connected sequences) */
 +	while (*skb && (cur_len < block_len)) {
 +		/* The return value is a byte count, so a negative error code
 +		 * must not be returned here; report what was copied so far
 +		 */
 +		if (skb_linearize(*skb))
 +			return cur_len;
 +
 +		/* Deal with the end seq number being incremented by
 +		 * one if the FIN flag is set (we don't want to encode this)
 +		 */
 +		end_seq = TCP_SKB_CB(*skb)->end_seq;
 +		if (TCP_SKB_CB(*skb)->tcp_flags & TCPHDR_FIN)
 +			end_seq--;
 +
 +		/* Sequence numbers wrap, so compare with before() */
 +		if (!before(seq, TCP_SKB_CB(*skb)->seq) &&
 +		    before(seq, end_seq)) {
 +			/* Copy data depending on:
 +			 * - remaining space in the block
 +			 * - remaining data in the SKB
 +			 */
 +			offset = seq - TCP_SKB_CB(*skb)->seq;
 +			num_bytes = min(block_len - cur_len,
 +					end_seq - seq);
 +
 +			memcpy(block + cur_len, (*skb)->data + offset,
 +			       num_bytes);
 +			cur_len += num_bytes;
 +			seq += num_bytes;
 +		}
 +
 +		if (*skb == skb_peek_tail(queue) || cur_len >= block_len)
 +			break;
 +
 +		*skb = skb_queue_next(queue, *skb);
 +	}
 +
 +	return cur_len;
 +}
 +
 +/* Arms the timer for a delayed FEC transmission if there is
 + * no earlier timeout defined (i.e. retransmission timeout).
 + * Does nothing unless the connection is established and there is sent
 + * data which has not been encoded yet.
 + */
 +void tcp_fec_arm_timer(struct sock *sk)
 +{
 +	struct inet_connection_sock *icsk = inet_csk(sk);
 +	struct tcp_sock *tp = tcp_sk(sk);
 +	u32 timeout, rtt;
 +
 +	/* Only arm a timer if connection is established */
 +	if (sk->sk_state != TCP_ESTABLISHED)
 +		return;
 +
 +	/* Forward next sequence to be encoded if unencoded data was acked */
 +	if (after(tp->snd_una, tp->fec.next_seq))
 +		tp->fec.next_seq = tp->snd_una;
 +
 +	/* Don't arm the timer if there is no unencoded data left */
 +	if (!before(tp->fec.next_seq, tp->snd_nxt))
 +		return;
 +
 +	/* TODO handle other timers which might be armed;
 +	 * EARLY_RETRANS? LOSS_PROBE?
 +	 */
 +
 +	/* Compute timeout (currently 0.25 * RTT).
 +	 * srtt_us holds the smoothed RTT in microseconds, scaled by 8,
 +	 * while the timer is programmed in jiffies, so convert first.
 +	 */
 +	rtt = usecs_to_jiffies(tp->srtt_us >> 3);
 +	timeout = rtt >> 2;
 +
 +	/* TODO Subtract the delay between the transmission of the original
 +	 * packet and this call from the timeout (and send immediately via
 +	 * tcp_fec_invoke_nodelay() if the delay exceeds the timeout); the
 +	 * delay is not measured yet and is therefore treated as zero.
 +	 */
 +
 +	/* Do not replace a timeout occurring earlier
 +	 * (wrap-safe jiffies comparison)
 +	 */
 +	if (time_after_eq(jiffies + timeout, icsk->icsk_timeout))
 +		return;
 +
 +	inet_csk_reset_xmit_timer(sk, ICSK_TIME_FEC, timeout, TCP_RTO_MAX);
 +}
 +
 +/* The FEC timer fired. Force an FEC transmission for the
 + * last unencoded burst. Rearm the RTO timer (which was switched
 + * out when setting the FEC timer). Set a new FEC timer if there
 + * is pending unencoded data.
 + */
 +void tcp_fec_timer(struct sock *sk)
 +{
 +	struct inet_connection_sock *icsk = inet_csk(sk);
 +
 +	tcp_fec_invoke_nodelay(sk);
 +
 +	/* Mark no timer as pending before rearming the RTO timer */
 +	icsk->icsk_pending = 0;
 +	tcp_rearm_rto(sk);
 +
 +	tcp_fec_arm_timer(sk);
 +}
 +
 +/* If FEC packet transmissions are delayed (TCP_FEC_DELAYED_SEND is
 + * defined) set a timer (if not already set), otherwise invoke the FEC
 + * mechanism immediately.
 + *
 + * Returns the result of tcp_fec_invoke_nodelay() in the immediate case,
 + * 0 in the delayed case.
 + */
 +int tcp_fec_invoke(struct sock *sk)
 +{
 +#ifndef TCP_FEC_DELAYED_SEND
 +	return tcp_fec_invoke_nodelay(sk);
 +#else
 +	/* Set the timer for sending an FEC packet if no FEC
 +	 * timer is active yet
 +	 */
 +	if (inet_csk(sk)->icsk_pending != ICSK_TIME_FEC)
 +		tcp_fec_arm_timer(sk);
 +
 +	return 0;
 +#endif
 +}
 +
 +/* Invokes the FEC mechanism set for the connection;
 + * Creates and sends out FEC packets.
 + * All created packets and their FEC structs are released before
 + * returning, whether or not creation and transmission succeeded.
 + *
 + * Returns 0 on success, otherwise the error reported by
 + * tcp_fec_create() or tcp_fec_xmit_all().
 + */
 +int tcp_fec_invoke_nodelay(struct sock *sk)
 +{
 +	/* The queue head only lives for the duration of this call, so it
 +	 * is kept on the stack instead of being allocated for every call
 +	 */
 +	struct sk_buff_head list;
 +	struct sk_buff *skb;
 +	int err;
 +
 +	skb_queue_head_init(&list);
 +
 +	err = tcp_fec_create(sk, &list);
 +	if (!err)
 +		err = tcp_fec_xmit_all(sk, &list);
 +
 +	/* Purge all SKBs (purge FEC structs first) */
 +	skb_queue_walk(&list, skb) {
 +		kfree(TCP_SKB_CB(skb)->fec);
 +		TCP_SKB_CB(skb)->fec = NULL;
 +	}
 +
 +	skb_queue_purge(&list);
 +
 +	/* TODO error handling; -ENOMEM, etc. - disable FEC? */
 +
 +	return err;
 +}
 +
 +/* Creates one or more FEC packets (can depend on the FEC type used)
 + * and puts them in a queue
 + * @list: queue head
 + *
 + * Returns 0 if nothing had to be done or all packets were created,
 + * otherwise the error reported by tcp_fec_create_xor().
 + */
 +static int tcp_fec_create(struct sock *sk, struct sk_buff_head *list)
 +{
 +	struct tcp_sock *tp = tcp_sk(sk);
 +	unsigned int start, mss, limit;
 +	int ret = 0;
 +
 +	/* Update the pointer to the first byte to be encoded next
 +	 * (this only matters when a packet was ACKed before it was
 +	 * encoded)
 +	 */
 +	if (after(tp->snd_una, tp->fec.next_seq))
 +		tp->fec.next_seq = tp->snd_una;
 +
 +	start = tp->fec.next_seq;
 +	mss = tcp_current_mss(sk);
 +	limit = FEC_RCV_QUEUE_LIMIT - mss;
 +
 +	if (tp->fec.type == TCP_FEC_TYPE_XOR_ALL) {
 +		/* One XOR stream over consecutive blocks */
 +		ret = tcp_fec_create_xor(sk, list, start, mss, 0, limit);
 +	} else if (tp->fec.type == TCP_FEC_TYPE_XOR_SKIP_1) {
 +		/* Two XOR streams, each skipping one block between encoded
 +		 * blocks; the second starts one block after the first
 +		 */
 +		ret = tcp_fec_create_xor(sk, list, start, mss, 1, limit);
 +		if (!ret)
 +			ret = tcp_fec_create_xor(sk, list, start + mss,
 +						 mss, 1, limit);
 +	}
 +
 +	/* TCP_FEC_TYPE_NONE and unknown types create nothing */
 +	return ret;
 +}
 +
 +/* Creates FEC packet(s) using XOR encoding
 + * (allocates memory for the FEC structs)
 + * @list - Queue to which created FEC packets are appended
 + * @first_seq - Sequence number of first byte to be encoded
 + * @block_len - Block length (typically MSS)
 + * @block_skip - Number of unencoded blocks between two encoded blocks
 + * @max_encoded_per_pkt - maximum number of blocks encoded per packet
 + *	(0, if unlimited)
 + *
 + * Returns 0 on success, -ENOMEM if a buffer or FEC struct could not be
 + * allocated, -EINVAL if an FEC packet could not be created. On success
 + * the next sequence number to encode is stored in the socket.
 + */
 +static int tcp_fec_create_xor(struct sock *sk, struct sk_buff_head *list,
 +				unsigned int first_seq, unsigned int block_len,
 +				unsigned int block_skip,
 +				unsigned int max_encoded_per_pkt)
 +{
 +	struct tcp_sock *tp = tcp_sk(sk);
 +	struct sk_buff *skb = NULL;
 +	struct sk_buff *fskb;
 +	struct tcp_fec *fec;
 +	unsigned int c_encoded = 0;	/* Number of currently encoded blocks
 +					   not yet added to an FEC packet */
 +	unsigned int next_seq = first_seq;	/* Next byte to encode */
 +	unsigned int data_len, i;
 +	unsigned char *data, *block;
 +	int err;
 +
 +	/* memory allocation
 +	 * data - used temporarily to obtain byte blocks and store the payload
 +	 *	  (is freed before returning; we need two blocks here to store
 +	 *	  the previously XORed data that has not been added to an FEC
 +	 *	  packet yet, and the new to-be XORed data extracted from one
 +	 *	  or more existing buffers)
 +	 * fec	- used to store the FEC parameters
 +	 *	  (is freed after the corresponding packet is forwarded to the
 +	 *	  transmission routine)
 +	 */
 +	data = kzalloc(2 * block_len, GFP_ATOMIC);
 +	if (data == NULL)
 +		return -ENOMEM;
 +
 +	fec = kzalloc(sizeof(*fec), GFP_ATOMIC);
 +	if (fec == NULL) {
 +		err = -ENOMEM;
 +		goto out_free_data;
 +	}
 +
 +	block = data + block_len;
 +
 +	/* encode data blocks
 +	 * XXX atomicity check?
 +	 */
 +	fec->enc_seq = next_seq;
 +	for (;;) {
 +		data_len = tcp_fec_get_next_block(sk, &skb,
 +				&sk->sk_write_queue, next_seq,
 +				min(block_len, tp->snd_nxt - next_seq),
 +				block);
 +
 +		/* Stop when no more data is available. A length beyond the
 +		 * block size cannot be valid and would overrun both buffers
 +		 * in the XOR loop below, so it ends the encoding as well.
 +		 */
 +		if (data_len == 0 || data_len > block_len)
 +			break;
 +
 +		/* Check if we reached the encoding limit; then create packet
 +		 * with current payload and add it to the queue
 +		 */
 +		if (max_encoded_per_pkt > 0 &&
 +		    c_encoded >= max_encoded_per_pkt) {
 +			fskb = tcp_fec_make_encoded_pkt(sk, fec, data,
 +						block_len);
 +			if (fskb == NULL) {
 +				err = -EINVAL;
 +				goto out_free_fec;
 +			}
 +
 +			/* fec is now referenced by the queued packet */
 +			skb_queue_tail(list, fskb);
 +			memset(data, 0, block_len);
 +			c_encoded = 0;
 +
 +			/* memory allocation for the FEC struct of the next
 +			 * packet
 +			 */
 +			fec = kzalloc(sizeof(*fec), GFP_ATOMIC);
 +			if (fec == NULL) {
 +				err = -ENOMEM;
 +				goto out_free_data;
 +			}
 +
 +			fec->enc_seq = next_seq;
 +		}
 +
 +		next_seq += data_len;
 +		fec->enc_len = next_seq - fec->enc_seq;
 +
 +		/* encode block into existing payload (XOR) */
 +		for (i = 0; i < data_len; i++)
 +			data[i] ^= block[i];
 +
 +		c_encoded++;
 +
 +		/* skip over blocks which are not requested for encoding */
 +		next_seq += block_len * block_skip;
 +	}
 +
 +	/* create final packet if some data was selected for encoding */
 +	if (c_encoded > 0) {
 +		fskb = tcp_fec_make_encoded_pkt(sk, fec, data, block_len);
 +		if (fskb == NULL) {
 +			err = -EINVAL;
 +			goto out_free_fec;
 +		}
 +
 +		skb_queue_tail(list, fskb);
 +		/* referenced by the queued packet, must not be freed here */
 +		fec = NULL;
 +	}
 +
 +	tp->fec.next_seq = next_seq;
 +	err = 0;
 +
 +out_free_fec:
 +	kfree(fec);
 +out_free_data:
 +	kfree(data);
 +
 +	return err;
 +}
 +
 +/* Builds the SKB for one FEC packet: allocates it, attaches the FEC
 + * struct via the control buffer and copies the encoded payload.
 + * @fec - FEC parameters for this packet (the ENCODED flag is set here)
 + * @enc_data - Encoded payload
 + * @len - Payload length; capped at the encoding length stored in @fec
 + *
 + * Returns the new SKB or NULL if the allocation failed.
 + */
 +static struct sk_buff *tcp_fec_make_encoded_pkt(struct sock *sk,
 +				struct tcp_fec *fec,
 +				unsigned char *enc_data,
 +				unsigned int len)
 +{
 +	struct skb_shared_info *shinfo;
 +	struct tcp_skb_cb *cb;
 +	struct sk_buff *pkt;
 +
 +	/* Never send more payload than was actually encoded */
 +	if (fec->enc_len < len)
 +		len = fec->enc_len;
 +
 +	/* See tcp_make_synack(); 15 probably for tail pointer etc.? */
 +	pkt = alloc_skb(MAX_TCP_HEADER + 15 + len, GFP_ATOMIC);
 +	if (!pkt)
 +		return NULL;
 +
 +	/* Reserve space for headers */
 +	skb_reserve(pkt, MAX_TCP_HEADER);
 +
 +	/* Specify sequence number and FEC struct address in control buffer;
 +	 * the ACK flag is required for all data packets
 +	 */
 +	fec->flags |= TCP_FEC_ENCODED;
 +	cb = TCP_SKB_CB(pkt);
 +	cb->seq = fec->enc_seq;
 +	cb->fec = fec;
 +	cb->tcp_flags = TCPHDR_ACK;
 +
 +	/* Set GSO parameters */
 +	shinfo = skb_shinfo(pkt);
 +	shinfo->gso_segs = 1;
 +	shinfo->gso_size = 0;
 +	shinfo->gso_type = 0;
 +
 +	/* Append payload to SKB */
 +	memcpy(skb_put(pkt, len), enc_data, len);
 +
 +	pkt->ip_summed = CHECKSUM_PARTIAL;
 +
 +	return pkt;
 +}
 +
 +/* Transmits every FEC packet on @list in queue order.
 + * Returns 0 if the list is NULL or empty or all packets were handed to
 + * tcp_fec_xmit() successfully; stops at and returns the first error.
 + */
 +static int tcp_fec_xmit_all(struct sock *sk, struct sk_buff_head *list)
 +{
 +	struct sk_buff *fskb;
 +	int ret;
 +
 +	if (!list || skb_queue_empty(list))
 +		return 0;
 +
 +	skb_queue_walk(list, fskb) {
 +		ret = tcp_fec_xmit(sk, fskb);
 +		if (ret)
 +			return ret;
 +	}
 +
 +	return 0;
 +}
 +
 +/* Hands a single FEC packet to tcp_transmit_skb() with cloning requested
 + * and an atomic allocation mask; returns that function's result.
 + */
 +static int tcp_fec_xmit(struct sock *sk, struct sk_buff *skb)
 +{
 +	const int clone_it = 1;
 +
 +	/* TODO timers? no retransmissions, but want to deactivate FEC
 +	 * if we never get any FEC ACKs back
 +	 */
 +	return tcp_transmit_skb(sk, skb, clone_it, GFP_ATOMIC);
 +}
 diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
 index f83ddf9..b640461 100644
 --- a/net/ipv4/tcp_input.c
 +++ b/net/ipv4/tcp_input.c
 @@ -70,6 +70,7 @@
 #include <linux/kernel.h>
 #include <net/dst.h>
 #include <net/tcp.h>
 +#include <net/tcp_fec.h>
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
 @@ -106,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 #define FLAG_SYN_ACKED		0x10 /* This ACK acknowledged SYN.		*/
 #define FLAG_DATA_SACKED	0x20 /* New SACK.				*/
 #define FLAG_ECE		0x40 /* ECE in this ACK				*/
 +#define FLAG_FEC_CWR_REQUESTED	0x80 /* cwnd reduction requested */
 #define FLAG_SLOWPATH		0x100 /* Do not skip RFC checks for window update.*/
 #define FLAG_ORIG_SACK_ACKED	0x200 /* Never retransmitted data are (s)acked	*/
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 @@ -115,8 +117,9 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
 
 #define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
 #define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
 -#define FLAG_CA_ALERT		(FLAG_DATA_SACKED|FLAG_ECE)
 +#define FLAG_CA_ALERT	(FLAG_DATA_SACKED|FLAG_ECE|FLAG_FEC_CWR_REQUESTED)
 #define FLAG_FORWARD_PROGRESS	(FLAG_ACKED|FLAG_DATA_SACKED)
 +#define FLAG_CONGESTION		(FLAG_ECE|FLAG_FEC_CWR_REQUESTED)
 
 #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
 #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
 @@ -2546,7 +2549,11 @@ void tcp_enter_cwr(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	tp->prior_ssthresh = 0;
 -	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
 +	
 +	/*
 +	 *      !!! TCP FEC patch !!!
 +	 */
 +	if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR && after(tp->snd_una, tp->high_seq)) {
 		tp->undo_marker = 0;
 		tcp_init_cwnd_reduction(sk);
 		tcp_set_ca_state(sk, TCP_CA_CWR);
 @@ -2968,6 +2975,12 @@ void tcp_rearm_rto(struct sock *sk)
 	if (tp->fastopen_rsk)
 		return;
 
 +	/* Don't rearm the timer if an FEC timer is active.
 +	 * The FEC handler will rearm the timer once the event is handled.
 +	 */
 +	if (icsk->icsk_pending == ICSK_TIME_FEC)
 +		return;
 +
 	if (!tp->packets_out) {
 		inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS);
 	} else {
 @@ -3228,16 +3241,23 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
 /* Decide wheather to run the increase function of congestion control. */
 static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
 {
 +	const struct tcp_sock *tp = tcp_sk(sk);
 	if (tcp_in_cwnd_reduction(sk))
 		return false;
 
 +	/*
 +	 *      !!! TCP FEC patch !!!
 +	 */
 +	if ((flag & FLAG_CONGESTION) && !(tp->snd_cwnd < tp->snd_ssthresh))
 +		return false;
 +
 	/* If reordering is high then always grow cwnd whenever data is
 	 * delivered regardless of its ordering. Otherwise stay conservative
 	 * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
 	 * new SACK or ECE mark may first advance cwnd here and later reduce
 	 * cwnd in tcp_fastretrans_alert() based on more states.
 	 */
 -	if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
 +	if (tp->reordering > sysctl_tcp_reordering)
 		return flag & FLAG_FORWARD_PROGRESS;
 
 	return flag & FLAG_DATA_ACKED;
 @@ -3425,6 +3445,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
 		icsk->icsk_retransmits = 0;
 	}
 
 +	/* Check if FEC expects and executes a window reduction */
 +	if (tcp_fec_is_enabled(tp) && tcp_fec_check_ack(sk, ack))
 +		flag |= FLAG_FEC_CWR_REQUESTED;
 +
 	prior_fackets = tp->fackets_out;
 
 	/* ts_recent update must be made after we are sure that the packet
 @@ -3656,6 +3680,20 @@ void tcp_parse_options(const struct sk_buff *skb,
 				break;
 
 			case TCPOPT_EXP:
 +				/*
 +				 *      !!! TCP FEC patch !!!
 +				 */
 +				if (sysctl_tcp_fec &&
 +				    get_unaligned_be16(ptr) ==
 +				    TCPOPT_FEC_MAGIC) {
 +					tcp_fec_decode_option(&(opt_rx->fec),
 +						ntohl(th->seq),
 +						ntohl(th->ack_seq), th->syn,
 +						ptr + 2,
 +						opsize - TCPOLEN_EXP_FEC_BASE);
 +					break;
 +				}
 +
 				/* Fast Open option shares code 254 using a
 				 * 16 bits magic number.
 				 */
 @@ -4173,6 +4211,12 @@ static void tcp_ofo_queue(struct sock *sk)
 			   tp->rcv_nxt, TCP_SKB_CB(skb)->seq,
 			   TCP_SKB_CB(skb)->end_seq);
 
 +		/*
 +		 *      !!! TCP FEC patch !!!
 +		 */
 +		if (tcp_fec_is_enabled(tp))
 +			tcp_fec_update_queue(sk, skb);
 +
 		tail = skb_peek_tail(&sk->sk_receive_queue);
 		eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen);
 		tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
 @@ -4410,6 +4454,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 			goto out_of_window;
 
 		/* Ok. In sequence. In window. */
 +		if (tcp_fec_is_enabled(tp))
 +			tcp_fec_update_queue(sk, skb);
 +
 		if (tp->ucopy.task == current &&
 		    tp->copied_seq == tp->rcv_nxt && tp->ucopy.len &&
 		    sock_owned_by_user(sk) && !tp->urg_data) {
 @@ -4715,6 +4762,12 @@ static int tcp_prune_queue(struct sock *sk)
 			     tp->copied_seq, tp->rcv_nxt);
 	sk_mem_reclaim(sk);
 
 +	/* Disable FEC if it was enabled to prevent keeping data
 +	 * in the receive queue longer than necessary
 +	 */
 +	if (tcp_fec_is_enabled(tp))
 +		tcp_fec_disable(sk);
 +
 	if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
 		return 0;
 
 @@ -4998,6 +5051,21 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
 		/* Reset is accepted even if it did not pass PAWS. */
 	}
 
 +	/* Special processing if FEC is enabled */
 +	if (tcp_fec_is_enabled(tp)) {
 +		if (tcp_fec_is_encoded(tp)) {
 +			tcp_fec_process(sk, skb);
 +			goto discard;
 +		} else if (!tp->rx_opt.fec.saw_fec && th->ack &&
 +			   sk->sk_state == TCP_LAST_ACK) {
 +			/* TODO Sometimes the FEC option is not appended to the
 +			 * FIN-ACK packet; socket options cleared?
 +			 */
 +			tcp_ack(sk, skb, FLAG_SLOWPATH);
 +			goto discard;
 +		}
 +	}
 +
 	/* Step 1: check sequence number */
 	if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {
 		/* RFC793, page 37: "In all states except SYN-SENT, all reset
 @@ -5099,6 +5167,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 	 */
 
 	tp->rx_opt.saw_tstamp = 0;
 +	tp->rx_opt.fec.saw_fec = 0;
 
 	/*	pred_flags is 0xS?10 << 16 + snd_wnd
 	 *	if header_prediction is to be made
 @@ -5461,6 +5530,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
 		if (tcp_is_sack(tp) && sysctl_tcp_fack)
 			tcp_enable_fack(tp);
 
 +		/*
 +		 * FEC negotiation
 +		 * Disable FEC if both ends do not agree on the FEC type used
 +		 */
 +		if (tp->fec.type != tp->rx_opt.fec.type) {
 +			tp->fec.type = 0;
 +			tp->rx_opt.fec.type = 0;
 +		}
 +
 		tcp_mtup_init(sk);
 		tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		tcp_initialize_rcv_mss(sk);
 @@ -5735,6 +5813,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
 
 		tcp_initialize_rcv_mss(sk);
 		tcp_fast_path_on(tp);
 +
 +		/* SYN requested FEC usage */
 +		if (tp->rx_opt.fec.type > 0)
 +			tp->fec.type = tp->rx_opt.fec.type;
 +
 		break;
 
 	case TCP_FIN_WAIT1: {
 diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
 index d886b60..5efbc2e 100644
 --- a/net/ipv4/tcp_ipv4.c
 +++ b/net/ipv4/tcp_ipv4.c
 @@ -73,6 +73,9 @@
 #include <net/timewait_sock.h>
 #include <net/xfrm.h>
 #include <net/secure_seq.h>
 +
 +#include <net/tcp_fec.h>
 +
 #include <net/tcp_memcontrol.h>
 #include <net/busy_poll.h>
 
 @@ -212,6 +215,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 
 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 
 +	memset(&(tp->rx_opt.fec), 0, sizeof(struct tcp_fec));
 +
 	/* Socket identity is still unknown (sport may be zero).
 	 * However we set state to SYN-SENT and not releasing socket
 	 * lock select source port, enter ourselves into the hash tables and
 @@ -2270,7 +2275,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
 
 	if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
 	    icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
 -	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
 +	    icsk->icsk_pending == ICSK_TIME_LOSS_PROBE ||
 +	    icsk->icsk_pending == ICSK_TIME_FEC) {
 		timer_active	= 1;
 		timer_expires	= icsk->icsk_timeout;
 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
 diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
 index 927586e..f59faf9 100644
 --- a/net/ipv4/tcp_minisocks.c
 +++ b/net/ipv4/tcp_minisocks.c
 @@ -552,6 +552,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		newtp->fastopen_rsk = NULL;
 		newtp->syn_data_acked = 0;
 
 +		newtp->high_seq = newtp->snd_nxt;
 +
 +		/* TCP FEC option */
 +		newtp->rx_opt.fec.type = sysctl_tcp_fec ? req->fec_type : 0;
 +		newtp->fec.type = newtp->fec.flags = 0;
 +		newtp->fec.next_seq = newtp->snd_nxt;
 +		newtp->fec.bytes_rcv_queue = 0;
 +		skb_queue_head_init(&newtp->fec.rcv_queue);
 +
 		TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
 	}
 	return newsk;
 diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
 index ddd2a6f..7791899 100644
 --- a/net/ipv4/tcp_output.c
 +++ b/net/ipv4/tcp_output.c
 @@ -37,6 +37,7 @@
 #define pr_fmt(fmt) "TCP: " fmt
 
 #include <net/tcp.h>
 +#include <net/tcp_fec.h>
 
 #include <linux/compiler.h>
 #include <linux/gfp.h>
 @@ -65,6 +66,12 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS;
 /* By default, RFC2861 behavior.  */
 int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
 +/*
 + *      !!! TCP FEC patch !!!
 + */
 +int sysctl_tcp_fec __read_mostly;
 +
 +
 unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
 EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
 
 @@ -422,6 +429,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_MD5		(1 << 2)
 #define OPTION_WSCALE		(1 << 3)
 #define OPTION_FAST_OPEN_COOKIE	(1 << 8)
 +#define OPTION_FEC		(1 << 9)
 
 struct tcp_out_options {
 	u16 options;		/* bit field of OPTION_* */
 @@ -432,6 +440,7 @@ struct tcp_out_options {
 	__u8 *hash_location;	/* temporary pointer, overloaded */
 	__u32 tsval, tsecr;	/* need to include OPTION_TS */
 	struct tcp_fastopen_cookie *fastopen_cookie;	/* Fast open cookie */
 +	struct tcp_fec fec;	/* FEC parameters */
 };
 
 /* Write previously computed TCP options to the packet.
 @@ -540,6 +549,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
 		}
 		ptr += (len + 3) >> 2;
 	}
 +
 +	if (unlikely(OPTION_FEC & options))
 +		tcp_fec_encode_option(tp, &(opts->fec), &ptr);
 }
 
 /* Compute TCP options for SYN packets. This is not the final
 @@ -607,6 +619,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
 		}
 	}
 
 +	/* Prepare for FEC negotiation if requested */
 +	if (unlikely(tcp_fec_is_enabled(tp)) &&
 +	    remaining >= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED) {
 +		opts->options |= OPTION_FEC;
 +		opts->fec.type = tp->fec.type;
 +		remaining -= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED;
 +	}
 +
 	return MAX_TCP_OPTION_SPACE - remaining;
 }
 
 @@ -671,6 +691,16 @@ static unsigned int tcp_synack_options(struct sock *sk,
 		}
 	}
 
 +	/* Handle request for FEC support from other side
 +	 * (respond with same FEC option if FEC is locally supported)
 +	 */
 +	if (sysctl_tcp_fec && unlikely(req->fec_type) &&
 +	    remaining >= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED) {
 +		opts->options |= OPTION_FEC;
 +		opts->fec.type = req->fec_type;
 +		remaining -= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED;
 +	}
 +
 	return MAX_TCP_OPTION_SPACE - remaining;
 }
 
 @@ -681,6 +711,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 					struct tcp_out_options *opts,
 					struct tcp_md5sig_key **md5)
 {
 +	struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int size = 0;
 	unsigned int eff_sacks;
 @@ -715,6 +746,19 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 			opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
 	}
 
 +	/* Prepare option if connection has FEC enabled */
 +	if (tcp_fec_is_enabled(tp)) {
 +		opts->options |= OPTION_FEC;
 +		if (tcb && tcb->fec)
 +			opts->fec = *(tcb->fec);
 +
 +		/* regardless of packet type we need 4 more bytes
 +		 * including alignment
 +		 */
 +		size += 4;
 +		size += TCPOLEN_EXP_FEC_BASE;
 +	}
 +
 	return size;
 }
 
 @@ -895,7 +939,7 @@ void tcp_wfree(struct sk_buff *skb)
  * We are working here with either a clone of the original
  * SKB, or a fresh unique copy made by the retransmit engine.
  */
 -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 			    gfp_t gfp_mask)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 @@ -2055,6 +2099,9 @@ repair:
 			break;
 	}
 
 +	if (tcp_fec_is_enabled(tp))
 +		tcp_fec_invoke(sk);
 +
 	if (likely(sent_pkts)) {
 		if (tcp_in_cwnd_reduction(sk))
 			tp->prr_out += sent_pkts;
 @@ -3153,6 +3200,12 @@ int tcp_connect(struct sock *sk)
 	 */
 	tp->snd_nxt = tp->write_seq;
 	tp->pushed_seq = tp->write_seq;
 +
 +	/* Initialize FEC members */
 +	tp->fec.next_seq = tp->snd_nxt;
 +	tp->fec.bytes_rcv_queue = 0;
 +	skb_queue_head_init(&tp->fec.rcv_queue);
 +
 	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);
 
 	/* Timer for repeating the SYN until an answer. */
 diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
 index dceaacc..b78ea8f 100644
 --- a/net/ipv4/tcp_timer.c
 +++ b/net/ipv4/tcp_timer.c
 @@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <net/tcp.h>
 +#include <net/tcp_fec.h>
 
 int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
 int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
 @@ -478,7 +479,15 @@ out_reset_timer:
 	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
 		__sk_dst_reset(sk);
 
 -out:;
 +out:
 +	/* FEC will switch out the RTO timer if a delayed FEC transmission
 +	 * should happen earlier than this. RTO timer will be switched in
 +	 * once the FEC timer fired.
 +	 * FEC transmissions during a loss episode require that the sysctl
 +	 * value is >= 2.
 +	 */
 +	if (tcp_fec_is_enabled(tp) && sysctl_tcp_fec >= 2)
 +		tcp_fec_arm_timer(sk);
 }
 
 void tcp_write_timer_handler(struct sock *sk)
 @@ -503,6 +512,9 @@ void tcp_write_timer_handler(struct sock *sk)
 	case ICSK_TIME_LOSS_PROBE:
 		tcp_send_loss_probe(sk);
 		break;
 +	case ICSK_TIME_FEC:
 +		tcp_fec_timer(sk);
 +		break;
 	case ICSK_TIME_RETRANS:
 		icsk->icsk_pending = 0;
 		tcp_retransmit_timer(sk);
 diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
 index c5078c5..d5205c6 100644
 --- a/net/ipv6/tcp_ipv6.c
 +++ b/net/ipv6/tcp_ipv6.c
 @@ -288,6 +288,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
 
 +	memset(&(tp->rx_opt.fec), 0, sizeof(struct tcp_fec));
 +
 	inet->inet_dport = usin->sin6_port;
 
 	tcp_set_state(sk, TCP_SYN_SENT);
No results found