Created
September 17, 2016 15:03
-
-
Save userid/49737fb237dac1026d13932fcb218e9b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h | |
| index 89bcfe8..9298207 100644 | |
| --- a/include/linux/skbuff.h | |
| +++ b/include/linux/skbuff.h | |
| @@ -533,8 +533,10 @@ struct sk_buff { | |
| * layer. Please put your private variables there. If you | |
| * want to keep them across layers you have to do a skb_clone() | |
| * first. This is owned by whoever has the skb queued ATM. | |
| + * | |
| + * Increased the CB to hold pointer to an FEC structure. | |
| */ | |
| - char cb[48] __aligned(8); | |
| + char cb[56] __aligned(8); | |
| unsigned long _skb_refdst; | |
| #ifdef CONFIG_XFRM | |
| diff --git a/include/linux/tcp.h b/include/linux/tcp.h | |
| index 4345d49..ccc0e91 100644 | |
| --- a/include/linux/tcp.h | |
| +++ b/include/linux/tcp.h | |
| @@ -79,6 +79,24 @@ struct tcp_sack_block { | |
| #define TCP_FACK_ENABLED (1 << 1) /*1 = FACK is enabled locally*/ | |
| #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ | |
| +/* Flags transmitted in the first FEC option byte after magic bytes | |
| + * (except if option is used for negotiation) */ | |
| +#define TCP_FEC_RECOVERY_CWR 0x80 /* Recovery triggered CWR */ | |
| +#define TCP_FEC_RECOVERY_SUCCESSFUL 0x40 /* Local recovery done */ | |
| +#define TCP_FEC_RECOVERY_FAILED 0x20 /* Local recovery failed */ | |
| +#define TCP_FEC_ENCODED 0x10 /* Packet is FEC-encoded */ | |
| + | |
| +struct tcp_fec { /* Contents of one TCP FEC option (parsed or to be encoded) */ | |
| + u8 type; /* Requested FEC type (negotiation only, | |
| + * see net/tcp_fec.h for type defs) */ | |
| + u32 enc_seq; /* Sequence number of first encoded byte | |
| + * (the decoder takes it from the packet's seq) */ | |
| + u32 enc_len; /* Encoding length (24 bits on the wire) */ | |
| + u32 lost_seq; /* Sequence number of first lost byte | |
| + * (the decoder takes it from the ACKed seq) */ | |
| + u32 lost_len; /* Loss length (24 bits on the wire) */ | |
| + u8 flags; /* See flag definitions above */ | |
| + bool saw_fec; /* FEC option was retrieved from packet */ | |
| +}; | |
| + | |
| struct tcp_options_received { | |
| /* PAWS/RTTM data */ | |
| long ts_recent_stamp;/* Time we stored ts_recent (for aging) */ | |
| @@ -95,12 +113,14 @@ struct tcp_options_received { | |
| u8 num_sacks; /* Number of SACK blocks */ | |
| u16 user_mss; /* mss requested by user in ioctl */ | |
| u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ | |
| + struct tcp_fec fec; /* FEC-related parameters */ | |
| }; | |
| static inline void tcp_clear_options(struct tcp_options_received *rx_opt) | |
| { | |
| rx_opt->tstamp_ok = rx_opt->sack_ok = 0; | |
| rx_opt->wscale_ok = rx_opt->snd_wscale = 0; | |
| + memset(&(rx_opt->fec), 0, sizeof(struct tcp_fec)); | |
| } | |
| /* This is the max number of SACKS that we'll generate and process. It's safe | |
| @@ -327,6 +347,24 @@ struct tcp_sock { | |
| * socket. Used to retransmit SYNACKs etc. | |
| */ | |
| struct request_sock *fastopen_rsk; | |
| + | |
| +/* TCP FEC parameters | |
| + * type - negotiated FEC type to be used | |
| + * next_seq - next sequence which was not FEC-encoded before | |
| + * lost_len - bytes after rcv_nxt considered lost | |
| + * flags - see TCP_FEC_* flag definitions above | |
| + * bytes_rcv_queue - number of bytes stored in queued SKBs | |
| + * rcv_queue - copies from the socket's receive queue kept for | |
| + * FEC recovery | |
| + */ | |
| + struct { | |
| + u8 type; | |
| + u32 next_seq; | |
| + u32 lost_len; | |
| + u8 flags; | |
| + u32 bytes_rcv_queue; | |
| + struct sk_buff_head rcv_queue; | |
| + } fec; | |
| }; | |
| enum tsq_flags { | |
| diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h | |
| index 06d0d0f..063aa59 100644 | |
| --- a/include/net/inet_connection_sock.h | |
| +++ b/include/net/inet_connection_sock.h | |
| @@ -138,6 +138,7 @@ struct inet_connection_sock { | |
| #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ | |
| #define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */ | |
| #define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ | |
| +#define ICSK_TIME_FEC 6 /* FEC delayed send timer */ | |
| static inline struct inet_connection_sock *inet_csk(const struct sock *sk) | |
| { | |
| @@ -228,7 +229,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, | |
| } | |
| if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || | |
| - what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) { | |
| + what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE || | |
| + what == ICSK_TIME_FEC) { | |
| icsk->icsk_pending = what; | |
| icsk->icsk_timeout = jiffies + when; | |
| sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); | |
| diff --git a/include/net/request_sock.h b/include/net/request_sock.h | |
| index 610fa9e..1c1b4ba 100644 | |
| --- a/include/net/request_sock.h | |
| +++ b/include/net/request_sock.h | |
| @@ -63,6 +63,8 @@ struct request_sock { | |
| struct sock *sk; | |
| u32 secid; | |
| u32 peer_secid; | |
| + u8 fec_type; /* Encoding type (see | |
| + * net/tcp_fec.h) */ | |
| }; | |
| static inline struct request_sock *reqsk_alloc(const struct request_sock_ops *ops) | |
| diff --git a/include/net/tcp.h b/include/net/tcp.h | |
| index d59f206..f894889 100644 | |
| --- a/include/net/tcp.h | |
| +++ b/include/net/tcp.h | |
| @@ -182,6 +182,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); | |
| * experimental options. See draft-ietf-tcpm-experimental-options-00.txt | |
| */ | |
| #define TCPOPT_FASTOPEN_MAGIC 0xF989 | |
| +#define TCPOPT_FEC_MAGIC 0xDC60 | |
| /* | |
| * TCP option lengths | |
| @@ -195,6 +196,11 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); | |
| #define TCPOLEN_FASTOPEN_BASE 2 | |
| #define TCPOLEN_EXP_FASTOPEN_BASE 4 | |
| +/* | |
| + * !!! TCP FEC patch !!! | |
| + */ | |
| +#define TCPOLEN_EXP_FEC_BASE 4 | |
| + | |
| /* But this is what stacks really send out. */ | |
| #define TCPOLEN_TSTAMP_ALIGNED 12 | |
| #define TCPOLEN_WSCALE_ALIGNED 4 | |
| @@ -204,6 +210,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); | |
| #define TCPOLEN_SACK_PERBLOCK 8 | |
| #define TCPOLEN_MD5SIG_ALIGNED 20 | |
| #define TCPOLEN_MSS_ALIGNED 4 | |
| +#define TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED 8 | |
| /* Flags in tp->nonagle */ | |
| #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ | |
| @@ -230,6 +237,15 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); | |
| #define TFO_SERVER_WO_SOCKOPT1 0x400 | |
| #define TFO_SERVER_WO_SOCKOPT2 0x800 | |
| +/* | |
| + * !!! TCP FEC patch !!! | |
| + */ | |
| + | |
| +/* Maximum number of in-order bytes kept in the receiver's buffer for FEC | |
| + * recoveries. The sender will never send more than this in a single FEC | |
| + * packet. */ | |
| +#define FEC_RCV_QUEUE_LIMIT 16000 | |
| + | |
| extern struct inet_timewait_death_row tcp_death_row; | |
| /* sysctl variables for tcp */ | |
| @@ -274,6 +290,12 @@ extern int sysctl_tcp_thin_dupack; | |
| extern int sysctl_tcp_early_retrans; | |
| extern int sysctl_tcp_limit_output_bytes; | |
| extern int sysctl_tcp_challenge_ack_limit; | |
| + | |
| +/* | |
| + * !!! TCP FEC patch !!! | |
| + */ | |
| +extern int sysctl_tcp_fec; | |
| + | |
| extern unsigned int sysctl_tcp_notsent_lowat; | |
| extern int sysctl_tcp_min_tso_segs; | |
| extern int sysctl_tcp_autocorking; | |
| @@ -725,6 +747,7 @@ struct tcp_skb_cb { | |
| __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ | |
| /* 1 byte hole */ | |
| __u32 ack_seq; /* Sequence number ACK'd */ | |
| + struct tcp_fec *fec; /* FEC parameters */ | |
| }; | |
| #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) | |
| @@ -1131,6 +1154,11 @@ static inline void tcp_openreq_init(struct request_sock *req, | |
| ireq->ecn_ok = 0; | |
| ireq->ir_rmt_port = tcp_hdr(skb)->source; | |
| ireq->ir_num = ntohs(tcp_hdr(skb)->dest); | |
| + | |
| + /* | |
| + * !!! TCP FEC patch !!! | |
| + */ | |
| + req->fec_type = rx_opt->fec.type; | |
| } | |
| extern void tcp_openreq_init_rwin(struct request_sock *req, | |
| diff --git a/include/net/tcp_fec.h b/include/net/tcp_fec.h | |
| new file mode 100644 | |
| index 0000000..38f2c40 | |
| --- /dev/null | |
| +++ b/include/net/tcp_fec.h | |
| @@ -0,0 +1,106 @@ | |
| +#ifndef _TCP_FEC_H | |
| +#define _TCP_FEC_H | |
| + | |
| +#include <net/tcp.h> | |
| +#include <asm/unaligned.h> | |
| + | |
| +/* FEC-encoding types (8 bits, internal) */ | |
| +#define TCP_FEC_TYPE_NONE 0 /* FEC disabled */ | |
| +#define TCP_FEC_TYPE_XOR_ALL 1 /* XOR every MSS length segment */ | |
| +#define TCP_FEC_TYPE_XOR_SKIP_1 2 /* XOR every other MSS length | |
| + * segment */ | |
| + | |
| +#define TCP_FEC_NUM_TYPES 3 | |
| + | |
| +/* Delay transmission of FEC packets (delay defined in tcp_fec_arm_timer()) */ | |
| +#define TCP_FEC_DELAYED_SEND 1 | |
| + | |
| +/* | |
| + * Tells whether an FEC type other than TCP_FEC_TYPE_NONE is set on the socket | |
| + * (hinted to the compiler as the unlikely case) | |
| + */ | |
| +static inline bool tcp_fec_is_enabled(const struct tcp_sock *tp) | |
| +{ | |
| + return unlikely(tp->fec.type != TCP_FEC_TYPE_NONE); | |
| +} | |
| + | |
| +/* | |
| + * Tells whether the options received with the current packet contained an | |
| + * FEC option that has the TCP_FEC_ENCODED flag set | |
| + */ | |
| +static inline bool tcp_fec_is_encoded(const struct tcp_sock *tp) | |
| +{ | |
| + const struct tcp_fec *opt = &tp->rx_opt.fec; | |
| + | |
| + return unlikely(opt->saw_fec && (opt->flags & TCP_FEC_ENCODED)); | |
| +} | |
| + | |
| +/* | |
| + * Decodes FEC parameters and stores them in the FEC struct | |
| + * @seq - sequence number of the packet | |
| + * @ack_seq - ACKed sequence number | |
| + * @is_syn - true, if option was attached to a packet with a SYN flag | |
| + * @ptr - points to the first byte of the FEC option after kind, length, | |
| + * and possible magic bytes | |
| + * @len - option length (without kind, length, magic bytes) | |
| + */ | |
| +int tcp_fec_decode_option(struct tcp_fec *fec, u32 seq, u32 ack_seq, | |
| + bool is_syn, const unsigned char *ptr, | |
| + unsigned int len); | |
| + | |
| +/* | |
| + * Encodes FEC parameters to wire format | |
| + * Pointer points to the first byte of the FEC option after kind, length, | |
| + * and possible magic bytes (pointer will be moved to first unoccupied byte) | |
| + */ | |
| +int tcp_fec_encode_option(struct tcp_sock *tp, struct tcp_fec *fec, | |
| + __be32 **ptr); | |
| + | |
| +/* | |
| + * Processes the current packet in the buffer (treated as FEC packet) | |
| + */ | |
| +int tcp_fec_process(struct sock *sk, struct sk_buff *skb); | |
| + | |
| +/* | |
| + * Checks the received options for loss indicators and acts upon them. | |
| + * In particular, the function handles window reduction requests and processes | |
| + * tail loss indicators. | |
| + * Returns: 1, if the ACK carried a loss indicator (failed or successful | |
| + * recovery at the remote end) - 0, otherwise | |
| + */ | |
| +int tcp_fec_check_ack(struct sock *sk, u32 ack_seq); | |
| + | |
| +/* | |
| + * Since data in the socket's receive queue can get consumed by other parties | |
| + * we need to keep clones of these SKBs until they are no longer | |
| + * required for possible future recoveries. | |
| + * @skb - buffer which is moved to the receive queue | |
| + */ | |
| +int tcp_fec_update_queue(struct sock *sk, struct sk_buff *skb); | |
| + | |
| +/* | |
| + * Disables FEC for this connection (includes clearing references | |
| + * to buffers in receive queue) | |
| + */ | |
| +void tcp_fec_disable(struct sock *sk); | |
| + | |
| +/* Arms the timer for a delayed FEC transmission if there is | |
| + * no earlier timeout defined (i.e. retransmission timeout) | |
| + */ | |
| +void tcp_fec_arm_timer(struct sock *sk); | |
| + | |
| +/* The FEC timer fired. Force an FEC transmission for the | |
| + * last unencoded burst. Rearm the RTO timer (which was switched | |
| + * out when setting the FEC timer). Set a new FEC timer if there | |
| + * is pending unencoded data. | |
| + */ | |
| +void tcp_fec_timer(struct sock *sk); | |
| + | |
| +/* If FEC packets transmissions are delayed set a timer | |
| + * (if not already set), otherwise invoke the FEC mechanism | |
| + * immediately | |
| + */ | |
| +int tcp_fec_invoke(struct sock *sk); | |
| + | |
| +/* Invoke the FEC mechanism set for the connection; | |
| + * Create and sends out FEC packets | |
| + */ | |
| +int tcp_fec_invoke_nodelay(struct sock *sk); | |
| + | |
| +#endif | |
| diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h | |
| index 3b97183..d679733 100644 | |
| --- a/include/uapi/linux/tcp.h | |
| +++ b/include/uapi/linux/tcp.h | |
| @@ -113,6 +113,11 @@ enum { | |
| #define TCP_TIMESTAMP 24 | |
| #define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ | |
| +/* | |
| + * !!! TCP FEC patch !!! | |
| + */ | |
| +#define TCP_FEC 79 /* Forward error correction */ | |
| + | |
| struct tcp_repair_opt { | |
| __u32 opt_code; | |
| __u32 opt_val; | |
| diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile | |
| index 518c04e..6aa32ca 100644 | |
| --- a/net/ipv4/Makefile | |
| +++ b/net/ipv4/Makefile | |
| @@ -6,7 +6,7 @@ obj-y := route.o inetpeer.o protocol.o \ | |
| ip_input.o ip_fragment.o ip_forward.o ip_options.o \ | |
| ip_output.o ip_sockglue.o inet_hashtables.o \ | |
| inet_timewait_sock.o inet_connection_sock.o \ | |
| - tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ | |
| + tcp.o tcp_fec.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ | |
| tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ | |
| tcp_offload.o datagram.o raw.o udp.o udplite.o \ | |
| udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ | |
| diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c | |
| index 0d438fb..9cfa3d3 100644 | |
| --- a/net/ipv4/inet_diag.c | |
| +++ b/net/ipv4/inet_diag.c | |
| @@ -183,7 +183,8 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, | |
| if (icsk->icsk_pending == ICSK_TIME_RETRANS || | |
| icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || | |
| - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { | |
| + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE || | |
| + icsk->icsk_pending == ICSK_TIME_FEC) { | |
| r->idiag_timer = 1; | |
| r->idiag_retrans = icsk->icsk_retransmits; | |
| r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); | |
| diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c | |
| index eeb17b3..9c58530 100644 | |
| --- a/net/ipv4/sysctl_net_ipv4.c | |
| +++ b/net/ipv4/sysctl_net_ipv4.c | |
| @@ -28,6 +28,7 @@ | |
| static int zero; | |
| static int one = 1; | |
| +static int two = 2; | |
| static int four = 4; | |
| static int gso_max_segs = GSO_MAX_SEGS; | |
| static int tcp_retr1_max = 255; | |
| @@ -810,6 +811,15 @@ static struct ctl_table ipv4_table[] = { | |
| .proc_handler = proc_dointvec_minmax, | |
| .extra1 = &one | |
| }, | |
| + { | |
| + /* FEC switch: the TCP_FEC socket option is rejected | |
| + * while this is 0. The _minmax handler is required for | |
| + * the extra1/extra2 bounds (0..2) to be enforced; plain | |
| + * proc_dointvec ignores them. | |
| + */ | |
| + .procname = "tcp_fec", | |
| + .data = &sysctl_tcp_fec, | |
| + .maxlen = sizeof(int), | |
| + .mode = 0644, | |
| + .proc_handler = proc_dointvec_minmax, | |
| + .extra1 = &zero, | |
| + .extra2 = &two, | |
| + }, | |
| { } | |
| }; | |
| diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c | |
| index b8ff562..1a2dab5 100644 | |
| --- a/net/ipv4/tcp.c | |
| +++ b/net/ipv4/tcp.c | |
| @@ -276,6 +276,8 @@ | |
| #include <net/ip.h> | |
| #include <net/sock.h> | |
| +#include <net/tcp_fec.h> | |
| + | |
| #include <asm/uaccess.h> | |
| #include <asm/ioctls.h> | |
| #include <net/busy_poll.h> | |
| @@ -2565,6 +2567,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |
| else | |
| tp->tsoffset = val - tcp_time_stamp; | |
| break; | |
| + case TCP_FEC: | |
| + if (sysctl_tcp_fec && val >= 0 && val < TCP_FEC_NUM_TYPES) | |
| + tp->fec.type = val; | |
| + else | |
| + err = -EINVAL; | |
| + break; | |
| case TCP_NOTSENT_LOWAT: | |
| tp->notsent_lowat = val; | |
| sk->sk_write_space(sk); | |
| @@ -2792,6 +2800,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, | |
| case TCP_TIMESTAMP: | |
| val = tcp_time_stamp + tp->tsoffset; | |
| break; | |
| + case TCP_FEC: | |
| + val = tp->fec.type; | |
| + break; | |
| case TCP_NOTSENT_LOWAT: | |
| val = tp->notsent_lowat; | |
| break; | |
| diff --git a/net/ipv4/tcp_fec.c b/net/ipv4/tcp_fec.c | |
| new file mode 100644 | |
| index 0000000..53711cb | |
| --- /dev/null | |
| +++ b/net/ipv4/tcp_fec.c | |
| @@ -0,0 +1,1253 @@ | |
| +#include <net/tcp_fec.h> | |
| + | |
| +/* Codes for incoming FEC packet processing */ | |
| +#define FEC_NO_LOSS 1 | |
| +#define FEC_LOSS_UNRECOVERED 2 | |
| +#define FEC_LOSS_RECOVERED 3 | |
| + | |
| +/* Receiver routines */ | |
| +static int tcp_fec_process_xor(struct sock *sk, const struct sk_buff *skb, | |
| + unsigned int block_skip); | |
| +static int tcp_fec_recover(struct sock *sk, const struct sk_buff *skb, | |
| + unsigned char *data, u32 seq, int len); | |
| +static void tcp_fec_send_ack(struct sock *sk, const struct sk_buff *skb, | |
| + int recovery_status); | |
| +static void tcp_fec_reduce_window(struct sock *sk); | |
| +static void tcp_fec_mark_skbs_lost(struct sock *sk); | |
| +static bool tcp_fec_update_decoded_option(struct sk_buff *skb); | |
| +static struct sk_buff *tcp_fec_make_decoded_pkt(struct sock *sk, | |
| + const struct sk_buff *skb, unsigned char *dec_data, | |
| + u32 seq, unsigned int len); | |
| + | |
| +/* Sender routines */ | |
| +static int tcp_fec_create(struct sock *sk, struct sk_buff_head *list); | |
| +static int tcp_fec_create_xor(struct sock *sk, struct sk_buff_head *list, | |
| + unsigned int first_seq, unsigned int block_len, | |
| + unsigned int block_skip, | |
| + unsigned int max_encoded_per_pkt); | |
| +static struct sk_buff *tcp_fec_make_encoded_pkt(struct sock *sk, | |
| + struct tcp_fec *fec, unsigned char *enc_data, | |
| + u32 seq); | |
| +static int tcp_fec_xmit_all(struct sock *sk, struct sk_buff_head *list); | |
| +static int tcp_fec_xmit(struct sock *sk, struct sk_buff *skb); | |
| + | |
| +/* Buffer access routine */ | |
| +static unsigned int tcp_fec_get_next_block(struct sock *sk, | |
| + struct sk_buff **skb, struct sk_buff_head *queue, | |
| + u32 seq, unsigned int block_len, | |
| + unsigned char *block); | |
| + | |
| +/* Have to define this signature here since the actual function was static | |
| + * and tcp_output.c has no corresponding header file | |
| + */ | |
| +extern int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |
| + gfp_t gfp_mask); | |
| + | |
| +/* Parses the payload of a received FEC option into @fec | |
| + * @seq - sequence number of the packet (becomes enc_seq for encoded packets) | |
| + * @ack_seq - ACKed sequence number (becomes lost_seq for loss indications) | |
| + * @is_syn - true, if option was attached to a packet with a SYN flag | |
| + * @ptr - points to the first byte of the FEC option after kind, length, | |
| + * and possible magic bytes | |
| + * @len - option length (without kind, length, magic bytes) | |
| + * Returns 0 on success; -EINVAL for an unsupported length or for a long | |
| + * option that is neither an encoding nor a failed-recovery indication | |
| + */ | |
| +int tcp_fec_decode_option(struct tcp_fec *fec, u32 seq, u32 ack_seq, | |
| + bool is_syn, const unsigned char *ptr, | |
| + unsigned int len) | |
| +{ | |
| + u32 word; | |
| + | |
| + /* per-packet state: rewritten for EVERY incoming option */ | |
| + fec->saw_fec = 1; | |
| + fec->flags = 0; | |
| + | |
| + switch (len) { | |
| + case 1: | |
| + /* Short option: a single byte which is the FEC type during | |
| + * negotiation (SYN) and the flags byte otherwise | |
| + */ | |
| + if (is_syn) | |
| + fec->type = *ptr; | |
| + else | |
| + fec->flags = *ptr; | |
| + return 0; | |
| + | |
| + case 4: | |
| + /* Long option: <Flags:8, Length:24> */ | |
| + word = get_unaligned_be32(ptr); | |
| + fec->flags = word >> 24; | |
| + | |
| + if (fec->flags & TCP_FEC_ENCODED) { | |
| + fec->enc_seq = seq; | |
| + fec->enc_len = word & 0xFFFFFF; | |
| + return 0; | |
| + } | |
| + | |
| + if (fec->flags & TCP_FEC_RECOVERY_FAILED) { | |
| + fec->lost_seq = ack_seq; | |
| + fec->lost_len = word & 0xFFFFFF; | |
| + return 0; | |
| + } | |
| + | |
| + return -EINVAL; | |
| + | |
| + default: | |
| + /* Invalid option length */ | |
| + return -EINVAL; | |
| + } | |
| +} | |
| + | |
| +/* Encodes FEC parameters to wire format | |
| + * @tp - socket whose pending FEC flags and loss length are merged into @fec; | |
| + * its RECOVERY_CWR and RECOVERY_FAILED flags are cleared afterwards | |
| + * @fec - option to encode (its flags and lost_len are updated from @tp) | |
| + * @ptr - Encoded option is written to this memory location (and the pointer | |
| + * is advanced to the next unoccupied byte, 4-byte aligned) | |
| + * Returns the length of the encoded option (including alignment). This is | |
| + * always 8 bytes: one word with kind, length and magic bytes, plus one word | |
| + * of payload (short payloads are padded with NOPs). | |
| + */ | |
| +int tcp_fec_encode_option(struct tcp_sock *tp, struct tcp_fec *fec, | |
| + __be32 **ptr) | |
| +{ | |
| + u32 payload; | |
| + int len; | |
| + | |
| + fec->flags |= tp->fec.flags; | |
| + fec->lost_len = tp->fec.lost_len; | |
| + tp->fec.flags &= ~TCP_FEC_RECOVERY_CWR; | |
| + tp->fec.flags &= ~TCP_FEC_RECOVERY_FAILED; | |
| + | |
| + /* Encode fixed option part (option kind, length, and magic bytes) */ | |
| + if (fec->flags & (TCP_FEC_ENCODED | TCP_FEC_RECOVERY_FAILED)) | |
| + len = 4 + TCPOLEN_EXP_FEC_BASE; /* Long option */ | |
| + else | |
| + len = 1 + TCPOLEN_EXP_FEC_BASE; /* Short option */ | |
| + | |
| + **ptr = htonl((TCPOPT_EXP << 24) | (len << 16) | TCPOPT_FEC_MAGIC); | |
| + (*ptr)++; | |
| + | |
| + if ((fec->flags & TCP_FEC_ENCODED) && | |
| + (fec->flags & TCP_FEC_RECOVERY_FAILED)) { | |
| + /* TODO Special case: need to separate loss indication | |
| + * from encoding or make option 12 bytes long | |
| + * This can only happen if a node receives and sends FEC | |
| + * data | |
| + */ | |
| + fec->flags &= ~TCP_FEC_RECOVERY_FAILED; | |
| + } | |
| + | |
| + /* Build the payload word. The flags/type byte is widened to u32 | |
| + * before shifting, and the length fields are limited to the 24 bits | |
| + * available on the wire so that they can never spill into the | |
| + * flags byte. | |
| + */ | |
| + if (fec->flags & TCP_FEC_ENCODED) { | |
| + /* FEC-encoded packets carry: | |
| + * <Flags:8, Encoding length:24> | |
| + */ | |
| + payload = ((u32)fec->flags << 24) | | |
| + (fec->enc_len & 0xFFFFFF); | |
| + } else if (fec->flags & TCP_FEC_RECOVERY_FAILED) { | |
| + /* Packets with failed recovery indication carry: | |
| + * <Flags:8, Bytes after ACKed seq lost:24> | |
| + */ | |
| + payload = ((u32)fec->flags << 24) | | |
| + (fec->lost_len & 0xFFFFFF); | |
| + } else if (fec->type) { | |
| + /* Negotiation packets carry: <Encoding type:8> */ | |
| + payload = ((u32)fec->type << 24) | | |
| + (TCPOPT_NOP << 16) | | |
| + (TCPOPT_NOP << 8) | | |
| + TCPOPT_NOP; | |
| + } else { | |
| + /* All other packets carry: <Flags:8> */ | |
| + payload = ((u32)fec->flags << 24) | | |
| + (TCPOPT_NOP << 16) | | |
| + (TCPOPT_NOP << 8) | | |
| + TCPOPT_NOP; | |
| + } | |
| + | |
| + **ptr = htonl(payload); | |
| + (*ptr)++; | |
| + | |
| + return 8; | |
| +} | |
| + | |
| +/* Handles a received FEC packet (the FEC option must already have been | |
| + * parsed into tp->rx_opt.fec) | |
| + * Returns 0 if the packet was handled, -1 if it is not FEC-encoded or FEC | |
| + * is disabled on the socket, or a negative error from linearizing the SKB | |
| + * or from the recovery routine | |
| + */ | |
| +int tcp_fec_process(struct sock *sk, struct sk_buff *skb) | |
| +{ | |
| + struct tcp_sock *tp = tcp_sk(sk); | |
| + const struct tcp_fec *opt = &tp->rx_opt.fec; | |
| + int status = 0; | |
| + int err; | |
| + | |
| + /* only FEC-encoded packets are handled here */ | |
| + if (!(opt->flags & TCP_FEC_ENCODED)) | |
| + return -1; | |
| + | |
| + /* the whole encoded range was received already: just acknowledge */ | |
| + if (!after(opt->enc_seq + opt->enc_len, tp->rcv_nxt)) { | |
| + tcp_fec_send_ack(sk, skb, FEC_NO_LOSS); | |
| + return 0; | |
| + } | |
| + | |
| + /* the decoders read the payload as one contiguous buffer */ | |
| + err = skb_linearize(skb); | |
| + if (err) | |
| + return err; | |
| + | |
| + /* dispatch on the FEC type set for the connection */ | |
| + switch (tp->fec.type) { | |
| + case TCP_FEC_TYPE_XOR_SKIP_1: | |
| + status = tcp_fec_process_xor(sk, skb, 1); | |
| + break; | |
| + case TCP_FEC_TYPE_XOR_ALL: | |
| + status = tcp_fec_process_xor(sk, skb, 0); | |
| + break; | |
| + case TCP_FEC_TYPE_NONE: | |
| + return -1; | |
| + } | |
| + | |
| + /* TODO error handling; -ENOMEM, etc. - disable FEC? */ | |
| + if (status < 0) | |
| + return status; | |
| + | |
| + /* a failed recovery is reported to the sender with an explicit ACK */ | |
| + if (status == FEC_LOSS_UNRECOVERED) | |
| + tcp_fec_send_ack(sk, skb, status); | |
| + | |
| + return 0; | |
| +} | |
| + | |
| +/* Evaluates the FEC flags received with an ACK (indicators for successful | |
| + * and failed recoveries at the remote end, acknowledged window reductions) | |
| + * and acts upon them. | |
| + * @ack_seq - sequence number acknowledged by the incoming packet | |
| + * Returns: 1, if ACK contains a loss indicator; 0, otherwise | |
| + */ | |
| +int tcp_fec_check_ack(struct sock *sk, u32 ack_seq) | |
| +{ | |
| + struct tcp_sock *tp = tcp_sk(sk); | |
| + const u8 rx_flags = tp->rx_opt.fec.flags; | |
| + | |
| + /* The peer signaled CWR: our local recovery indication (and the | |
| + * ECN CWR demand) has been seen and can be dropped | |
| + */ | |
| + if (rx_flags & TCP_FEC_RECOVERY_CWR) { | |
| + tp->fec.flags &= ~TCP_FEC_RECOVERY_SUCCESSFUL; | |
| + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; | |
| + } | |
| + | |
| + /* Tail loss indicator: the remote FEC could not recover the lost | |
| + * data and only ACKs the loss range back. Whatever is not | |
| + * ACKed/SACKed at this point is considered lost. | |
| + */ | |
| + if (rx_flags & TCP_FEC_RECOVERY_FAILED) { | |
| + tcp_fec_mark_skbs_lost(sk); | |
| + return 1; | |
| + } | |
| + | |
| + /* nothing was recovered remotely either: no loss indicator */ | |
| + if (!(rx_flags & TCP_FEC_RECOVERY_SUCCESSFUL)) | |
| + return 0; | |
| + | |
| + /* The remote endpoint recovered data, which calls for a window | |
| + * reduction - unless the window was already reduced for the current | |
| + * loss episode or the previous reduction was not signaled yet | |
| + * (no outgoing packets) | |
| + */ | |
| + if (after(ack_seq, tp->high_seq) && | |
| + !(tp->fec.flags & TCP_FEC_RECOVERY_CWR)) { | |
| + tcp_fec_reduce_window(sk); | |
| + tp->fec.flags |= TCP_FEC_RECOVERY_CWR; | |
| + } | |
| + | |
| + return 1; | |
| +} | |
| + | |
| +/* Since data in the socket's receive queue can get consumed by other parties | |
| + * we need to clone these SKBs until they are no longer required for possible | |
| + * future recoveries. This function is called after the TCP header has been | |
| + * removed from the SKB already. All parameters required for recovery are | |
| + * stored in the SKB's control buffer. | |
| + * @skb - buffer which is moved to the receive queue | |
| + * Returns 0 on success (including SKBs without payload, which are not | |
| + * queued), -ENOMEM if the clone cannot be allocated, the error from | |
| + * skb_linearize() if the clone cannot be linearized, or -EINVAL if the byte | |
| + * accounting claims excess data while the FEC queue is empty | |
| + */ | |
| +int tcp_fec_update_queue(struct sock *sk, struct sk_buff *skb) | |
| +{ | |
| + struct tcp_sock *tp; | |
| + struct sk_buff *cskb; | |
| + u32 data_len; | |
| + int extra_bytes, err; | |
| + tp = tcp_sk(sk); | |
| + | |
| + /* SKBs without payload are useless for recovery; bail out before | |
| + * allocating anything | |
| + */ | |
| + data_len = skb->len; | |
| + if (!data_len) | |
| + return 0; | |
| + | |
| + /* clone the SKB and add it to the FEC receive queue | |
| + * (a simple extra reference to the SKB is not sufficient | |
| + * since SKBs can only be queued on one list at a time) | |
| + */ | |
| + cskb = skb_clone(skb, GFP_ATOMIC); | |
| + if (cskb == NULL) | |
| + return -ENOMEM; | |
| + | |
| + /* linearize the SKB (for easier payload access); the clone is | |
| + * still owned by us at this point and must be released on failure | |
| + */ | |
| + err = skb_linearize(cskb); | |
| + if (err) { | |
| + kfree_skb(cskb); | |
| + return err; | |
| + } | |
| + | |
| + skb_queue_tail(&tp->fec.rcv_queue, cskb); | |
| + tp->fec.bytes_rcv_queue += data_len; | |
| + | |
| + /* check if we can dereference old SKBs (as long as we have enough | |
| + * data for future recoveries) | |
| + * NOTE(review): bytes are added using skb->len but removed using | |
| + * end_seq - seq; confirm both always agree for queued SKBs | |
| + */ | |
| + extra_bytes = tp->fec.bytes_rcv_queue - FEC_RCV_QUEUE_LIMIT; | |
| + while (extra_bytes > 0) { | |
| + cskb = skb_peek(&tp->fec.rcv_queue); | |
| + if (cskb == NULL) | |
| + return -EINVAL; | |
| + | |
| + /* the oldest SKB is only dropped if the queue still holds | |
| + * at least FEC_RCV_QUEUE_LIMIT bytes without it | |
| + */ | |
| + data_len = TCP_SKB_CB(cskb)->end_seq - TCP_SKB_CB(cskb)->seq; | |
| + if (data_len > (u32)extra_bytes) | |
| + break; | |
| + | |
| + extra_bytes -= data_len; | |
| + tp->fec.bytes_rcv_queue -= data_len; | |
| + skb_unlink(cskb, &tp->fec.rcv_queue); | |
| + kfree_skb(cskb); | |
| + } | |
| + | |
| + return 0; | |
| +} | |
| + | |
| +/* Turns FEC off for this connection: resets the FEC type and the byte | |
| + * accounting and releases all SKB clones held in the FEC receive queue. | |
| + * Does nothing if FEC is not enabled on the socket. | |
| + */ | |
| +void tcp_fec_disable(struct sock *sk) | |
| +{ | |
| + struct tcp_sock *tp = tcp_sk(sk); | |
| + | |
| + if (tcp_fec_is_enabled(tp)) { | |
| + tp->fec.type = TCP_FEC_TYPE_NONE; | |
| + tp->fec.bytes_rcv_queue = 0; | |
| + skb_queue_purge(&tp->fec.rcv_queue); | |
| + } | |
| +} | |
| + | |
| +/* Processes the current packet in the buffer, treated as an FEC packet | |
| + * with XOR-encoded payload (assumes that options were already processed) | |
| + * | |
| + * The FEC payload is copied into a work buffer and XORed with every block | |
| + * of the encoded range that is available locally: first the in-sequence | |
| + * data kept in tp->fec.rcv_queue, then (behind the first gap) data from | |
| + * tp->out_of_order_queue. If all remaining data up to the end of the | |
| + * encoded range could be read, the buffer holds the missing bytes, which | |
| + * are handed to tcp_fec_recover(). | |
| + * | |
| + * Returns: negative code, if an error occurred (-ENOMEM here; any non-zero | |
| + * value returned by tcp_fec_recover() is passed through unchanged); | |
| + * positive code, otherwise (recovery status: FEC_NO_LOSS, | |
| + * FEC_LOSS_RECOVERED or FEC_LOSS_UNRECOVERED) | |
| + * @block_skip - Number of unencoded blocks between two encoded blocks | |
| + */ | |
| +static int tcp_fec_process_xor(struct sock *sk, const struct sk_buff *skb, | |
| + unsigned int block_skip) | |
| +{ | |
| + struct sk_buff *pskb; | |
| + struct tcp_sock *tp; | |
| + struct tcphdr *th; /* NOTE(review): assigned below but never read here */ | |
| + u32 next_seq, end_seq, rec_seq; | |
| + unsigned char *data, *block; | |
| + unsigned int i, offset, data_len, block_len, rec_len; | |
| + bool seen_loss; /* NOTE(review): assigned below but never read here */ | |
| + int ret; | |
| + | |
| + pskb = NULL; | |
| + tp = tcp_sk(sk); | |
| + th = tcp_hdr(skb); | |
| + next_seq = tp->rx_opt.fec.enc_seq; | |
| + end_seq = next_seq + tp->rx_opt.fec.enc_len; | |
| + block_len = skb->len - tcp_hdrlen(skb); /* FEC payload size */ | |
| + seen_loss = false; | |
| + offset = 0; | |
| + | |
| + /* memory allocation for decoding / recovered SKB data: | |
| + * the first half (data) accumulates the XOR result, the second | |
| + * half (block) is scratch space for blocks read from the queues | |
| + */ | |
| + data = kmalloc(2 * block_len, GFP_ATOMIC); | |
| + if (data == NULL) | |
| + return -ENOMEM; | |
| + | |
| + block = data + block_len; | |
| + | |
| + /* copy FEC payload (skip TCP header) */ | |
| + memcpy(data, skb->data + tcp_hdrlen(skb), block_len); | |
| + | |
| + /* process in-sequence data (read from the FEC receive queue) */ | |
| + while ((data_len = tcp_fec_get_next_block(sk, &pskb, | |
| + &tp->fec.rcv_queue, next_seq, | |
| + min(block_len, end_seq - next_seq), | |
| + block))) { | |
| + next_seq += data_len; | |
| + | |
| + /* XOR with existing payload */ | |
| + for (i = 0; i < data_len; i++) | |
| + data[i] ^= block[i]; | |
| + | |
| + /* we could not read a whole MSS block, which means we | |
| + * reached the end of the queue or end of range which the | |
| + * FEC packet covers | |
| + */ | |
| + if (data_len < block_len) | |
| + break; | |
| + | |
| + /* skip unencoded blocks if there is more data encoded | |
| + * (both values are unsigned, so the test below is true | |
| + * whenever next_seq differs from end_seq) | |
| + */ | |
| + if (end_seq - next_seq > 0) | |
| + next_seq += block_len * block_skip; | |
| + } | |
| + | |
| + /* check if all encoded bytes were already received */ | |
| + if (next_seq == end_seq) { | |
| + kfree(data); | |
| + return FEC_NO_LOSS; | |
| + } | |
| + | |
| + /* we always recover one whole MSS block (otherwise slicing | |
| + * would introduce a lot of additional complexity here) and handle | |
| + * cut out already received sequences later; | |
| + * offset is the length of the last read above, i.e. the position | |
| + * inside the XOR buffer at which the missing data starts | |
| + */ | |
| + rec_seq = next_seq; | |
| + rec_len = min(block_len, end_seq - rec_seq); | |
| + offset = data_len; | |
| + if ((rec_seq + rec_len) == end_seq) | |
| + goto recover; | |
| + | |
| + next_seq += block_len * (block_skip + 1); | |
| + pskb = NULL; | |
| + | |
| + /* read a possibly partial (smaller than MSS) block to fill up the | |
| + * previously unfilled block and achieve alignment again | |
| + * (data behind the gap is read from the out-of-order queue) | |
| + */ | |
| + data_len = tcp_fec_get_next_block(sk, &pskb, &tp->out_of_order_queue, | |
| + next_seq, block_len - offset, block); | |
| + | |
| + next_seq += data_len; | |
| + | |
| + /* check if we could not read as much data as requested */ | |
| + if ((next_seq != end_seq) && (data_len < (block_len - offset))) | |
| + goto clean; | |
| + | |
| + /* XOR with existing payload (into the tail of the buffer, | |
| + * starting at offset) | |
| + */ | |
| + for (i = 0; i < data_len; i++) | |
| + data[i+offset] ^= block[i]; | |
| + | |
| + /* skip unencoded blocks if there is more data encoded */ | |
| + if (end_seq - next_seq > 0) | |
| + next_seq += block_len * block_skip; | |
| + | |
| + /* read all necessary blocks to finish decoding */ | |
| + while ((data_len = tcp_fec_get_next_block(sk, &pskb, | |
| + &tp->out_of_order_queue, next_seq, | |
| + min(block_len, end_seq - next_seq), | |
| + block))) { | |
| + next_seq += data_len; | |
| + | |
| + /* XOR with existing payload */ | |
| + for (i = 0; i < data_len; i++) | |
| + data[i] ^= block[i]; | |
| + | |
| + /* we could not read a whole MSS block, which means we reached | |
| + * the end of the queue or end of range which the FEC packet | |
| + * covers | |
| + */ | |
| + if (data_len < block_len) | |
| + break; | |
| + | |
| + /* skip unencoded blocks if there is more data encoded */ | |
| + if (end_seq - next_seq > 0) | |
| + next_seq += block_len * block_skip; | |
| + } | |
| + | |
| + /* check if additional losses were observed (cannot recover) */ | |
| + if (next_seq != end_seq) | |
| + goto clean; | |
| + | |
| +recover: | |
| + /* create and process recovered packets: the XOR buffer is rotated | |
| + * by offset into the scratch block so that block[0] is the byte | |
| + * belonging to rec_seq | |
| + */ | |
| + for (i = 0; i < rec_len; i++) | |
| + block[i] = data[(offset + i) % block_len]; | |
| + | |
| + if (block_skip && ((block_len - offset) < rec_len)) { | |
| + /* recover non-consecutive sequence ranges (only when | |
| + * slicing is used): the first part ends with the current | |
| + * encoded block, the second part continues behind the | |
| + * skipped unencoded blocks | |
| + */ | |
| + u32 second_seq; | |
| + unsigned int second_seq_len, first_seq_len; | |
| + | |
| + first_seq_len = block_len - offset; | |
| + second_seq = rec_seq + first_seq_len + block_len * block_skip; | |
| + second_seq_len = rec_len - first_seq_len; | |
| + | |
| + ret = tcp_fec_recover(sk, skb, block, rec_seq, first_seq_len); | |
| + if (ret >= 0) { | |
| + int second_ret = tcp_fec_recover(sk, skb, | |
| + block + first_seq_len, | |
| + second_seq, second_seq_len); | |
| + if (second_ret < 0 || !ret) | |
| + ret = second_ret; | |
| + } | |
| + } else { | |
| + ret = tcp_fec_recover(sk, skb, block, rec_seq, rec_len); | |
| + } | |
| + | |
| + kfree(data); | |
| + return ret ? ret : FEC_LOSS_RECOVERED; | |
| + | |
| +clean: | |
| + kfree(data); | |
| + return FEC_LOSS_UNRECOVERED; | |
| +} | |
| + | |
| +/* Create a recovered packet and forward it to the reception routine. | |
| + * @sk: socket the recovered data belongs to | |
| + * @skb: received FEC packet, used as template for the recovered packet | |
| + * @data: recovered payload bytes | |
| + * @seq: sequence number of the first recovered byte | |
| + * @len: number of recovered bytes | |
| + * | |
| + * Returns 0 if a packet was built and handed to tcp_rcv_established(), | |
| + * FEC_NO_LOSS if nothing is left to recover after SACK trimming, or | |
| + * -EINVAL if the recovered packet could not be built. | |
| + */ | |
| +static int tcp_fec_recover(struct sock *sk, const struct sk_buff *skb, | |
| +			   unsigned char *data, u32 seq, int len) | |
| +{ | |
| +	struct tcp_sock *tp = tcp_sk(sk); | |
| +	struct sk_buff *rskb; | |
| + | |
| +	/* Check if we received some tail of the recovered sequence already | |
| +	 * by looking at the current SACK blocks (we don't want to recover | |
| +	 * more data than necessary to prevent DSACKS) | |
| +	 */ | |
| +	if (tcp_is_sack(tp)) { | |
| +		int i; | |
| + | |
| +		for (i = 0; i < tp->rx_opt.num_sacks; i++) { | |
| +			if (before(tp->selective_acks[i].start_seq, | |
| +				   seq + len) && | |
| +			    !before(tp->selective_acks[i].end_seq, | |
| +				    seq + len)) { | |
| +				len = tp->selective_acks[i].start_seq - seq; | |
| +				break; | |
| +			} | |
| +		} | |
| +	} | |
| + | |
| +	/* We might have prematurely asked for a recovery in the case where the | |
| +	 * whole recovery sequence is already covered by SACKs | |
| +	 */ | |
| +	if (len <= 0) | |
| +		return FEC_NO_LOSS; | |
| + | |
| +	/* Create decoded packet and forward to reception routine */ | |
| +	rskb = tcp_fec_make_decoded_pkt(sk, skb, data, seq, len); | |
| +	if (rskb == NULL) | |
| +		return -EINVAL; | |
| + | |
| +	/* Only now do we know that data is really being recovered, so only | |
| +	 * now tell the remote node that recovery was successful. The flag is | |
| +	 * set before the packet is processed so that it is already in place | |
| +	 * for whatever the reception routine sends in response. | |
| +	 */ | |
| +	tp->fec.flags |= TCP_FEC_RECOVERY_SUCCESSFUL; | |
| + | |
| +	tcp_rcv_established(sk, rskb, tcp_hdr(rskb), rskb->len); | |
| +	return 0; | |
| +} | |
| + | |
| +/* Sends an ACK for the FEC packet and encodes any congestion | |
| + * and/or recovery information. | |
| + * | |
| + * Right now an explicit outgoing ACK is only needed if FEC recovery | |
| + * failed; in all other cases ACKs are implicitly generated, so every | |
| + * other recovery status is a no-op here. | |
| + */ | |
| +static void tcp_fec_send_ack(struct sock *sk, const struct sk_buff *skb, | |
| +			     int recovery_status) | |
| +{ | |
| +	struct tcp_sock *tp = tcp_sk(sk); | |
| +	u32 enc_end; | |
| + | |
| +	if (recovery_status != FEC_LOSS_UNRECOVERED) | |
| +		return; | |
| + | |
| +	/* Report everything from rcv_nxt up to the end of the encoded | |
| +	 * range as lost | |
| +	 */ | |
| +	enc_end = tp->rx_opt.fec.enc_seq + tp->rx_opt.fec.enc_len; | |
| +	tp->fec.flags |= TCP_FEC_RECOVERY_FAILED; | |
| +	tp->fec.lost_len = enc_end - tp->rcv_nxt; | |
| +	tcp_send_ack(sk); | |
| +} | |
| + | |
| +/* Reduces the congestion window (similar to completed fast recovery). | |
| + * If the node is already in recovery mode, undo is disabled to enforce | |
| + * the window reduction upon completion. | |
| + */ | |
| +static void tcp_fec_reduce_window(struct sock *sk) | |
| +{ | |
| +	struct tcp_sock *tp = tcp_sk(sk); | |
| +	const struct inet_connection_sock *icsk = inet_csk(sk); | |
| + | |
| +	if (icsk->icsk_ca_state >= TCP_CA_CWR) { | |
| +		/* Socket is in some congestion mode and we only need to make | |
| +		 * sure that window reduction is executed when recovery | |
| +		 * is finished | |
| +		 */ | |
| +		tp->undo_marker = 0; | |
| +		return; | |
| +	} | |
| + | |
| +	/* Ask the congestion control module for the new threshold and clamp | |
| +	 * the window to it unless the threshold is still "infinite" | |
| +	 */ | |
| +	tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); | |
| +	if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH) { | |
| +		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); | |
| +		tp->snd_cwnd_stamp = tcp_time_stamp; | |
| +	} | |
| + | |
| +	/* Any future window reduction requests are ignored until | |
| +	 * snd_nxt is ACKed | |
| +	 */ | |
| +	tp->high_seq = tp->snd_nxt; | |
| +	tp->undo_marker = 0; | |
| +} | |
| + | |
| +/* The incoming ACK indicates a failed recovery. | |
| + * Mark all unacked SKBs in the loss range as lost. | |
| + * | |
| + * The loss range is [lost_seq, lost_seq + lost_len) as stored in | |
| + * tp->rx_opt.fec. The walk starts at tp->lost_skb_hint if it is set, | |
| + * otherwise at the head of the write queue, and stops at the send head | |
| + * or at the first SKB starting at or past the end of the range. | |
| + * | |
| + * TODO With interleaved coding, we have the additional constraint | |
| + * that the SKBs in the loss range also must have been encoded by the | |
| + * triggering FEC packet, and for that we need to keep some info | |
| + * about FEC packets on the sender side | |
| + */ | |
| +static void tcp_fec_mark_skbs_lost(struct sock *sk) | |
| +{ | |
| + struct tcp_sock *tp; | |
| + struct sk_buff *skb; | |
| + u32 start_seq, end_seq; | |
| + | |
| + tp = tcp_sk(sk); | |
| + /* NOTE(review): if lost_skb_hint points past start_seq, earlier SKBs | |
| + * in the loss range are never visited - confirm this is intended | |
| + */ | |
| + skb = tp->lost_skb_hint ? tp->lost_skb_hint : tcp_write_queue_head(sk); | |
| + | |
| + /* All SKBs falling completely in the range are marked */ | |
| + start_seq = tp->rx_opt.fec.lost_seq; | |
| + end_seq = tp->rx_opt.fec.lost_seq + tp->rx_opt.fec.lost_len; | |
| + | |
| + tcp_for_write_queue_from(skb, sk) { | |
| + /* Stop at the send head */ | |
| + if (skb == tcp_send_head(sk)) | |
| + break; | |
| + | |
| + /* Past loss range */ | |
| + if (!before(TCP_SKB_CB(skb)->seq, end_seq)) | |
| + break; | |
| + | |
| + /* SKB not (fully) within range */ | |
| + if (before(TCP_SKB_CB(skb)->seq, start_seq) || | |
| + after(TCP_SKB_CB(skb)->end_seq, end_seq)) | |
| + continue; | |
| + | |
| + /* SKB already marked lost, or already SACKed by the peer */ | |
| + if (TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED)) | |
| + continue; | |
| + | |
| + /* Verify retransmit hint before marking | |
| + * (see tcp_verify_retransmit_hint(), | |
| + * copied since method defined static in tcp_input.c) | |
| + */ | |
| + if ((tp->retransmit_skb_hint == NULL) || | |
| + before(TCP_SKB_CB(skb)->seq, | |
| + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) | |
| + tp->retransmit_skb_hint = skb; | |
| + | |
| + /* Raise retransmit_high to this SKB's end if nothing was marked | |
| + * lost so far or the SKB ends beyond the current value | |
| + */ | |
| + if (!tp->lost_out || | |
| + after(TCP_SKB_CB(skb)->end_seq, tp->retransmit_high)) | |
| + tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; | |
| + | |
| + /* Mark SKB as lost (see tcp_skb_mark_lost()) */ | |
| + tp->lost_out += tcp_skb_pcount(skb); | |
| + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; | |
| + } | |
| + | |
| + /* Run the stack's own check on the updated loss counters */ | |
| + tcp_verify_left_out(tp); | |
| +} | |
| + | |
| +/* Searches for the FEC option in the packet header and replaces | |
| + * the long option with a short one padded by NOPs. | |
| + * This is done to convert the option used by an encoded packet | |
| + * to the option used by a recovered packet. | |
| + * @skb: packet whose TCP header options are rewritten in place | |
| + * | |
| + * Returns true if the FEC option was found and rewritten, false if the | |
| + * option is missing or the option list is malformed. | |
| + */ | |
| +static bool tcp_fec_update_decoded_option(struct sk_buff *skb) | |
| +{ | |
| +	struct tcphdr *th = tcp_hdr(skb); | |
| +	unsigned char *ptr = (unsigned char *) (th + 1); | |
| +	int length = (th->doff * 4) - sizeof(struct tcphdr); | |
| + | |
| +	while (length > 0) { | |
| +		int opcode = *ptr++; | |
| +		int opsize; | |
| + | |
| +		switch (opcode) { | |
| +		case TCPOPT_EOL: | |
| +			return false; | |
| +		case TCPOPT_NOP: | |
| +			length--; | |
| +			continue; | |
| +		default: | |
| +			/* A length byte must still be inside the option area */ | |
| +			if (length < 2) | |
| +				return false; | |
| + | |
| +			opsize = *ptr++; | |
| +			if (opsize < 2 || opsize > length) | |
| +				return false; | |
| + | |
| +			/* The magic is only present if the option holds at | |
| +			 * least kind, length and two magic bytes | |
| +			 */ | |
| +			if (opcode == TCPOPT_EXP && opsize >= 4 && | |
| +			    get_unaligned_be16(ptr) == TCPOPT_FEC_MAGIC) { | |
| +				unsigned char *opt = ptr - 2; | |
| + | |
| +				/* The rewrite below touches 8 bytes; never | |
| +				 * write past the end of the option area | |
| +				 */ | |
| +				if (length < 8) | |
| +					return false; | |
| + | |
| +				/* Update FEC option: | |
| +				 * 1. Convert long option into short option | |
| +				 *    (length byte becomes 5) | |
| +				 * 2. Clear ENCODED flag (keep other flags) | |
| +				 * 3. Replace option value (long option) by NOPs | |
| +				 * | |
| +				 * TCP options carry no alignment guarantee, so | |
| +				 * both loads and stores use the unaligned | |
| +				 * accessors. | |
| +				 */ | |
| +				put_unaligned_be32((get_unaligned_be32(opt) & | |
| +						    0xFF00FFFF) | 0x00050000, | |
| +						   opt); | |
| +				put_unaligned_be32((get_unaligned_be32(opt + 4) & | |
| +						    0xEF000000) | 0x00010101, | |
| +						   opt + 4); | |
| + | |
| +				return true; | |
| +			} | |
| + | |
| +			ptr += opsize - 2; | |
| +			length -= opsize; | |
| +		} | |
| +	} | |
| + | |
| +	return false; | |
| +} | |
| + | |
| +/* Allocates an SKB for data we want to forward to reception routines | |
| + * (recovered data) by making a copy of the FEC SKB and replacing the data | |
| + * part, all other segments (options, etc.) are preserved | |
| + * @sk: socket the recovered data belongs to | |
| + * @skb: received FEC packet used as template (not modified) | |
| + * @dec_data: recovered payload bytes | |
| + * @seq: sequence number of the first recovered byte | |
| + * @len: number of recovered bytes | |
| + * | |
| + * Returns the new SKB (owned by the caller) or NULL if the copy could not | |
| + * be allocated or the FEC option was not found in the header. | |
| + */ | |
| +static struct sk_buff *tcp_fec_make_decoded_pkt(struct sock *sk, | |
| +						const struct sk_buff *skb, | |
| +						unsigned char *dec_data, | |
| +						u32 seq, unsigned int len) | |
| +{ | |
| +	struct tcp_sock *tp; | |
| +	struct sk_buff *nskb; | |
| + | |
| +	tp = tcp_sk(sk); | |
| +	nskb = skb_copy(skb, GFP_ATOMIC); | |
| +	if (nskb == NULL) | |
| +		return NULL; | |
| + | |
| +	/* Update FEC option for the new packet */ | |
| +	if (!tcp_fec_update_decoded_option(nskb)) { | |
| +		/* Technically we don't reach this method if there is no FEC | |
| +		 * option in the header, but if we do, the copy must be | |
| +		 * released here since nobody else holds a reference to it. | |
| +		 */ | |
| +		kfree_skb(nskb); | |
| +		return NULL; | |
| +	} | |
| + | |
| +	/* check if we received some tail of the recovered sequence already | |
| +	 * by looking at the current SACK blocks (we don't want to recover | |
| +	 * more data than necessary to prevent DSACKS) | |
| +	 */ | |
| +	if (tcp_is_sack(tp)) { | |
| +		int i; | |
| + | |
| +		for (i = 0; i < tp->rx_opt.num_sacks; i++) { | |
| +			if (before(tp->selective_acks[i].start_seq, | |
| +				   seq + len) && | |
| +			    !before(tp->selective_acks[i].end_seq, | |
| +				    seq + len)) { | |
| +				len = tp->selective_acks[i].start_seq - seq; | |
| +				break; | |
| +			} | |
| +		} | |
| +	} | |
| + | |
| +	/* trim data section to fit recovered sequence if necessary */ | |
| +	if (len < (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq)) | |
| +		skb_trim(nskb, len + tcp_hdrlen(nskb)); | |
| + | |
| +	/* fix the sequence numbers */ | |
| +	tcp_hdr(nskb)->seq = htonl(seq); | |
| +	tcp_hdr(nskb)->ack_seq = htonl(tp->snd_una); | |
| +	TCP_SKB_CB(nskb)->seq = seq; | |
| +	TCP_SKB_CB(nskb)->end_seq = seq + len; | |
| + | |
| +	/* replace SKB payload with recovered data */ | |
| +	memcpy(nskb->data + tcp_hdrlen(nskb), dec_data, len); | |
| + | |
| +	/* packets used for recovery had their checksums checked already */ | |
| +	nskb->ip_summed = CHECKSUM_UNNECESSARY; | |
| + | |
| +	return nskb; | |
| +} | |
| + | |
| +/* Gets the next byte block from an SKB queue (any SKB which is touched | |
| + * in this procedure will be linearized to simplify payload access) | |
| + * @sk - Socket (currently not referenced by this function) | |
| + * @skb - Points to SKB from which previous block was extracted (useful | |
| + *	for successive calls to this function, which avoids moving through | |
| + *	the whole queue again); if it points to NULL the search starts at | |
| + *	the head of the queue | |
| + * @queue - SKB queue to read from (SKB has to point to an element on this | |
| + *	queue) | |
| + * @seq - Sequence number of first byte in the block | |
| + * @block_len - Maximum number of bytes to copy | |
| + * @block - Destination buffer (at least @block_len bytes) | |
| + * | |
| + * Returns the bytes written to the block memory. This is never an error | |
| + * code: if an SKB cannot be linearized the copy stops and the number of | |
| + * bytes copied so far is returned, which callers see as a short read. | |
| + */ | |
| +static unsigned int tcp_fec_get_next_block(struct sock *sk, | |
| +				struct sk_buff **skb, | |
| +				struct sk_buff_head *queue, u32 seq, | |
| +				unsigned int block_len, unsigned char *block) | |
| +{ | |
| +	unsigned int cur_len = 0, offset, num_bytes; | |
| +	u32 end_seq; | |
| + | |
| +	/* Start at the head of the queue if the caller has no position yet */ | |
| +	if (*skb == NULL) { | |
| +		*skb = skb_peek(queue); | |
| +		if (*skb == NULL) | |
| +			return 0; | |
| +	} | |
| + | |
| +	/* move to SKB which stores the requested sequence number */ | |
| +	while (*skb) { | |
| +		/* If we observe an RST/SYN, we stop here to avoid | |
| +		 * handling corner cases | |
| +		 */ | |
| +		if (TCP_SKB_CB(*skb)->tcp_flags & | |
| +		    (TCPHDR_RST | | |
| +		     TCPHDR_SYN)) | |
| +			return 0; | |
| +		if (!before(seq, TCP_SKB_CB(*skb)->seq) && | |
| +		    before(seq, TCP_SKB_CB(*skb)->end_seq)) | |
| +			break; | |
| +		if (*skb == skb_peek_tail(queue)) { | |
| +			*skb = NULL; | |
| +			break; | |
| +		} | |
| + | |
| +		*skb = skb_queue_next(queue, *skb); | |
| +	} | |
| + | |
| +	if (*skb == NULL) | |
| +		return 0; | |
| + | |
| +	/* copy bytes from SKBs (connected sequences) */ | |
| +	while (*skb && (cur_len < block_len)) { | |
| +		/* The return type is unsigned, so a negative errno must not | |
| +		 * be returned here: callers would take it for a huge block | |
| +		 * length and overrun their buffers. | |
| +		 */ | |
| +		if (skb_linearize(*skb)) | |
| +			return cur_len; | |
| + | |
| +		/* Deal with the end seq number being incremented by | |
| +		 * one if the FIN flag is set (we don't want to encode this) | |
| +		 */ | |
| +		end_seq = TCP_SKB_CB(*skb)->end_seq; | |
| +		if (TCP_SKB_CB(*skb)->tcp_flags & TCPHDR_FIN) | |
| +			end_seq--; | |
| + | |
| +		/* Sequence numbers wrap, so use before() instead of plain | |
| +		 * relational operators (as the search loop above does) | |
| +		 */ | |
| +		if (!before(seq, TCP_SKB_CB(*skb)->seq) && | |
| +		    before(seq, end_seq)) { | |
| +			/* Copy data depending on: | |
| +			 * - remaining space in the block | |
| +			 * - remaining data in the SKB | |
| +			 */ | |
| +			offset = seq - TCP_SKB_CB(*skb)->seq; | |
| +			num_bytes = min(block_len - cur_len, | |
| +					end_seq - seq); | |
| + | |
| +			memcpy(block + cur_len, (*skb)->data + offset, | |
| +			       num_bytes); | |
| +			cur_len += num_bytes; | |
| +			seq += num_bytes; | |
| +		} | |
| + | |
| +		if (*skb == skb_peek_tail(queue) || cur_len >= block_len) | |
| +			break; | |
| + | |
| +		*skb = skb_queue_next(queue, *skb); | |
| +	} | |
| + | |
| +	return cur_len; | |
| +} | |
| + | |
| +/* Arms the timer for a delayed FEC transmission if there is | |
| + * no earlier timeout defined (i.e. retransmission timeout) | |
| + * | |
| + * Nothing is armed if the connection is not established or if all data | |
| + * up to snd_nxt has already been encoded. | |
| + */ | |
| +void tcp_fec_arm_timer(struct sock *sk) | |
| +{ | |
| +	struct inet_connection_sock *icsk = inet_csk(sk); | |
| +	struct tcp_sock *tp = tcp_sk(sk); | |
| +	unsigned long timeout; | |
| +	u32 delta; | |
| + | |
| +	/* Only arm a timer if connection is established */ | |
| +	if (sk->sk_state != TCP_ESTABLISHED) | |
| +		return; | |
| + | |
| +	/* Forward next sequence to be encoded if unencoded data was acked */ | |
| +	if (after(tp->snd_una, tp->fec.next_seq)) | |
| +		tp->fec.next_seq = tp->snd_una; | |
| + | |
| +	/* Don't arm the timer if there is no unencoded data left */ | |
| +	if (!before(tp->fec.next_seq, tp->snd_nxt)) | |
| +		return; | |
| + | |
| +	/* TODO handle other timers which might be armed; | |
| +	 * EARLY_RETRANS? LOSS_PROBE? | |
| +	 */ | |
| + | |
| +	/* Compute timeout (currently 0.25 * RTT). srtt_us holds the smoothed | |
| +	 * RTT in microseconds (scaled by 8), while the timer API and | |
| +	 * icsk_timeout work in jiffies, so the value has to be converted. | |
| +	 */ | |
| +	timeout = usecs_to_jiffies((tp->srtt_us >> 3) >> 2); | |
| + | |
| +	/* Compute delay between transmission of original packet and this call | |
| +	 * (difference is subtracted from timeout value); the delay is not | |
| +	 * measured yet, so delta is a placeholder fixed at zero. | |
| +	 */ | |
| +	delta = 0; | |
| +	if (delta > timeout) { | |
| +		tcp_fec_invoke_nodelay(sk); | |
| +		return; | |
| +	} else if (delta > 0) { | |
| +		timeout -= delta; | |
| +	} | |
| + | |
| +	/* Do not replace a timeout occurring earlier (wrap-safe compare). | |
| +	 * NOTE(review): icsk_timeout is compared even when no timer is | |
| +	 * pending - confirm a stale value cannot block arming here. | |
| +	 */ | |
| +	if (time_after_eq(jiffies + timeout, icsk->icsk_timeout)) | |
| +		return; | |
| + | |
| +	inet_csk_reset_xmit_timer(sk, ICSK_TIME_FEC, timeout, TCP_RTO_MAX); | |
| +} | |
| + | |
| +/* The FEC timer fired. Force an FEC transmission for the | |
| + * last unencoded burst. Rearm the RTO timer (which was switched | |
| + * out when setting the FEC timer). Set a new FEC timer if there | |
| + * is pending unencoded data. | |
| + */ | |
| +void tcp_fec_timer(struct sock *sk) | |
| +{ | |
| +	tcp_fec_invoke_nodelay(sk); | |
| + | |
| +	/* Clear the pending timer type before rearming the RTO */ | |
| +	inet_csk(sk)->icsk_pending = 0; | |
| +	tcp_rearm_rto(sk); | |
| + | |
| +	tcp_fec_arm_timer(sk); | |
| +} | |
| + | |
| +/* If FEC packet transmissions are delayed (TCP_FEC_DELAYED_SEND defined) | |
| + * set a timer unless an FEC timer is already pending and return 0; | |
| + * otherwise invoke the FEC mechanism immediately and return its result. | |
| + */ | |
| +int tcp_fec_invoke(struct sock *sk) | |
| +{ | |
| +#ifndef TCP_FEC_DELAYED_SEND | |
| +	return tcp_fec_invoke_nodelay(sk); | |
| +#else | |
| +	const struct inet_connection_sock *icsk = inet_csk(sk); | |
| + | |
| +	/* Set the timer for sending an FEC packet if no FEC | |
| +	 * timer is active yet | |
| +	 */ | |
| +	if (!icsk->icsk_pending || icsk->icsk_pending != ICSK_TIME_FEC) | |
| +		tcp_fec_arm_timer(sk); | |
| + | |
| +	return 0; | |
| +#endif | |
| +} | |
| + | |
| +/* Invokes the FEC mechanism set for the connection; | |
| + * Creates and sends out FEC packets | |
| + * | |
| + * Returns 0 on success, -ENOMEM if the temporary queue head cannot be | |
| + * allocated, or the error reported by packet creation/transmission. | |
| + * All queued SKBs and their FEC structs are released before returning. | |
| + */ | |
| +int tcp_fec_invoke_nodelay(struct sock *sk) | |
| +{ | |
| +	struct sk_buff_head *queue; | |
| +	struct sk_buff *cur; | |
| +	int rc; | |
| + | |
| +	queue = kmalloc(sizeof(struct sk_buff_head), GFP_ATOMIC); | |
| +	if (!queue) | |
| +		return -ENOMEM; | |
| + | |
| +	skb_queue_head_init(queue); | |
| + | |
| +	/* Only transmit if packet creation succeeded */ | |
| +	rc = tcp_fec_create(sk, queue); | |
| +	if (!rc) | |
| +		rc = tcp_fec_xmit_all(sk, queue); | |
| + | |
| +	/* Purge all SKBs (purge FEC structs first); kfree() accepts NULL */ | |
| +	for (cur = (struct sk_buff *) queue; | |
| +	     !skb_queue_is_last(queue, cur); ) { | |
| +		cur = skb_queue_next(queue, cur); | |
| +		kfree(TCP_SKB_CB(cur)->fec); | |
| +		TCP_SKB_CB(cur)->fec = NULL; | |
| +	} | |
| + | |
| +	skb_queue_purge(queue); | |
| +	kfree(queue); | |
| + | |
| +	/* TODO error handling; -ENOMEM, etc. - disable FEC? */ | |
| + | |
| +	return rc; | |
| +} | |
| + | |
| +/* Creates one or more FEC packets (can depend on the FEC type used) | |
| + * and puts them in a queue | |
| + * @list: queue head | |
| + * | |
| + * Returns 0 on success (also when no FEC type is set) or the error | |
| + * returned by the encoder. | |
| + */ | |
| +static int tcp_fec_create(struct sock *sk, struct sk_buff_head *list) | |
| +{ | |
| +	struct tcp_sock *tp = tcp_sk(sk); | |
| +	unsigned int start, mss, limit; | |
| +	int rc; | |
| + | |
| +	/* Update the pointer to the first byte to be encoded next | |
| +	 * (this only matters when a packet was ACKed before it was | |
| +	 * encoded) | |
| +	 */ | |
| +	if (after(tp->snd_una, tp->fec.next_seq)) | |
| +		tp->fec.next_seq = tp->snd_una; | |
| + | |
| +	start = tp->fec.next_seq; | |
| +	mss = tcp_current_mss(sk); | |
| + | |
| +	/* NOTE(review): this looks like a byte count, but the encoder | |
| +	 * documents its last argument as a number of blocks - confirm | |
| +	 */ | |
| +	limit = FEC_RCV_QUEUE_LIMIT - mss; | |
| + | |
| +	if (tp->fec.type == TCP_FEC_TYPE_XOR_ALL) | |
| +		return tcp_fec_create_xor(sk, list, start, mss, 0, limit); | |
| + | |
| +	if (tp->fec.type == TCP_FEC_TYPE_XOR_SKIP_1) { | |
| +		/* Two interleaved packets: one over the even blocks and one | |
| +		 * over the odd blocks | |
| +		 */ | |
| +		rc = tcp_fec_create_xor(sk, list, start, mss, 1, limit); | |
| +		if (rc) | |
| +			return rc; | |
| + | |
| +		return tcp_fec_create_xor(sk, list, start + mss, mss, 1, | |
| +					  limit); | |
| +	} | |
| + | |
| +	/* TCP_FEC_TYPE_NONE or an unknown type: nothing to encode */ | |
| +	return 0; | |
| +} | |
| + | |
| +/* Creates FEC packet(s) using XOR encoding | |
| + * (allocates memory for the FEC structs) | |
| + * @sk - Socket whose write queue is encoded | |
| + * @list - Queue the created FEC packets are appended to | |
| + * @first_seq - Sequence number of first byte to be encoded | |
| + * @block_len - Block length (typically MSS) | |
| + * @block_skip - Number of unencoded blocks between two encoded blocks | |
| + * @max_encoded_per_pkt - maximum number of blocks encoded per packet | |
| + *	(0, if unlimited) | |
| + * | |
| + * Only data below snd_nxt is encoded. On success tp->fec.next_seq is | |
| + * advanced and 0 is returned; on failure -ENOMEM or -EINVAL is returned. | |
| + * Packets already appended to @list own their FEC struct and stay queued | |
| + * in either case. | |
| + */ | |
| +static int tcp_fec_create_xor(struct sock *sk, struct sk_buff_head *list, | |
| +			      unsigned int first_seq, unsigned int block_len, | |
| +			      unsigned int block_skip, | |
| +			      unsigned int max_encoded_per_pkt) | |
| +{ | |
| +	struct tcp_sock *tp = tcp_sk(sk); | |
| +	struct sk_buff *skb = NULL, *fskb; | |
| +	struct tcp_fec *fec; | |
| +	unsigned int c_encoded = 0;	/* Number of currently encoded blocks | |
| +					 * not yet added to an FEC packet | |
| +					 */ | |
| +	u32 next_seq = first_seq;	/* Next byte to encode */ | |
| +	unsigned int i, data_len; | |
| +	unsigned char *data, *block; | |
| +	int err = 0; | |
| + | |
| +	/* memory allocation | |
| +	 * data - used temporarily to obtain byte blocks and store the payload | |
| +	 *	(is freed before returning; we need two blocks here to store | |
| +	 *	the previously XORed data that has not been added to an FEC | |
| +	 *	packet yet, and the new to-be XORed data extracted from one | |
| +	 *	or more existing buffers) | |
| +	 * fec - used to store the FEC parameters | |
| +	 *	(is freed after the corresponding packet is forwarded to the | |
| +	 *	transmission routine) | |
| +	 */ | |
| +	data = kzalloc(2 * block_len, GFP_ATOMIC); | |
| +	if (data == NULL) | |
| +		return -ENOMEM; | |
| + | |
| +	fec = kzalloc(sizeof(struct tcp_fec), GFP_ATOMIC); | |
| +	if (fec == NULL) { | |
| +		err = -ENOMEM; | |
| +		goto out; | |
| +	} | |
| + | |
| +	block = data + block_len; | |
| + | |
| +	/* encode data blocks | |
| +	 * XXX atomicity check? | |
| +	 * | |
| +	 * The loop is bounded by snd_nxt explicitly: skipping blocks can | |
| +	 * move next_seq past snd_nxt, and then "snd_nxt - next_seq" would | |
| +	 * wrap around and unsent data from the write queue could be encoded. | |
| +	 */ | |
| +	fec->enc_seq = next_seq; | |
| +	while (before(next_seq, tp->snd_nxt)) { | |
| +		data_len = tcp_fec_get_next_block(sk, &skb, | |
| +				&sk->sk_write_queue, next_seq, | |
| +				min(block_len, tp->snd_nxt - next_seq), | |
| +				block); | |
| + | |
| +		/* No more data, or a length that cannot be valid (never XOR | |
| +		 * beyond the block buffer) | |
| +		 */ | |
| +		if (!data_len || data_len > block_len) | |
| +			break; | |
| + | |
| +		/* Check if we reached the encoding limit; then create packet | |
| +		 * with current payload and add it to the queue | |
| +		 */ | |
| +		if (max_encoded_per_pkt > 0 && | |
| +		    c_encoded >= max_encoded_per_pkt) { | |
| +			fskb = tcp_fec_make_encoded_pkt(sk, fec, data, | |
| +							block_len); | |
| +			if (fskb == NULL) { | |
| +				err = -EINVAL; | |
| +				goto out_fec; | |
| +			} | |
| + | |
| +			/* fec is owned by the queued packet from here on */ | |
| +			skb_queue_tail(list, fskb); | |
| +			memset(data, 0, block_len); | |
| +			c_encoded = 0; | |
| + | |
| +			/* memory allocation for the FEC struct of the next | |
| +			 * packet | |
| +			 */ | |
| +			fec = kzalloc(sizeof(struct tcp_fec), GFP_ATOMIC); | |
| +			if (fec == NULL) { | |
| +				err = -ENOMEM; | |
| +				goto out; | |
| +			} | |
| + | |
| +			fec->enc_seq = next_seq; | |
| +		} | |
| + | |
| +		next_seq += data_len; | |
| +		fec->enc_len = next_seq - fec->enc_seq; | |
| + | |
| +		/* encode block into existing payload (XOR) */ | |
| +		for (i = 0; i < data_len; i++) | |
| +			data[i] ^= block[i]; | |
| + | |
| +		c_encoded++; | |
| + | |
| +		/* skip over blocks which are not requested for encoding */ | |
| +		next_seq += block_len * block_skip; | |
| +	} | |
| + | |
| +	/* create final packet if some data was selected for encoding */ | |
| +	if (c_encoded > 0) { | |
| +		fskb = tcp_fec_make_encoded_pkt(sk, fec, data, block_len); | |
| +		if (fskb == NULL) { | |
| +			err = -EINVAL; | |
| +			goto out_fec; | |
| +		} | |
| + | |
| +		skb_queue_tail(list, fskb); | |
| +		fec = NULL;	/* owned by the queued packet now */ | |
| +	} | |
| + | |
| +	tp->fec.next_seq = next_seq; | |
| + | |
| +out_fec: | |
| +	/* Frees the FEC struct only if no packet took ownership of it */ | |
| +	kfree(fec); | |
| +out: | |
| +	kfree(data); | |
| +	return err; | |
| +} | |
| + | |
| +/* Allocates an SKB for data we want to send and assigns | |
| + * the necessary options and fields | |
| + * @sk: socket the packet is created for | |
| + * @fec: FEC parameters; marked as encoded and attached to the SKB's | |
| + *	control buffer | |
| + * @enc_data: encoded payload | |
| + * @len: payload length (capped at fec->enc_len) | |
| + * | |
| + * Returns the new SKB or NULL if the allocation failed. | |
| + */ | |
| +static struct sk_buff *tcp_fec_make_encoded_pkt(struct sock *sk, | |
| +						struct tcp_fec *fec, | |
| +						unsigned char *enc_data, | |
| +						unsigned int len) | |
| +{ | |
| +	struct sk_buff *pkt; | |
| + | |
| +	/* Never send more payload than was actually encoded */ | |
| +	len = min(len, fec->enc_len); | |
| + | |
| +	/* See tcp_make_synack(); 15 probably for tail pointer etc.? */ | |
| +	pkt = alloc_skb(MAX_TCP_HEADER + 15 + len, GFP_ATOMIC); | |
| +	if (!pkt) | |
| +		return NULL; | |
| + | |
| +	/* Leave headroom for the headers */ | |
| +	skb_reserve(pkt, MAX_TCP_HEADER); | |
| + | |
| +	/* Control buffer: sequence number, FEC struct address and the ACK | |
| +	 * flag (required for all data packets) | |
| +	 */ | |
| +	fec->flags |= TCP_FEC_ENCODED; | |
| +	TCP_SKB_CB(pkt)->seq = fec->enc_seq; | |
| +	TCP_SKB_CB(pkt)->fec = fec; | |
| +	TCP_SKB_CB(pkt)->tcp_flags = TCPHDR_ACK; | |
| + | |
| +	/* GSO parameters: a single segment */ | |
| +	skb_shinfo(pkt)->gso_segs = 1; | |
| +	skb_shinfo(pkt)->gso_size = 0; | |
| +	skb_shinfo(pkt)->gso_type = 0; | |
| + | |
| +	/* Append payload to SKB */ | |
| +	memcpy(skb_put(pkt, len), enc_data, len); | |
| + | |
| +	pkt->ip_summed = CHECKSUM_PARTIAL; | |
| + | |
| +	return pkt; | |
| +} | |
| + | |
| +/* Transmit all FEC packets in a list, in queue order. | |
| + * Returns 0 if the list is NULL or empty or if every packet was sent; | |
| + * otherwise stops at the first failure and returns its error code. | |
| + */ | |
| +static int tcp_fec_xmit_all(struct sock *sk, struct sk_buff_head *list) | |
| +{ | |
| +	struct sk_buff *cur; | |
| +	int rc; | |
| + | |
| +	if (!list || skb_queue_empty(list)) | |
| +		return 0; | |
| + | |
| +	/* The queue head serves as the start position of the walk */ | |
| +	for (cur = (struct sk_buff *) list; | |
| +	     !skb_queue_is_last(list, cur); ) { | |
| +		cur = skb_queue_next(list, cur); | |
| +		rc = tcp_fec_xmit(sk, cur); | |
| +		if (rc) | |
| +			return rc; | |
| +	} | |
| + | |
| +	return 0; | |
| +} | |
| + | |
| +/* Transmits an FEC packet by handing it to tcp_transmit_skb() with | |
| + * GFP_ATOMIC and returns that function's result. | |
| + * NOTE(review): the third argument (1) is presumably the clone flag, so | |
| + * the caller would keep ownership of @skb - confirm against | |
| + * tcp_transmit_skb(). | |
| + */ | |
| +static int tcp_fec_xmit(struct sock *sk, struct sk_buff *skb) | |
| +{ | |
| + /* TODO timers? no retransmissions, but want to deactivate FEC | |
| + * if we never get any FEC ACKs back | |
| + */ | |
| + return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); | |
| +} | |
| diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c | |
| index f83ddf9..b640461 100644 | |
| --- a/net/ipv4/tcp_input.c | |
| +++ b/net/ipv4/tcp_input.c | |
| @@ -70,6 +70,7 @@ | |
| #include <linux/kernel.h> | |
| #include <net/dst.h> | |
| #include <net/tcp.h> | |
| +#include <net/tcp_fec.h> | |
| #include <net/inet_common.h> | |
| #include <linux/ipsec.h> | |
| #include <asm/unaligned.h> | |
| @@ -106,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; | |
| #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ | |
| #define FLAG_DATA_SACKED 0x20 /* New SACK. */ | |
| #define FLAG_ECE 0x40 /* ECE in this ACK */ | |
| +#define FLAG_FEC_CWR_REQUESTED 0x80 /* cwnd reduction requested */ | |
| #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ | |
| #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ | |
| #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ | |
| @@ -115,8 +117,9 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; | |
| #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) | |
| #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) | |
| -#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) | |
| +#define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE|FLAG_FEC_CWR_REQUESTED) | |
| #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) | |
| +#define FLAG_CONGESTION (FLAG_ECE|FLAG_FEC_CWR_REQUESTED) | |
| #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) | |
| #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) | |
| @@ -2546,7 +2549,11 @@ void tcp_enter_cwr(struct sock *sk) | |
| struct tcp_sock *tp = tcp_sk(sk); | |
| tp->prior_ssthresh = 0; | |
| - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { | |
| + | |
| + /* | |
| + * !!! TCP FEC patch !!! | |
| + */ | |
| + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR && after(tp->snd_una, tp->high_seq)) { | |
| tp->undo_marker = 0; | |
| tcp_init_cwnd_reduction(sk); | |
| tcp_set_ca_state(sk, TCP_CA_CWR); | |
| @@ -2968,6 +2975,12 @@ void tcp_rearm_rto(struct sock *sk) | |
| if (tp->fastopen_rsk) | |
| return; | |
| + /* Don't rearm the timer if an FEC timer is active. | |
| + * The FEC handler will rearm the timer once the event is handled. | |
| + */ | |
| + if (icsk->icsk_pending == ICSK_TIME_FEC) | |
| + return; | |
| + | |
| if (!tp->packets_out) { | |
| inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); | |
| } else { | |
| @@ -3228,16 +3241,23 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag) | |
| /* Decide wheather to run the increase function of congestion control. */ | |
| static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag) | |
| { | |
| + const struct tcp_sock *tp = tcp_sk(sk); | |
| if (tcp_in_cwnd_reduction(sk)) | |
| return false; | |
| + /* | |
| + * !!! TCP FEC patch !!! | |
| + */ | |
| + if ((flag & FLAG_CONGESTION) && !(tp->snd_cwnd < tp->snd_ssthresh)) | |
| + return false; | |
| + | |
| /* If reordering is high then always grow cwnd whenever data is | |
| * delivered regardless of its ordering. Otherwise stay conservative | |
| * and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/ | |
| * new SACK or ECE mark may first advance cwnd here and later reduce | |
| * cwnd in tcp_fastretrans_alert() based on more states. | |
| */ | |
| - if (tcp_sk(sk)->reordering > sysctl_tcp_reordering) | |
| + if (tp->reordering > sysctl_tcp_reordering) | |
| return flag & FLAG_FORWARD_PROGRESS; | |
| return flag & FLAG_DATA_ACKED; | |
| @@ -3425,6 +3445,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) | |
| icsk->icsk_retransmits = 0; | |
| } | |
| + /* Check if FEC expects and executes a window reduction */ | |
| + if (tcp_fec_is_enabled(tp) && tcp_fec_check_ack(sk, ack)) | |
| + flag |= FLAG_FEC_CWR_REQUESTED; | |
| + | |
| prior_fackets = tp->fackets_out; | |
| /* ts_recent update must be made after we are sure that the packet | |
| @@ -3656,6 +3680,20 @@ void tcp_parse_options(const struct sk_buff *skb, | |
| break; | |
| case TCPOPT_EXP: | |
| + /* | |
| + * !!! TCP FEC patch !!! | |
| + */ | |
| + if (sysctl_tcp_fec && | |
| + get_unaligned_be16(ptr) == | |
| + TCPOPT_FEC_MAGIC) { | |
| + tcp_fec_decode_option(&(opt_rx->fec), | |
| + ntohl(th->seq), | |
| + ntohl(th->ack_seq), th->syn, | |
| + ptr + 2, | |
| + opsize - TCPOLEN_EXP_FEC_BASE); | |
| + break; | |
| + } | |
| + | |
| /* Fast Open option shares code 254 using a | |
| * 16 bits magic number. | |
| */ | |
| @@ -4173,6 +4211,12 @@ static void tcp_ofo_queue(struct sock *sk) | |
| tp->rcv_nxt, TCP_SKB_CB(skb)->seq, | |
| TCP_SKB_CB(skb)->end_seq); | |
| + /* | |
| + * !!! TCP FEC patch !!! | |
| + */ | |
| + if (tcp_fec_is_enabled(tp)) | |
| + tcp_fec_update_queue(sk, skb); | |
| + | |
| tail = skb_peek_tail(&sk->sk_receive_queue); | |
| eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); | |
| tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | |
| @@ -4410,6 +4454,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) | |
| goto out_of_window; | |
| /* Ok. In sequence. In window. */ | |
| + if (tcp_fec_is_enabled(tp)) | |
| + tcp_fec_update_queue(sk, skb); | |
| + | |
| if (tp->ucopy.task == current && | |
| tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && | |
| sock_owned_by_user(sk) && !tp->urg_data) { | |
| @@ -4715,6 +4762,12 @@ static int tcp_prune_queue(struct sock *sk) | |
| tp->copied_seq, tp->rcv_nxt); | |
| sk_mem_reclaim(sk); | |
| + /* Disable FEC if it was enabled to prevent keeping data | |
| + * in the receive queue longer than necessary | |
| + */ | |
| + if (tcp_fec_is_enabled(tp)) | |
| + tcp_fec_disable(sk); | |
| + | |
| if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) | |
| return 0; | |
| @@ -4998,6 +5051,21 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, | |
| /* Reset is accepted even if it did not pass PAWS. */ | |
| } | |
| + /* Special processing if FEC is enabled */ | |
| + if (tcp_fec_is_enabled(tp)) { | |
| + if (tcp_fec_is_encoded(tp)) { | |
| + tcp_fec_process(sk, skb); | |
| + goto discard; | |
| + } else if (!tp->rx_opt.fec.saw_fec && th->ack && | |
| + sk->sk_state == TCP_LAST_ACK) { | |
| + /* TODO Sometimes the FEC option is not appended to the | |
| + * FIN-ACK packet; socket options cleared? | |
| + */ | |
| + tcp_ack(sk, skb, FLAG_SLOWPATH); | |
| + goto discard; | |
| + } | |
| + } | |
| + | |
| /* Step 1: check sequence number */ | |
| if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { | |
| /* RFC793, page 37: "In all states except SYN-SENT, all reset | |
| @@ -5099,6 +5167,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |
| */ | |
| tp->rx_opt.saw_tstamp = 0; | |
| + tp->rx_opt.fec.saw_fec = 0; | |
| /* pred_flags is 0xS?10 << 16 + snd_wnd | |
| * if header_prediction is to be made | |
| @@ -5461,6 +5530,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, | |
| if (tcp_is_sack(tp) && sysctl_tcp_fack) | |
| tcp_enable_fack(tp); | |
| + /* | |
| + * FEC negotiation | |
| + * Disable FEC if both ends do not agree on the FEC type used | |
| + */ | |
| + if (tp->fec.type != tp->rx_opt.fec.type) { | |
| + tp->fec.type = 0; | |
| + tp->rx_opt.fec.type = 0; | |
| + } | |
| + | |
| tcp_mtup_init(sk); | |
| tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); | |
| tcp_initialize_rcv_mss(sk); | |
| @@ -5735,6 +5813,11 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, | |
| tcp_initialize_rcv_mss(sk); | |
| tcp_fast_path_on(tp); | |
| + | |
| + /* SYN requested FEC usage */ | |
| + if (tp->rx_opt.fec.type > 0) | |
| + tp->fec.type = tp->rx_opt.fec.type; | |
| + | |
| break; | |
| case TCP_FIN_WAIT1: { | |
| diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c | |
| index d886b60..5efbc2e 100644 | |
| --- a/net/ipv4/tcp_ipv4.c | |
| +++ b/net/ipv4/tcp_ipv4.c | |
| @@ -73,6 +73,9 @@ | |
| #include <net/timewait_sock.h> | |
| #include <net/xfrm.h> | |
| #include <net/secure_seq.h> | |
| + | |
| +#include <net/tcp_fec.h> | |
| + | |
| #include <net/tcp_memcontrol.h> | |
| #include <net/busy_poll.h> | |
| @@ -212,6 +215,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) | |
| tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; | |
| + memset(&(tp->rx_opt.fec), 0, sizeof(struct tcp_fec)); | |
| + | |
| /* Socket identity is still unknown (sport may be zero). | |
| * However we set state to SYN-SENT and not releasing socket | |
| * lock select source port, enter ourselves into the hash tables and | |
| @@ -2270,7 +2275,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) | |
| if (icsk->icsk_pending == ICSK_TIME_RETRANS || | |
| icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || | |
| - icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { | |
| + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE || | |
| + icsk->icsk_pending == ICSK_TIME_FEC) { | |
| timer_active = 1; | |
| timer_expires = icsk->icsk_timeout; | |
| } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { | |
| diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c | |
| index 927586e..f59faf9 100644 | |
| --- a/net/ipv4/tcp_minisocks.c | |
| +++ b/net/ipv4/tcp_minisocks.c | |
| @@ -552,6 +552,15 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, | |
| newtp->fastopen_rsk = NULL; | |
| newtp->syn_data_acked = 0; | |
| + newtp->high_seq = newtp->snd_nxt; | |
| + | |
| + /* TCP FEC option */ | |
| + newtp->rx_opt.fec.type = sysctl_tcp_fec ? req->fec_type : 0; | |
| + newtp->fec.type = newtp->fec.flags = 0; | |
| + newtp->fec.next_seq = newtp->snd_nxt; | |
| + newtp->fec.bytes_rcv_queue = 0; | |
| + skb_queue_head_init(&newtp->fec.rcv_queue); | |
| + | |
| TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); | |
| } | |
| return newsk; | |
| diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c | |
| index ddd2a6f..7791899 100644 | |
| --- a/net/ipv4/tcp_output.c | |
| +++ b/net/ipv4/tcp_output.c | |
| @@ -37,6 +37,7 @@ | |
| #define pr_fmt(fmt) "TCP: " fmt | |
| #include <net/tcp.h> | |
| +#include <net/tcp_fec.h> | |
| #include <linux/compiler.h> | |
| #include <linux/gfp.h> | |
| @@ -65,6 +66,12 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; | |
| /* By default, RFC2861 behavior. */ | |
| int sysctl_tcp_slow_start_after_idle __read_mostly = 1; | |
| +/* | |
| + * !!! TCP FEC patch !!! | |
| + */ | |
| +int sysctl_tcp_fec __read_mostly; | |
| + | |
| + | |
| unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX; | |
| EXPORT_SYMBOL(sysctl_tcp_notsent_lowat); | |
| @@ -422,6 +429,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) | |
| #define OPTION_MD5 (1 << 2) | |
| #define OPTION_WSCALE (1 << 3) | |
| #define OPTION_FAST_OPEN_COOKIE (1 << 8) | |
| +#define OPTION_FEC (1 << 9) | |
| struct tcp_out_options { | |
| u16 options; /* bit field of OPTION_* */ | |
| @@ -432,6 +440,7 @@ struct tcp_out_options { | |
| __u8 *hash_location; /* temporary pointer, overloaded */ | |
| __u32 tsval, tsecr; /* need to include OPTION_TS */ | |
| struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ | |
| + struct tcp_fec fec; /* FEC parameters */ | |
| }; | |
| /* Write previously computed TCP options to the packet. | |
| @@ -540,6 +549,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, | |
| } | |
| ptr += (len + 3) >> 2; | |
| } | |
| + | |
| + if (unlikely(OPTION_FEC & options)) | |
| + tcp_fec_encode_option(tp, &(opts->fec), &ptr); | |
| } | |
| /* Compute TCP options for SYN packets. This is not the final | |
| @@ -607,6 +619,14 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, | |
| } | |
| } | |
| + /* Prepare for FEC negotiation if requested */ | |
| + if (unlikely(tcp_fec_is_enabled(tp)) && | |
| + remaining >= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED) { | |
| + opts->options |= OPTION_FEC; | |
| + opts->fec.type = tp->fec.type; | |
| + remaining -= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED; | |
| + } | |
| + | |
| return MAX_TCP_OPTION_SPACE - remaining; | |
| } | |
| @@ -671,6 +691,16 @@ static unsigned int tcp_synack_options(struct sock *sk, | |
| } | |
| } | |
| + /* Handle request for FEC support from other side | |
| + * (respond with same FEC option if FEC is locally supported) | |
| + */ | |
| + if (sysctl_tcp_fec && unlikely(req->fec_type) && | |
| + remaining >= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED) { | |
| + opts->options |= OPTION_FEC; | |
| + opts->fec.type = req->fec_type; | |
| + remaining -= TCPOLEN_EXP_FEC_NEGOTIATION_ALIGNED; | |
| + } | |
| + | |
| return MAX_TCP_OPTION_SPACE - remaining; | |
| } | |
| @@ -681,6 +711,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb | |
| struct tcp_out_options *opts, | |
| struct tcp_md5sig_key **md5) | |
| { | |
| + struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; | |
| struct tcp_sock *tp = tcp_sk(sk); | |
| unsigned int size = 0; | |
| unsigned int eff_sacks; | |
| @@ -715,6 +746,19 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb | |
| opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; | |
| } | |
| + /* Prepare option if connection has FEC enabled */ | |
| + if (tcp_fec_is_enabled(tp)) { | |
| + opts->options |= OPTION_FEC; | |
| + if (tcb && tcb->fec) | |
| + opts->fec = *(tcb->fec); | |
| + | |
| + /* regardless of packet type we need 4 more bytes | |
| + * including alignment | |
| + */ | |
| + size += 4; | |
| + size += TCPOLEN_EXP_FEC_BASE; | |
| + } | |
| + | |
| return size; | |
| } | |
| @@ -895,7 +939,7 @@ void tcp_wfree(struct sk_buff *skb) | |
| * We are working here with either a clone of the original | |
| * SKB, or a fresh unique copy made by the retransmit engine. | |
| */ | |
| -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |
| +int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, | |
| gfp_t gfp_mask) | |
| { | |
| const struct inet_connection_sock *icsk = inet_csk(sk); | |
| @@ -2055,6 +2099,9 @@ repair: | |
| break; | |
| } | |
| + if (tcp_fec_is_enabled(tp)) | |
| + tcp_fec_invoke(sk); | |
| + | |
| if (likely(sent_pkts)) { | |
| if (tcp_in_cwnd_reduction(sk)) | |
| tp->prr_out += sent_pkts; | |
| @@ -3153,6 +3200,12 @@ int tcp_connect(struct sock *sk) | |
| */ | |
| tp->snd_nxt = tp->write_seq; | |
| tp->pushed_seq = tp->write_seq; | |
| + | |
| + /* Initialize FEC members */ | |
| + tp->fec.next_seq = tp->snd_nxt; | |
| + tp->fec.bytes_rcv_queue = 0; | |
| + skb_queue_head_init(&tp->fec.rcv_queue); | |
| + | |
| TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); | |
| /* Timer for repeating the SYN until an answer. */ | |
| diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c | |
| index dceaacc..b78ea8f 100644 | |
| --- a/net/ipv4/tcp_timer.c | |
| +++ b/net/ipv4/tcp_timer.c | |
| @@ -21,6 +21,7 @@ | |
| #include <linux/module.h> | |
| #include <linux/gfp.h> | |
| #include <net/tcp.h> | |
| +#include <net/tcp_fec.h> | |
| int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; | |
| int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES; | |
| @@ -478,7 +479,15 @@ out_reset_timer: | |
| if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0)) | |
| __sk_dst_reset(sk); | |
| -out:; | |
| +out: | |
| + /* FEC will switch out the RTO timer if a delayed FEC transmission | |
| + * should happen earlier than this. RTO timer will be switched in | |
| + * once the FEC timer has fired. | |
| + * FEC transmissions during a loss episode require that the sysctl | |
| + * value is >= 2. | |
| + */ | |
| + if (tcp_fec_is_enabled(tp) && sysctl_tcp_fec >= 2) | |
| + tcp_fec_arm_timer(sk); | |
| } | |
| void tcp_write_timer_handler(struct sock *sk) | |
| @@ -503,6 +512,9 @@ void tcp_write_timer_handler(struct sock *sk) | |
| case ICSK_TIME_LOSS_PROBE: | |
| tcp_send_loss_probe(sk); | |
| break; | |
| + case ICSK_TIME_FEC: | |
| + tcp_fec_timer(sk); | |
| + break; | |
| case ICSK_TIME_RETRANS: | |
| icsk->icsk_pending = 0; | |
| tcp_retransmit_timer(sk); | |
| diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c | |
| index c5078c5..d5205c6 100644 | |
| --- a/net/ipv6/tcp_ipv6.c | |
| +++ b/net/ipv6/tcp_ipv6.c | |
| @@ -288,6 +288,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, | |
| tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); | |
| + memset(&(tp->rx_opt.fec), 0, sizeof(struct tcp_fec)); | |
| + | |
| inet->inet_dport = usin->sin6_port; | |
| tcp_set_state(sk, TCP_SYN_SENT); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment