tcp_input.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *      Pedro Roque     :   Fast Retransmit/Recovery.
 *                          Two receive queues.
 *                          Retransmit queue handled by TCP.
 *                          Better retransmit timer handling.
 *                          New congestion avoidance.
 *                          Header prediction.
 *                          Variable renaming.
 *
 *      Eric            :   Fast Retransmit.
 *      Randy Scott     :   MSS option defines.
 *      Eric Schenk     :   Fixes to slow start algorithm.
 *      Eric Schenk     :   Yet another double ACK bug.
 *      Eric Schenk     :   Delayed ACK bug fixes.
 *      Eric Schenk     :   Floyd style fast retrans war avoidance.
 *      David S. Miller :   Don't allow zero congestion window.
 *      Eric Schenk     :   Fix retransmitter so that it sends
 *                          next packet on ack of previous packet.
 *      Andi Kleen      :   Moved open_request checking here
 *                          and process RSTs for open_requests.
 *      Andi Kleen      :   Better prune_queue, and other fixes.
 *      Andrey Savochkin:   Fix RTT measurements in the presence of
 *                          timestamps.
 *      Andrey Savochkin:   Check sequence numbers correctly when
 *                          removing SACKs due to in sequence incoming
 *                          data segments.
 *      Andi Kleen:         Make sure we never ack data there is not
 *                          enough room for. Also make this condition
 *                          a fatal error if it might still happen.
 *      Andi Kleen:         Add tcp_measure_rcv_mss to make
 *                          connections with MSS<min(MTU,ann. MSS)
 *                          work without delayed acks.
 *      Andi Kleen:         Process packets with PSH set in the
 *                          fast path.
 *      J Hadi Salim:       ECN support
 *      Andrei Gurtov,
 *      Pasi Sarolahti,
 *      Panu Kuhlberg:      Experimental audit of TCP (re)transmission
 *                          engine. Lots of bugs are found.
 *      Pasi Sarolahti:     F-RTO for dealing with spurious RTOs
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
#include <asm/unaligned.h>
#include <net/netdma.h>

int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
int sysctl_tcp_ecn __read_mostly;
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 2;

int sysctl_tcp_stdurg __read_mostly;
int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_nometrics_save __read_mostly;

int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;
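/*
 * Illustrative note (not part of the original source): the tunables above
 * are exposed under the net.ipv4 sysctl tree, so the compiled-in defaults
 * can be read or overridden at runtime, for example:
 *
 *      sysctl net.ipv4.tcp_frto          (reports 2, matching sysctl_tcp_frto)
 *      sysctl -w net.ipv4.tcp_ecn=1      (flips sysctl_tcp_ecn)
 */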
#define FLAG_DATA               0x01    /* Incoming frame contained data.           */
#define FLAG_WIN_UPDATE         0x02    /* Incoming ACK was a window update.        */
#define FLAG_DATA_ACKED         0x04    /* This ACK acknowledged new data.          */
#define FLAG_RETRANS_DATA_ACKED 0x08    /* "" "" some of which was retransmitted.   */
#define FLAG_SYN_ACKED          0x10    /* This ACK acknowledged SYN.               */
#define FLAG_DATA_SACKED        0x20    /* New SACK.                                */
#define FLAG_ECE                0x40    /* ECE in this ACK                          */
#define FLAG_DATA_LOST          0x80    /* SACK detected data lossage.              */
#define FLAG_SLOWPATH           0x100   /* Do not skip RFC checks for window update. */
#define FLAG_ONLY_ORIG_SACKED   0x200   /* SACKs only non-rexmit sent before RTO    */
#define FLAG_SND_UNA_ADVANCED   0x400   /* Snd_una was changed (!= FLAG_DATA_ACKED) */
#define FLAG_DSACKING_ACK       0x800   /* SACK blocks contained D-SACK info        */
#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed      */

#define FLAG_ACKED              (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP            (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define FLAG_CA_ALERT           (FLAG_DATA_SACKED|FLAG_ECE)
#define FLAG_FORWARD_PROGRESS   (FLAG_ACKED|FLAG_DATA_SACKED)
#define FLAG_ANY_PROGRESS       (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED)

#define IsSackFrto() (sysctl_tcp_frto == 0x2)

#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))

/* Adapt the MSS value used to make the delayed ACK decision to the
 * real world.
 */
static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        const unsigned int lss = icsk->icsk_ack.last_seg_size;
        unsigned int len;

        icsk->icsk_ack.last_seg_size = 0;

        /* skb->len may jitter because of SACKs, even if the peer
         * sends good full-sized frames.
         */
        len = skb_shinfo(skb)->gso_size ? : skb->len;
        if (len >= icsk->icsk_ack.rcv_mss) {
                icsk->icsk_ack.rcv_mss = len;
        } else {
                /* Otherwise, we make a more careful check, taking into
                 * account that the SACK block is variable.
                 *
                 * "len" is the invariant segment length, including the TCP header.
                 */
                len += skb->data - skb_transport_header(skb);
                if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
                    /* If PSH is not set, the packet should be
                     * full sized, provided the peer TCP is not badly broken.
                     * This observation (if it is correct 8)) allows us
                     * to handle super-low MTU links fairly.
                     */
                    (len >= TCP_MIN_MSS + sizeof(struct tcphdr) &&
                     !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) {
                        /* Also subtract the invariant part (if the peer is
                         * RFC compliant): TCP header plus fixed timestamp
                         * option length. The resulting "len" is the MSS,
                         * free of SACK jitter.
                         */
                        len -= tcp_sk(sk)->tcp_header_len;
                        icsk->icsk_ack.last_seg_size = len;
                        if (len == lss) {
                                icsk->icsk_ack.rcv_mss = len;
                                return;
                        }
                }
                if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
                        icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
                icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
        }
}

static void tcp_incr_quickack(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);
        unsigned quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);

        if (quickacks == 0)
                quickacks = 2;
        if (quickacks > icsk->icsk_ack.quick)
                icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
}
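/*
 * Worked example (illustrative, not in the original file): with
 * rcv_wnd = 64 KB and rcv_mss = 1460 bytes, tcp_incr_quickack() computes
 * 65536 / (2 * 1460) = 22 quick ACKs, which is then capped by
 * TCP_MAX_QUICKACKS; a window small enough to yield 0 is bumped to 2 so
 * that at least a couple of segments are ACKed without delay.
 */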
void tcp_enter_quickack_mode(struct sock *sk)
{
        struct inet_connection_sock *icsk = inet_csk(sk);

        tcp_incr_quickack(sk);
        icsk->icsk_ack.pingpong = 0;
        icsk->icsk_ack.ato = TCP_ATO_MIN;
}

/* Send ACKs quickly if the "quick" count is not exhausted
 * and the session is not interactive.
 */
static inline int tcp_in_quickack_mode(const struct sock *sk)
{
        const struct inet_connection_sock *icsk = inet_csk(sk);

        return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong;
}

static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp)
{
        if (tp->ecn_flags & TCP_ECN_OK)
                tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
}

static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb)
{
        if (tcp_hdr(skb)->cwr)
                tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp)
{
        tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}

static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb)
{
        if (tp->ecn_flags & TCP_ECN_OK) {
                if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags))
                        tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
                /* Funny extension: if ECT is not set on a segment,
                 * it is surely a retransmit. This is not in the ECN RFC,
                 * but Linux follows this rule.
                 */
                else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags)))
                        tcp_enter_quickack_mode((struct sock *)tp);
        }
}

static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th)
{
        if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr))
                tp->ecn_flags &= ~TCP_ECN_OK;
}

static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th)
{
        if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr))
                tp->ecn_flags &= ~TCP_ECN_OK;
}

static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th)
{
        if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK))
                return 1;
        return 0;
}

/* Buffer size and advertised window tuning.
 *
 * 1. Tuning sk->sk_sndbuf, when the connection enters the established state.
 */
static void tcp_fixup_sndbuf(struct sock *sk)
{
        int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 +
                     sizeof(struct sk_buff);

        if (sk->sk_sndbuf < 3 * sndmem)
                sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]);
}

/* 2. Tuning the advertised window (window_clamp, rcv_ssthresh)
 *
 * All of tcp_full_space() is split into two parts: the "network" buffer,
 * allocated forward and advertised in the receiver window (tp->rcv_wnd),
 * and the "application buffer", required to isolate scheduling/application
 * latencies from the network.
 *
 * window_clamp is the maximal advertised window. It can be less than
 * tcp_full_space(), in which case tcp_full_space() - window_clamp
 * is reserved for the "application" buffer. The smaller window_clamp is,
 * the smoother our behaviour from the network's viewpoint, but the lower
 * the throughput and the higher the connection's sensitivity to losses. 8)
 *
 * rcv_ssthresh is a stricter window_clamp used during the "slow start"
 * phase to predict the further behaviour of this connection.
 * It is used for two goals:
 *   - to enforce header prediction at the sender, even when the application
 *     requires some significant "application buffer". This is check #1.
 *   - to prevent pruning of the receive queue because of misprediction
 *     of the receiver window. Check #2.
 *
 * The scheme does not work when the sender sends good segments opening
 * the window and then starts to feed us spaghetti. But it should work
 * in common situations. Otherwise, we have to rely on queue collapsing.
 */
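/*
 * Example of the split described above (illustrative, not in the original
 * file; it assumes tcp_win_from_space() from net/tcp.h of this vintage,
 * i.e. space - space/4 with the default sysctl_tcp_adv_win_scale = 2):
 * a 256 KB rcvbuf gives tcp_full_space() of about 192 KB; clamping
 * window_clamp to 128 KB leaves roughly 64 KB as the "application buffer"
 * that absorbs scheduling latency instead of being advertised to the peer.
 */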
/* Slow part of check #2. */
static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Optimize this! */
        int truesize = tcp_win_from_space(skb->truesize) / 2;
        int window = tcp_win_from_space(sysctl_tcp_rmem[2]) / 2;

        while (tp->rcv_ssthresh <= window) {
                if (truesize <= skb->len)
                        return 2 * inet_csk(sk)->icsk_ack.rcv_mss;

                truesize >>= 1;
                window >>= 1;
        }
        return 0;
}

static void tcp_grow_window(struct sock *sk, struct sk_buff *skb)
{
        struct tcp_sock *tp = tcp_sk(sk);

        /* Check #1 */
        if (tp->rcv_ssthresh < tp->window_clamp &&
            (int)tp->rcv_ssthresh < tcp_space(sk) &&
            !tcp_memory_pressure) {
                int incr;

                /* Check #2. Increase the window if an skb with this much
                 * overhead will fit into the rcvbuf in the future.
                 */
                if (tcp_win_from_space(skb->truesize) <= skb->len)
                        incr = 2 * tp->advmss;
                else
                        incr = __tcp_grow_window(sk, skb);

                if (incr) {
                        tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr,
                                               tp->window_clamp);
                        inet_csk(sk)->icsk_ack.quick |= 1;
                }
        }
}

/* 3. Tuning rcvbuf, when the connection enters the established state. */
static void tcp_fixup_rcvbuf(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff);

        /* Try to select rcvbuf so that 4 MSS-sized segments will fit into
         * the window and the corresponding skbs will fit into our rcvbuf.
         * (was 3; 4 is the minimum to allow fast retransmit to work.)
         */
        while (tcp_win_from_space(rcvmem) < tp->advmss)
                rcvmem += 128;
        if (sk->sk_rcvbuf < 4 * rcvmem)
                sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]);
}

/* 4. Try to fix up everything. This is done immediately after the
 *    connection enters the established state.
 */
static void tcp_init_buffer_space(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        int maxwin;

        if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
                tcp_fixup_rcvbuf(sk);
        if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
                tcp_fixup_sndbuf(sk);

        tp->rcvq_space.space = tp->rcv_wnd;

        maxwin = tcp_full_space(sk);

        if (tp->window_clamp >= maxwin) {
                tp->window_clamp = maxwin;

                if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss)
                        tp->window_clamp = max(maxwin -
                                               (maxwin >> sysctl_tcp_app_win),
                                               4 * tp->advmss);
        }

        /* Force reservation of one segment. */
        if (sysctl_tcp_app_win &&
            tp->window_clamp > 2 * tp->advmss &&
            tp->window_clamp + tp->advmss > maxwin)
                tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);

        tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
        tp->snd_cwnd_stamp = tcp_time_stamp;
}

/* 5. Recalculate the window clamp after the socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);

        icsk->icsk_ack.quick = 0;

        if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
            !tcp_memory_pressure &&
            atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
                sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
                                    sysctl_tcp_rmem[2]);
        }
        if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
                tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}

/* Initialize the RCV_MSS value.
 * RCV_MSS is our guess about the MSS used by the peer.
 * We have no direct information about the MSS.
 * It's better to underestimate RCV_MSS than to overestimate it.
 * Overestimations make us ACK less frequently than needed.
 * Underestimations are easier to detect and fix with tcp_measure_rcv_mss().
 */
void tcp_initialize_rcv_mss(struct sock *sk)
{
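        /*
         * Sketch of the remaining body (reconstructed from mainline kernels
         * of this era, not verbatim from this listing): start from what we
         * advertise/cache ourselves and clamp it into a conservative range,
         * since underestimating rcv_mss is the safe direction.
         */
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

        hint = min(hint, tp->rcv_wnd / 2);
        hint = min(hint, TCP_MIN_RCVMSS);
        hint = max(hint, TCP_MIN_MSS);

        inet_csk(sk)->icsk_ack.rcv_mss = hint;
}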