tcp_input.c
        int cnt = 0;

        /* Reduce ssthresh if it has not yet been made inside this window. */
        if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq ||
            (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) {
                tp->prior_ssthresh = tcp_current_ssthresh(tp);
                tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
        }
        tp->snd_cwnd       = 1;
        tp->snd_cwnd_cnt   = 0;
        tp->snd_cwnd_stamp = tcp_time_stamp;

        tcp_clear_retrans(tp);

        /* Push undo marker, if it was plain RTO and nothing
         * was retransmitted. */
        if (!how)
                tp->undo_marker = tp->snd_una;

        sk_stream_for_retrans_queue(skb, sk) {
                cnt += tcp_skb_pcount(skb);
                if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS)
                        tp->undo_marker = 0;
                TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED;
                if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) {
                        TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED;
                        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                        tcp_inc_pcount(&tp->lost_out, skb);
                } else {
                        tcp_inc_pcount(&tp->sacked_out, skb);
                        tcp_set_pcount(&tp->fackets_out, cnt);
                }
        }
        tcp_sync_left_out(tp);

        tp->reordering = min_t(unsigned int, tp->reordering,
                               sysctl_tcp_reordering);
        tcp_set_ca_state(tp, TCP_CA_Loss);
        tp->high_seq = tp->snd_nxt;
        TCP_ECN_queue_cwr(tp);
}

static int tcp_check_sack_reneging(struct sock *sk, struct tcp_opt *tp)
{
        struct sk_buff *skb;

        /* If ACK arrived pointing to a remembered SACK,
         * it means that our remembered SACKs do not reflect
         * real state of receiver i.e.
         * receiver _host_ is heavily congested (or buggy).
         * Do processing similar to RTO timeout.
         */
        if ((skb = skb_peek(&sk->sk_write_queue)) != NULL &&
            (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
                NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING);

                tcp_enter_loss(sk, 1);
                tp->retransmits++;
                tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
                tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
                return 1;
        }
        return 0;
}

static inline int tcp_fackets_out(struct tcp_opt *tp)
{
        return IsReno(tp) ? tcp_get_pcount(&tp->sacked_out)+1 :
                            tcp_get_pcount(&tp->fackets_out);
}

static inline int tcp_skb_timedout(struct tcp_opt *tp, struct sk_buff *skb)
{
        return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto);
}

static inline int tcp_head_timedout(struct sock *sk, struct tcp_opt *tp)
{
        return tcp_get_pcount(&tp->packets_out) &&
               tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue));
}

/* Linux NewReno/SACK/FACK/ECN state machine.
 * --------------------------------------
 *
 * "Open"       Normal state, no dubious events, fast path.
 * "Disorder"   In all the respects it is "Open",
 *              but requires a bit more attention. It is entered when
 *              we see some SACKs or dupacks. It is split of "Open"
 *              mainly to move some processing from fast path to slow one.
 * "CWR"        CWND was reduced due to some Congestion Notification event.
 *              It can be ECN, ICMP source quench, local device congestion.
 * "Recovery"   CWND was reduced, we are fast-retransmitting.
 * "Loss"       CWND was reduced due to RTO timeout or SACK reneging.
 *
 * tcp_fastretrans_alert() is entered:
 * - each incoming ACK, if state is not "Open"
 * - when arrived ACK is unusual, namely:
 *      * SACK
 *      * Duplicate ACK.
 *      * ECN ECE.
 *
 * Counting packets in flight is pretty simple.
 *
 *      in_flight = packets_out - left_out + retrans_out
 *
 *      packets_out is SND.NXT-SND.UNA counted in packets.
 *
 *      retrans_out is number of retransmitted segments.
 *
 *      left_out is number of segments left network, but not ACKed yet.
 *
 *              left_out = sacked_out + lost_out
 *
 *      sacked_out: Packets, which arrived to receiver out of order
 *              and hence not ACKed. With SACKs this number is simply
 *              amount of SACKed data. Even without SACKs
 *              it is easy to give pretty reliable estimate of this number,
 *              counting duplicate ACKs.
 *
 *      lost_out: Packets lost by network. TCP has no explicit
 *              "loss notification" feedback from network (for now).
 *              It means that this number can be only _guessed_.
 *              Actually, it is the heuristics to predict lossage that
 *              distinguishes different algorithms.
 *
 *      F.e. after RTO, when all the queue is considered as lost,
 *      lost_out = packets_out and in_flight = retrans_out.
 *
 *              Essentially, we have now two algorithms counting
 *              lost packets.
 *
 *              FACK: It is the simplest heuristics. As soon as we decided
 *              that something is lost, we decide that _all_ not SACKed
 *              packets until the most forward SACK are lost. I.e.
 *              lost_out = fackets_out - sacked_out and left_out = fackets_out.
 *              It is absolutely correct estimate, if network does not reorder
 *              packets. And it loses any connection to reality when reordering
 *              takes place. We use FACK by default until reordering
 *              is suspected on the path to this destination.
 *
 *              NewReno: when Recovery is entered, we assume that one segment
 *              is lost (classic Reno). While we are in Recovery and
 *              a partial ACK arrives, we assume that one more packet
 *              is lost (NewReno). This heuristics are the same in NewReno
 *              and SACK.
 *
 *  Imagine, that's all! Forget about all this shamanism about CWND inflation
 *  deflation etc. CWND is real congestion window, never inflated, changes
 *  only according to classic VJ rules.
 *
 *  Really tricky (and requiring careful tuning) part of algorithm
 *  is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue().
 *  The first determines the moment _when_ we should reduce CWND and,
 *  hence, slow down forward transmission. In fact, it determines the moment
 *  when we decide that hole is caused by loss, rather than by a reorder.
 *
 *  tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill
 *  holes, caused by lost packets.
 *
 *  And the most logically complicated part of algorithm is undo
 *  heuristics. We detect false retransmits due to both too early
 *  fast retransmit (reordering) and underestimated RTO, analyzing
 *  timestamps and D-SACKs. When we detect that some segments were
 *  retransmitted by mistake and CWND reduction was wrong, we undo
 *  window reduction and abort recovery phase. This logic is hidden
 *  inside several functions named tcp_try_undo_<something>.
 */

/* This function decides, when we should leave Disordered state
 * and enter Recovery phase, reducing congestion window.
 *
 * Main question: may we further continue forward transmission
 * with the same cwnd?
 */
static int
tcp_time_to_recover(struct sock *sk, struct tcp_opt *tp)
{
        __u32 packets_out;

        /* Trick#1: The loss is proven. */
        if (tcp_get_pcount(&tp->lost_out))
                return 1;

        /* Not-A-Trick#2 : Classic rule... */
        if (tcp_fackets_out(tp) > tp->reordering)
                return 1;

        /* Trick#3 : when we use RFC2988 timer restart, fast
         * retransmit can be triggered by timeout of queue head.
         */
        if (tcp_head_timedout(sk, tp))
                return 1;

        /* Trick#4: It is still not OK... But will it be useful to delay
         * recovery more?
         */
        packets_out = tcp_get_pcount(&tp->packets_out);
        if (packets_out <= tp->reordering &&
            tcp_get_pcount(&tp->sacked_out) >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
            !tcp_may_send_now(sk, tp)) {
                /* We have nothing to send. This connection is limited
                 * either by receiver window or by application.
                 */
                return 1;
        }

        return 0;
}

/* If we receive more dupacks than we expected counting segments
 * in assumption of absent reordering, interpret this as reordering.
 * The only another reason could be bug in receiver TCP.
 */
static void tcp_check_reno_reordering(struct tcp_opt *tp, int addend)
{
        u32 holes;

        holes = max(tcp_get_pcount(&tp->lost_out), 1U);
        holes = min(holes, tcp_get_pcount(&tp->packets_out));

        if ((tcp_get_pcount(&tp->sacked_out) + holes) >
            tcp_get_pcount(&tp->packets_out)) {
                tcp_set_pcount(&tp->sacked_out,
                               (tcp_get_pcount(&tp->packets_out) - holes));
                tcp_update_reordering(tp,
                                      tcp_get_pcount(&tp->packets_out)+addend,
                                      0);
        }
}

/* Emulate SACKs for SACKless connection: account for a new dupack. */

static void tcp_add_reno_sack(struct tcp_opt *tp)
{
        tcp_inc_pcount_explicit(&tp->sacked_out, 1);
        tcp_check_reno_reordering(tp, 0);
        tcp_sync_left_out(tp);
}

/* Account for ACK, ACKing some data in Reno Recovery phase. */

static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_opt *tp, int acked)
{
        if (acked > 0) {
                /* One ACK acked hole. The rest eat duplicate ACKs. */
                if (acked-1 >= tcp_get_pcount(&tp->sacked_out))
                        tcp_set_pcount(&tp->sacked_out, 0);
                else
                        tcp_dec_pcount_explicit(&tp->sacked_out, acked-1);
        }
        tcp_check_reno_reordering(tp, acked);
        tcp_sync_left_out(tp);
}

static inline void tcp_reset_reno_sack(struct tcp_opt *tp)
{
        tcp_set_pcount(&tp->sacked_out, 0);
        tcp_set_pcount(&tp->left_out, tcp_get_pcount(&tp->lost_out));
}

/* Mark head of queue up as lost. */
static void
tcp_mark_head_lost(struct sock *sk, struct tcp_opt *tp,
                   int packets, u32 high_seq)
{
        struct sk_buff *skb;
        int cnt = packets;

        BUG_TRAP(cnt <= tcp_get_pcount(&tp->packets_out));

        sk_stream_for_retrans_queue(skb, sk) {
                cnt -= tcp_skb_pcount(skb);
                if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq))
                        break;
                if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
                        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                        tcp_inc_pcount(&tp->lost_out, skb);
                }
        }
        tcp_sync_left_out(tp);
}

/* Account newly detected lost packet(s) */

static void tcp_update_scoreboard(struct sock *sk, struct tcp_opt *tp)
{
        if (IsFack(tp)) {
                int lost = tcp_get_pcount(&tp->fackets_out) - tp->reordering;
                if (lost <= 0)
                        lost = 1;
                tcp_mark_head_lost(sk, tp, lost, tp->high_seq);
        } else {
                tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
        }

        /* New heuristics: it is possible only after we switched
         * to restart timer each time when something is ACKed.
         * Hence, we can detect timed out packets during fast
         * retransmit without falling to slow start.
         */
        if (tcp_head_timedout(sk, tp)) {
                struct sk_buff *skb;

                sk_stream_for_retrans_queue(skb, sk) {
                        if (tcp_skb_timedout(tp, skb) &&
                            !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
                                TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                                tcp_inc_pcount(&tp->lost_out, skb);
                        }
                }
                tcp_sync_left_out(tp);
        }
}

/* CWND moderation, preventing bursts due to too big ACKs
 * in dubious situations.
 */
static __inline__ void tcp_moderate_cwnd(struct tcp_opt *tp)
{
        tp->snd_cwnd = min(tp->snd_cwnd,
                           tcp_packets_in_flight(tp)+tcp_max_burst(tp));
        tp->snd_cwnd_stamp = tcp_time_stamp;
}

/* Decrease cwnd each second ack. */
static void tcp_cwnd_down(struct tcp_opt *tp)
{
        int decr = tp->snd_cwnd_cnt + 1;
        __u32 limit;

        /*
         * TCP Westwood
         * Here limit is evaluated as BWestimation*RTTmin (for obtaining it
         * in packets we use mss_cache). If sysctl_tcp_westwood is off
         * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is
         * still used as usual. It prevents other strange cases in which
         * BWE*RTTmin could assume value 0. It should not happen but...
         */
        if (!(limit = tcp_westwood_bw_rttmin(tp)))
                limit = tp->snd_ssthresh/2;

        tp->snd_cwnd_cnt = decr&1;
        decr >>= 1;

        if (decr && tp->snd_cwnd > limit)
                tp->snd_cwnd -= decr;

        tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
        tp->snd_cwnd_stamp = tcp_time_stamp;
}

/* Nothing was retransmitted or returned timestamp is less
 * than timestamp of the first retransmission.
 */
static __inline__ int tcp_packet_delayed(struct tcp_opt *tp)
{
        return !tp->retrans_stamp ||
                (tp->saw_tstamp && tp->rcv_tsecr &&
                 (__s32)(tp->rcv_tsecr - tp->retrans_stamp) < 0);
}

/* Undo procedures. */
#if FASTRETRANS_DEBUG > 1
static void DBGUNDO(struct sock *sk, struct tcp_opt *tp, const char *msg)
{
        struct inet_opt *inet = inet_sk(sk);
        printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
               msg,
               NIPQUAD(inet->daddr), ntohs(inet->dport),
               tp->snd_cwnd, tcp_get_pcount(&tp->left_out),
               tp->snd_ssthresh, tp->prior_ssthresh,
               tcp_get_pcount(&tp->packets_out));
}
#else
#define DBGUNDO(x...) do { } while (0)
#endif

static void tcp_undo_cwr(struct tcp_opt *tp, int undo)
{
        if (tp->prior_ssthresh) {
                tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1);

                if (undo && tp->prior_ssthresh > tp->snd_ssthresh) {
                        tp->snd_ssthresh = tp->prior_ssthresh;
                        TCP_ECN_withdraw_cwr(tp);
                }
        } else {
                tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
        }
        tcp_moderate_cwnd(tp);
        tp->snd_cwnd_stamp = tcp_time_stamp;
}

static inline int tcp_may_undo(struct tcp_opt *tp)
{
        return tp->undo_marker &&
                (!tp->undo_retrans || tcp_packet_delayed(tp));
}

/* People celebrate: "We love our President!" */
static int tcp_try_undo_recovery(struct sock *sk, struct tcp_opt *tp)
{
        if (tcp_may_undo(tp)) {
                /* Happy end! We did not retransmit anything
                 * or our original transmission succeeded.
                 */
                DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans");
                tcp_undo_cwr(tp, 1);
                if (tp->ca_state == TCP_CA_Loss)
                        NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
                else
                        NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO);
                tp->undo_marker = 0;
        }
        if (tp->snd_una == tp->high_seq && IsReno(tp)) {
                /* Hold old state until something *above* high_seq
                 * is ACKed. For Reno it is MUST to prevent false
                 * fast retransmits (RFC2582). SACK TCP is safe. */
                tcp_moderate_cwnd(tp);
                return 1;
        }
        tcp_set_ca_state(tp, TCP_CA_Open);
        return 0;
}
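/*
 * Illustrative sketch, not part of tcp_input.c: the packet accounting
 * described in the state-machine comment above, written out as a small
 * user-space program.  The struct and field names here are stand-ins
 * chosen to mirror tcp_opt; only the arithmetic comes from the comment.
 */
#include <assert.h>
#include <stdio.h>

struct flight_counters {
        unsigned int packets_out;       /* SND.NXT - SND.UNA, in packets */
        unsigned int sacked_out;        /* SACKed (or dupack-estimated) packets */
        unsigned int lost_out;          /* packets the sender guesses were lost */
        unsigned int retrans_out;       /* retransmitted packets */
};

/* left_out = sacked_out + lost_out */
static unsigned int left_out(const struct flight_counters *c)
{
        return c->sacked_out + c->lost_out;
}

/* in_flight = packets_out - left_out + retrans_out */
static unsigned int in_flight(const struct flight_counters *c)
{
        return c->packets_out - left_out(c) + c->retrans_out;
}

int main(void)
{
        /* After an RTO the whole queue is considered lost, so
         * lost_out == packets_out and in_flight == retrans_out. */
        struct flight_counters after_rto = {
                .packets_out = 10, .sacked_out = 0,
                .lost_out = 10, .retrans_out = 3,
        };

        assert(in_flight(&after_rto) == after_rto.retrans_out);
        printf("in_flight after RTO: %u\n", in_flight(&after_rto));
        return 0;
}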
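/*
 * Illustrative sketch, not part of tcp_input.c: the FACK loss estimate
 * from the comment above and from tcp_update_scoreboard().  Under FACK,
 * every not-yet-SACKed segment below the most forward SACK is treated
 * as lost, and the scoreboard marks fackets_out - reordering head
 * segments (at least one).  Plain integers stand in for the pcount
 * helpers.
 */
#include <assert.h>

/* lost_out = fackets_out - sacked_out (and left_out = fackets_out). */
static unsigned int fack_lost_out(unsigned int fackets_out,
                                  unsigned int sacked_out)
{
        return fackets_out - sacked_out;
}

/* Number of head segments tcp_update_scoreboard() would mark lost. */
static int fack_mark_head_lost(int fackets_out, int reordering)
{
        int lost = fackets_out - reordering;

        return lost <= 0 ? 1 : lost;
}

int main(void)
{
        /* Most forward SACK covers 8 packets, 2 of them SACKed. */
        assert(fack_lost_out(8, 2) == 6);
        /* With the default reordering threshold of 3, mark 5 head packets. */
        assert(fack_mark_head_lost(8, 3) == 5);
        /* If the estimate goes non-positive, still mark at least one. */
        assert(fack_mark_head_lost(2, 3) == 1);
        return 0;
}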
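/*
 * Illustrative sketch, not part of tcp_input.c: the SACK emulation for
 * SACKless (Reno) connections from tcp_add_reno_sack() and
 * tcp_check_reno_reordering().  Each duplicate ACK is counted as one
 * "SACKed" segment, and the counter is capped so that sacked_out plus
 * the assumed holes never exceeds packets_out; hitting that cap is the
 * signal that reordering, not loss, produced the extra dupacks.  Plain
 * unsigned ints replace the pcount helpers and the reordering update
 * is reduced to a flag.
 */
#include <assert.h>

struct reno_state {
        unsigned int packets_out;
        unsigned int sacked_out;
        unsigned int lost_out;
        int reordering_seen;
};

static void reno_check_reordering(struct reno_state *s)
{
        unsigned int holes = s->lost_out ? s->lost_out : 1;

        if (holes > s->packets_out)
                holes = s->packets_out;
        if (s->sacked_out + holes > s->packets_out) {
                s->sacked_out = s->packets_out - holes;
                s->reordering_seen = 1; /* stands in for tcp_update_reordering() */
        }
}

/* Account one duplicate ACK, as tcp_add_reno_sack() does. */
static void reno_add_sack(struct reno_state *s)
{
        s->sacked_out++;
        reno_check_reordering(s);
}

int main(void)
{
        struct reno_state s = { .packets_out = 5, .sacked_out = 0,
                                .lost_out = 0, .reordering_seen = 0 };
        int i;

        /* Up to packets_out - 1 dupacks fit the "one hole" assumption. */
        for (i = 0; i < 4; i++)
                reno_add_sack(&s);
        assert(s.sacked_out == 4 && !s.reordering_seen);

        /* One more dupack than that can only mean reordering. */
        reno_add_sack(&s);
        assert(s.sacked_out == 4 && s.reordering_seen);
        return 0;
}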
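/*
 * Illustrative sketch, not part of tcp_input.c: the "decrease cwnd each
 * second ack" counter trick from tcp_cwnd_down(), with the Westwood
 * limit computation and the packets-in-flight clamp left out so only
 * the halving arithmetic is shown.  State is passed explicitly instead
 * of living in tcp_opt.
 */
#include <assert.h>

struct cwnd_state {
        unsigned int snd_cwnd;
        unsigned int snd_cwnd_cnt;
};

/* One ACK's worth of tcp_cwnd_down(), reduced to the halving counter. */
static void cwnd_down_step(struct cwnd_state *s, unsigned int limit)
{
        int decr = s->snd_cwnd_cnt + 1;

        s->snd_cwnd_cnt = decr & 1;     /* keep the leftover half-step */
        decr >>= 1;                     /* 1 on every second call, else 0 */

        if (decr && s->snd_cwnd > limit)
                s->snd_cwnd -= decr;
}

int main(void)
{
        struct cwnd_state s = { .snd_cwnd = 10, .snd_cwnd_cnt = 0 };
        unsigned int limit = 5;
        int i;

        /* Four ACKs shrink cwnd by two: one packet per pair of ACKs. */
        for (i = 0; i < 4; i++)
                cwnd_down_step(&s, limit);
        assert(s.snd_cwnd == 8);

        /* cwnd never drops below the limit through this path. */
        for (i = 0; i < 100; i++)
                cwnd_down_step(&s, limit);
        assert(s.snd_cwnd == limit);
        return 0;
}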