tcp_input.c
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
static void tcp_try_undo_dsack(struct sock *sk, struct tcp_opt *tp)
{
	if (tp->undo_marker && !tp->undo_retrans) {
		DBGUNDO(sk, tp, "D-SACK");
		tcp_undo_cwr(tp, 1);
		tp->undo_marker = 0;
		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
	}
}

/* Undo during fast recovery after partial ACK. */
static int tcp_try_undo_partial(struct sock *sk, struct tcp_opt *tp,
				int acked)
{
	/* Partial ACK arrived. Force Hoe's retransmit. */
	int failed = IsReno(tp) || tcp_get_pcount(&tp->fackets_out) > tp->reordering;

	if (tcp_may_undo(tp)) {
		/* Plain luck! The hole is filled with a delayed
		 * packet, rather than with a retransmit.
		 */
		if (tcp_get_pcount(&tp->retrans_out) == 0)
			tp->retrans_stamp = 0;

		tcp_update_reordering(tp, tcp_fackets_out(tp) + acked, 1);

		DBGUNDO(sk, tp, "Hoe");
		tcp_undo_cwr(tp, 0);
		NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);

		/* So... Do not make Hoe's retransmit yet.
		 * If the first packet was delayed, the remaining
		 * ones are most probably delayed as well.
		 */
		failed = 0;
	}
	return failed;
}

/* Undo during loss recovery after partial ACK. */
static int tcp_try_undo_loss(struct sock *sk, struct tcp_opt *tp)
{
	if (tcp_may_undo(tp)) {
		struct sk_buff *skb;
		sk_stream_for_retrans_queue(skb, sk) {
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
		}
		DBGUNDO(sk, tp, "partial loss");
		tcp_set_pcount(&tp->lost_out, 0);
		tcp_set_pcount(&tp->left_out,
			       tcp_get_pcount(&tp->sacked_out));
		tcp_undo_cwr(tp, 1);
		NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
		tp->retransmits = 0;
		tp->undo_marker = 0;
		if (!IsReno(tp))
			tcp_set_ca_state(tp, TCP_CA_Open);
		return 1;
	}
	return 0;
}

static __inline__ void tcp_complete_cwr(struct tcp_opt *tp)
{
	if (tcp_westwood_cwnd(tp))
		tp->snd_ssthresh = tp->snd_cwnd;
	else
		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
	tp->snd_cwnd_stamp = tcp_time_stamp;
}

static void tcp_try_to_open(struct sock *sk, struct tcp_opt *tp, int flag)
{
	tcp_set_pcount(&tp->left_out, tcp_get_pcount(&tp->sacked_out));

	if (tcp_get_pcount(&tp->retrans_out) == 0)
		tp->retrans_stamp = 0;

	if (flag & FLAG_ECE)
		tcp_enter_cwr(tp);

	if (tp->ca_state != TCP_CA_CWR) {
		int state = TCP_CA_Open;

		if (tcp_get_pcount(&tp->left_out) ||
		    tcp_get_pcount(&tp->retrans_out) ||
		    tp->undo_marker)
			state = TCP_CA_Disorder;

		if (tp->ca_state != state) {
			tcp_set_ca_state(tp, state);
			tp->high_seq = tp->snd_nxt;
		}
		tcp_moderate_cwnd(tp);
	} else {
		tcp_cwnd_down(tp);
	}
}

/* Process an event which may change packets-in-flight non-trivially.
 * The main goal of this function is to calculate a new estimate for
 * left_out, taking into account both packets sitting in the receiver's
 * buffer and packets lost by the network.
 *
 * Besides that, it performs CWND reduction when packet loss is detected
 * and changes the state of the machine.
 *
 * It does _not_ decide what to send; that is done in
 * tcp_xmit_retransmit_queue().
 */
static void
tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
		      int prior_packets, int flag)
{
	struct tcp_opt *tp = tcp_sk(sk);
	int is_dupack = (tp->snd_una == prior_snd_una && !(flag & FLAG_NOT_DUP));

	/* Some technical things:
	 * 1. Reno does not count dupacks (sacked_out) automatically. */
	if (!tcp_get_pcount(&tp->packets_out))
		tcp_set_pcount(&tp->sacked_out, 0);
	/* 2. SACK counts snd_fack in packets inaccurately. */
	if (tcp_get_pcount(&tp->sacked_out) == 0)
		tcp_set_pcount(&tp->fackets_out, 0);

	/* Now the state machine starts.
	 * A. ECE: prohibit cwnd undoing, the reduction is required. */
	if (flag & FLAG_ECE)
		tp->prior_ssthresh = 0;

	/* B. In all the states check for reneging SACKs. */
	if (tcp_get_pcount(&tp->sacked_out) &&
	    tcp_check_sack_reneging(sk, tp))
		return;

	/* C. Process data loss notification, provided it is valid. */
	if ((flag & FLAG_DATA_LOST) &&
	    before(tp->snd_una, tp->high_seq) &&
	    tp->ca_state != TCP_CA_Open &&
	    tcp_get_pcount(&tp->fackets_out) > tp->reordering) {
		tcp_mark_head_lost(sk, tp,
				   tcp_get_pcount(&tp->fackets_out) - tp->reordering,
				   tp->high_seq);
		NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
	}

	/* D. Synchronize left_out to current state. */
	tcp_sync_left_out(tp);

	/* E. Check state exit conditions. State can be terminated
	 *    when high_seq is ACKed. */
	if (tp->ca_state == TCP_CA_Open) {
		if (!sysctl_tcp_frto)
			BUG_TRAP(tcp_get_pcount(&tp->retrans_out) == 0);
		tp->retrans_stamp = 0;
	} else if (!before(tp->snd_una, tp->high_seq)) {
		switch (tp->ca_state) {
		case TCP_CA_Loss:
			tp->retransmits = 0;
			if (tcp_try_undo_recovery(sk, tp))
				return;
			break;

		case TCP_CA_CWR:
			/* CWR must be held until something *above* high_seq
			 * is ACKed, so that the CWR bit reaches the receiver. */
			if (tp->snd_una != tp->high_seq) {
				tcp_complete_cwr(tp);
				tcp_set_ca_state(tp, TCP_CA_Open);
			}
			break;

		case TCP_CA_Disorder:
			tcp_try_undo_dsack(sk, tp);
			if (!tp->undo_marker ||
			    /* In the SACK case do not go to Open yet, so that
			     * undo can still catch all duplicate ACKs. */
			    IsReno(tp) || tp->snd_una != tp->high_seq) {
				tp->undo_marker = 0;
				tcp_set_ca_state(tp, TCP_CA_Open);
			}
			break;

		case TCP_CA_Recovery:
			if (IsReno(tp))
				tcp_reset_reno_sack(tp);
			if (tcp_try_undo_recovery(sk, tp))
				return;
			tcp_complete_cwr(tp);
			break;
		}
	}

	/* F. Process state. */
	switch (tp->ca_state) {
	case TCP_CA_Recovery:
		if (prior_snd_una == tp->snd_una) {
			if (IsReno(tp) && is_dupack)
				tcp_add_reno_sack(tp);
		} else {
			int acked = prior_packets - tcp_get_pcount(&tp->packets_out);
			if (IsReno(tp))
				tcp_remove_reno_sacks(sk, tp, acked);
			is_dupack = tcp_try_undo_partial(sk, tp, acked);
		}
		break;
	case TCP_CA_Loss:
		if (flag & FLAG_DATA_ACKED)
			tp->retransmits = 0;
		if (!tcp_try_undo_loss(sk, tp)) {
			tcp_moderate_cwnd(tp);
			tcp_xmit_retransmit_queue(sk);
			return;
		}
		if (tp->ca_state != TCP_CA_Open)
			return;
		/* Loss is undone; fall through to processing in Open state. */
	default:
		if (IsReno(tp)) {
			if (tp->snd_una != prior_snd_una)
				tcp_reset_reno_sack(tp);
			if (is_dupack)
				tcp_add_reno_sack(tp);
		}

		if (tp->ca_state == TCP_CA_Disorder)
			tcp_try_undo_dsack(sk, tp);

		if (!tcp_time_to_recover(sk, tp)) {
			tcp_try_to_open(sk, tp, flag);
			return;
		}

		/* Otherwise enter Recovery state */

		if (IsReno(tp))
			NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY);
		else
			NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY);

		tp->high_seq = tp->snd_nxt;
		tp->prior_ssthresh = 0;
		tp->undo_marker = tp->snd_una;
		tp->undo_retrans = tcp_get_pcount(&tp->retrans_out);

		if (tp->ca_state < TCP_CA_CWR) {
			if (!(flag & FLAG_ECE))
				tp->prior_ssthresh = tcp_current_ssthresh(tp);
			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
			TCP_ECN_queue_cwr(tp);
		}

		tp->snd_cwnd_cnt = 0;
		tcp_set_ca_state(tp, TCP_CA_Recovery);
	}

	if (is_dupack || tcp_head_timedout(sk, tp))
		tcp_update_scoreboard(sk, tp);
	tcp_cwnd_down(tp);
	tcp_xmit_retransmit_queue(sk);
}
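/* Editor's illustrative sketch (not part of tcp_input.c): the classic Reno
 * duplicate-ACK heuristic that tcp_fastretrans_alert() above generalizes
 * with the SACK/FACK scoreboard and the undo machinery. The names
 * "mini_reno", "dupacks" (standing in for sacked_out on a SACK-less
 * connection) and DUPACK_THRESHOLD (standing in for tp->reordering,
 * default 3) are assumptions made only for this sketch.
 */
#include <stdbool.h>

#define DUPACK_THRESHOLD 3

struct mini_reno {
	unsigned int dupacks;	/* counted duplicate ACKs (~ sacked_out) */
	bool in_recovery;	/* ~ ca_state == TCP_CA_Recovery */
};

/* Called for every ACK; new_data_acked means snd_una advanced. */
static bool mini_reno_on_ack(struct mini_reno *s, bool new_data_acked)
{
	if (new_data_acked) {
		/* A forward ACK ends the duplicate-ACK run
		 * (classic Reno also leaves fast recovery here). */
		s->dupacks = 0;
		s->in_recovery = false;
		return false;
	}
	if (++s->dupacks >= DUPACK_THRESHOLD)
		s->in_recovery = true;	/* trigger fast retransmit/recovery */
	return s->in_recovery;
}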
/* Read draft-ietf-tcplw-high-performance before mucking
 * with this code. (Supersedes RFC1323)
 */
static void tcp_ack_saw_tstamp(struct tcp_opt *tp, int flag)
{
	__u32 seq_rtt;

	/* RTTM Rule: A TSecr value received in a segment is used to
	 * update the averaged RTT measurement only if the segment
	 * acknowledges some new data, i.e., only if it advances the
	 * left edge of the send window.
	 *
	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
	 *
	 * Changed: reset backoff as soon as we see the first valid sample.
	 * If we do not, we get a strongly overestimated rto. With timestamps,
	 * samples are accepted even from very old segments: e.g., when rtt=1
	 * increases to 8, we retransmit 5 times, and when the delayed answer
	 * arrives after 8 seconds, rto becomes 120 seconds! If at least one
	 * of the segments in the window is lost... Voila. --ANK (010210)
	 */
	seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
	tcp_rtt_estimator(tp, seq_rtt);
	tcp_set_rto(tp);
	tp->backoff = 0;
	tcp_bound_rto(tp);
}

static void tcp_ack_no_tstamp(struct tcp_opt *tp, u32 seq_rtt, int flag)
{
	/* We don't have a timestamp. Can only use
	 * packets that are not retransmitted to determine
	 * rtt estimates. Also, we must not reset the
	 * backoff for rto until we get a non-retransmitted
	 * packet. This allows us to deal with a situation
	 * where the network delay has increased suddenly.
	 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
	 */
	if (flag & FLAG_RETRANS_DATA_ACKED)
		return;

	tcp_rtt_estimator(tp, seq_rtt);
	tcp_set_rto(tp);
	tp->backoff = 0;
	tcp_bound_rto(tp);
}

static __inline__ void
tcp_ack_update_rtt(struct tcp_opt *tp, int flag, s32 seq_rtt)
{
	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
	if (tp->saw_tstamp && tp->rcv_tsecr)
		tcp_ack_saw_tstamp(tp, flag);
	else if (seq_rtt >= 0)
		tcp_ack_no_tstamp(tp, seq_rtt, flag);
}
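/* Editor's illustrative sketch (not part of tcp_input.c): a plain
 * floating-point version of the Jacobson/Karels RTT estimator that
 * tcp_rtt_estimator(), tcp_set_rto() and tcp_bound_rto() implement with
 * scaled fixed-point arithmetic. The struct name and the RTO_MIN/RTO_MAX
 * values are simplified assumptions for this sketch; they only mirror the
 * role of the kernel's TCP_RTO_MIN/TCP_RTO_MAX clamping.
 */
#include <math.h>

struct rtt_est {
	double srtt;	/* smoothed RTT estimate (seconds) */
	double rttvar;	/* mean deviation of RTT (seconds) */
	double rto;	/* retransmission timeout (seconds) */
};

#define RTO_MIN 0.2	/* assumed lower clamp, cf. TCP_RTO_MIN */
#define RTO_MAX 120.0	/* assumed upper clamp, cf. TCP_RTO_MAX */

static void rtt_sample(struct rtt_est *e, double m /* measured RTT, seconds */)
{
	if (e->srtt == 0) {
		/* First measurement seeds the estimator. */
		e->srtt = m;
		e->rttvar = m / 2;
	} else {
		/* rttvar += 1/4 * (|m - srtt| - rttvar), using the old srtt;
		 * srtt   += 1/8 * (m - srtt). */
		e->rttvar += 0.25 * (fabs(m - e->srtt) - e->rttvar);
		e->srtt += 0.125 * (m - e->srtt);
	}
	/* rto = srtt + 4 * rttvar, then clamp: roughly what
	 * tcp_set_rto()/tcp_bound_rto() do. */
	e->rto = e->srtt + 4 * e->rttvar;
	if (e->rto < RTO_MIN)
		e->rto = RTO_MIN;
	if (e->rto > RTO_MAX)
		e->rto = RTO_MAX;
}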
/*
 * Compute congestion window to use.
 *
 * This is from the implementation of BICTCP in
 * Lisong Xu, Khaled Harfoush, and Injong Rhee.
 *  "Binary Increase Congestion Control for Fast, Long Distance
 *  Networks" in IEEE INFOCOM 2004.
 * Available from:
 *  http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
 *
 * Unless BIC is enabled and the congestion window is large,
 * this behaves the same as the original Reno.
 */
static inline __u32 bictcp_cwnd(struct tcp_opt *tp)
{
	/* original Reno behaviour */
	if (!tcp_is_bic(tp))
		return tp->snd_cwnd;

	if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
	    (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
		return tp->bictcp.cnt;

	tp->bictcp.last_cwnd = tp->snd_cwnd;
	tp->bictcp.last_stamp = tcp_time_stamp;

	/* start off normal */
	if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
		tp->bictcp.cnt = tp->snd_cwnd;

	/* binary increase */
	else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
		__u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
			/ BICTCP_B;

		if (dist > BICTCP_MAX_INCREMENT)
			/* linear increase */
			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
		else if (dist <= 1U)
			/* binary search increase */
			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
				/ BICTCP_B;
		else
			/* binary search increase */
			tp->bictcp.cnt = tp->snd_cwnd / dist;
	} else {
		/* slow start and linear increase */
		if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
			/* slow start */
			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
				/ BICTCP_B;
		else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
					+ BICTCP_MAX_INCREMENT*(BICTCP_B-1))
			/* slow start */
			tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
				/ (tp->snd_cwnd - tp->bictcp.last_max_cwnd);
		else
			/* linear increase */
			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
	}
	return tp->bictcp.cnt;
}

/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 */
static __inline__ void reno_cong_avoid(struct tcp_opt *tp)
{
	if (tp->snd_cwnd <= tp->snd_ssthresh) {
		/* In "safe" area, increase. */
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
			tp->snd_cwnd++;
	} else {
		/* In dangerous area, increase slowly.
		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
		 */
		if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
				tp->snd_cwnd++;
			tp->snd_cwnd_cnt = 0;
		} else
			tp->snd_cwnd_cnt++;
	}
	tp->snd_cwnd_stamp = tcp_time_stamp;
}

/* This is based on the congestion detection/avoidance scheme described in
 *    Lawrence S. Brakmo and Larry L. Peterson.
 *    "TCP Vegas: End to end congestion avoidance on a global internet."
 *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
 *    October 1995. Available from:
 *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
 *
 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
 * The main aspects that distinguish this implementation from the
 * Arizona Vegas implementation are:
 *   o We do not change the loss detection or recovery mechanisms of
 *     Linux in any way. Linux already recovers from losses quite well,
 *     using fine-grained timers, NewReno, and FACK.
 *   o To avoid the performance penalty imposed by increasing cwnd
 *     only every-other RTT during slow start, we increase during
 *     every RTT during slow start, just like Reno.
 *   o Largely to allow continuous cwnd growth during slow start,
 *     we use the rate at which ACKs come back as the "actual"
 *     rate, rather than the rate at which data is sent.
 *   o To speed convergence to the right rate, we set the cwnd
 *     to achieve the right ("actual") rate when we exit slow start.
 */
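/* Editor's illustrative sketch (not part of tcp_input.c): the core Vegas
 * congestion-avoidance decision from the Brakmo/Peterson paper cited above,
 * written as a self-contained user-space function. The names
 * vegas_cwnd_update, base_rtt_us and rtt_us, and the VEGAS_ALPHA/VEGAS_BETA
 * values, are assumptions for this sketch only; they approximate the roles
 * of the per-connection Vegas state the Linux implementation keeps.
 */
#include <stdint.h>

#define VEGAS_ALPHA 2	/* assumed lower bound on queued segments */
#define VEGAS_BETA  4	/* assumed upper bound on queued segments */

static uint32_t vegas_cwnd_update(uint32_t cwnd, uint32_t base_rtt_us,
				  uint32_t rtt_us)
{
	uint32_t diff;

	if (rtt_us <= base_rtt_us)
		return cwnd + 1;	/* no measurable queueing delay: grow */

	/* Segments estimated to be sitting in router queues:
	 * (expected - actual) * base_rtt = cwnd * (rtt - base_rtt) / rtt */
	diff = (uint32_t)((uint64_t)cwnd * (rtt_us - base_rtt_us) / rtt_us);

	if (diff < VEGAS_ALPHA)
		cwnd++;			/* too little queueing: probe for more */
	else if (diff > VEGAS_BETA && cwnd > 2)
		cwnd--;			/* too much queueing: back off */
	/* otherwise leave cwnd unchanged: inside the target band */
	return cwnd;
}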