tcp_input.c
/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */
static void tcp_try_undo_dsack(struct sock *sk, struct tcp_opt *tp)
{
	if (tp->undo_marker && !tp->undo_retrans) {
		DBGUNDO(sk, tp, "D-SACK");
		tcp_undo_cwr(tp, 1);
		tp->undo_marker = 0;
		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO);
	}
}

/* Undo during fast recovery after partial ACK. */
static int tcp_try_undo_partial(struct sock *sk, struct tcp_opt *tp,
				int acked)
{
	/* Partial ACK arrived. Force Hoe's retransmit. */
	int failed = IsReno(tp) || tcp_get_pcount(&tp->fackets_out) > tp->reordering;

	if (tcp_may_undo(tp)) {
		/* Plain luck! The hole is filled with a delayed
		 * packet, rather than with a retransmit.
		 */
		if (tcp_get_pcount(&tp->retrans_out) == 0)
			tp->retrans_stamp = 0;

		tcp_update_reordering(tp, tcp_fackets_out(tp) + acked, 1);

		DBGUNDO(sk, tp, "Hoe");
		tcp_undo_cwr(tp, 0);
		NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO);

		/* So... Do not make Hoe's retransmit yet.
		 * If the first packet was delayed, the remaining
		 * ones are most probably delayed as well.
		 */
		failed = 0;
	}
	return failed;
}

/* Undo during loss recovery after partial ACK. */
static int tcp_try_undo_loss(struct sock *sk, struct tcp_opt *tp)
{
	if (tcp_may_undo(tp)) {
		struct sk_buff *skb;
		sk_stream_for_retrans_queue(skb, sk) {
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
		}
		DBGUNDO(sk, tp, "partial loss");
		tcp_set_pcount(&tp->lost_out, 0);
		tcp_set_pcount(&tp->left_out,
			       tcp_get_pcount(&tp->sacked_out));
		tcp_undo_cwr(tp, 1);
		NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO);
		tp->retransmits = 0;
		tp->undo_marker = 0;
		if (!IsReno(tp))
			tcp_set_ca_state(tp, TCP_CA_Open);
		return 1;
	}
	return 0;
}

static __inline__ void tcp_complete_cwr(struct tcp_opt *tp)
{
	if (tcp_westwood_cwnd(tp))
		tp->snd_ssthresh = tp->snd_cwnd;
	else
		tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh);
	tp->snd_cwnd_stamp = tcp_time_stamp;
}

static void tcp_try_to_open(struct sock *sk, struct tcp_opt *tp, int flag)
{
	tcp_set_pcount(&tp->left_out, tcp_get_pcount(&tp->sacked_out));

	if (tcp_get_pcount(&tp->retrans_out) == 0)
		tp->retrans_stamp = 0;

	if (flag & FLAG_ECE)
		tcp_enter_cwr(tp);

	if (tp->ca_state != TCP_CA_CWR) {
		int state = TCP_CA_Open;

		if (tcp_get_pcount(&tp->left_out) ||
		    tcp_get_pcount(&tp->retrans_out) ||
		    tp->undo_marker)
			state = TCP_CA_Disorder;

		if (tp->ca_state != state) {
			tcp_set_ca_state(tp, state);
			tp->high_seq = tp->snd_nxt;
		}
		tcp_moderate_cwnd(tp);
	} else {
		tcp_cwnd_down(tp);
	}
}

/* Process an event which may change packets-in-flight non-trivially.
 * The main goal of this function is to calculate a new estimate for
 * left_out, taking into account both packets sitting in the receiver's
 * buffer and packets lost by the network.
 *
 * Besides that, it performs CWND reduction when packet loss is detected
 * and changes the state of the machine.
 *
 * It does _not_ decide what to send; that is done in
 * tcp_xmit_retransmit_queue().
 */
static void
tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
		      int prior_packets, int flag)
{
	struct tcp_opt *tp = tcp_sk(sk);
	int is_dupack = (tp->snd_una == prior_snd_una && !(flag & FLAG_NOT_DUP));

	/* Some technical things:
	 * 1. Reno does not count dupacks (sacked_out) automatically. */
	if (!tcp_get_pcount(&tp->packets_out))
		tcp_set_pcount(&tp->sacked_out, 0);
	/* 2. SACK counts snd_fack in packets inaccurately. */
	if (tcp_get_pcount(&tp->sacked_out) == 0)
		tcp_set_pcount(&tp->fackets_out, 0);

	/* Now the state machine starts.
	 * A. ECE: prohibit cwnd undoing, the reduction is required. */
	if (flag & FLAG_ECE)
		tp->prior_ssthresh = 0;

	/* B. In all the states check for reneging SACKs. */
	if (tcp_get_pcount(&tp->sacked_out) &&
	    tcp_check_sack_reneging(sk, tp))
		return;

	/* C. Process data loss notification, provided it is valid. */
	if ((flag & FLAG_DATA_LOST) &&
	    before(tp->snd_una, tp->high_seq) &&
	    tp->ca_state != TCP_CA_Open &&
	    tcp_get_pcount(&tp->fackets_out) > tp->reordering) {
		tcp_mark_head_lost(sk, tp,
				   tcp_get_pcount(&tp->fackets_out) - tp->reordering,
				   tp->high_seq);
		NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
	}

	/* D. Synchronize left_out to current state. */
	tcp_sync_left_out(tp);

	/* E. Check state exit conditions. State can be terminated
	 *    when high_seq is ACKed. */
	if (tp->ca_state == TCP_CA_Open) {
		if (!sysctl_tcp_frto)
			BUG_TRAP(tcp_get_pcount(&tp->retrans_out) == 0);
		tp->retrans_stamp = 0;
	} else if (!before(tp->snd_una, tp->high_seq)) {
		switch (tp->ca_state) {
		case TCP_CA_Loss:
			tp->retransmits = 0;
			if (tcp_try_undo_recovery(sk, tp))
				return;
			break;

		case TCP_CA_CWR:
			/* CWR must be held until something *above* high_seq
			 * is ACKed, so that the CWR bit reaches the receiver. */
			if (tp->snd_una != tp->high_seq) {
				tcp_complete_cwr(tp);
				tcp_set_ca_state(tp, TCP_CA_Open);
			}
			break;

		case TCP_CA_Disorder:
			tcp_try_undo_dsack(sk, tp);
			if (!tp->undo_marker ||
			    /* In the SACK case do not go to Open yet, so that
			     * undo can still catch all duplicate ACKs. */
			    IsReno(tp) || tp->snd_una != tp->high_seq) {
				tp->undo_marker = 0;
				tcp_set_ca_state(tp, TCP_CA_Open);
			}
			break;

		case TCP_CA_Recovery:
			if (IsReno(tp))
				tcp_reset_reno_sack(tp);
			if (tcp_try_undo_recovery(sk, tp))
				return;
			tcp_complete_cwr(tp);
			break;
		}
	}

	/* F. Process state. */
	switch (tp->ca_state) {
	case TCP_CA_Recovery:
		if (prior_snd_una == tp->snd_una) {
			if (IsReno(tp) && is_dupack)
				tcp_add_reno_sack(tp);
		} else {
			int acked = prior_packets - tcp_get_pcount(&tp->packets_out);
			if (IsReno(tp))
				tcp_remove_reno_sacks(sk, tp, acked);
			is_dupack = tcp_try_undo_partial(sk, tp, acked);
		}
		break;
	case TCP_CA_Loss:
		if (flag & FLAG_DATA_ACKED)
			tp->retransmits = 0;
		if (!tcp_try_undo_loss(sk, tp)) {
			tcp_moderate_cwnd(tp);
			tcp_xmit_retransmit_queue(sk);
			return;
		}
		if (tp->ca_state != TCP_CA_Open)
			return;
		/* Loss is undone; fall through to processing in Open state. */
	default:
		if (IsReno(tp)) {
			if (tp->snd_una != prior_snd_una)
				tcp_reset_reno_sack(tp);
			if (is_dupack)
				tcp_add_reno_sack(tp);
		}

		if (tp->ca_state == TCP_CA_Disorder)
			tcp_try_undo_dsack(sk, tp);

		if (!tcp_time_to_recover(sk, tp)) {
			tcp_try_to_open(sk, tp, flag);
			return;
		}

		/* Otherwise enter Recovery state */

		if (IsReno(tp))
			NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY);
		else
			NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY);

		tp->high_seq = tp->snd_nxt;
		tp->prior_ssthresh = 0;
		tp->undo_marker = tp->snd_una;
		tp->undo_retrans = tcp_get_pcount(&tp->retrans_out);

		if (tp->ca_state < TCP_CA_CWR) {
			if (!(flag & FLAG_ECE))
				tp->prior_ssthresh = tcp_current_ssthresh(tp);
			tp->snd_ssthresh = tcp_recalc_ssthresh(tp);
			TCP_ECN_queue_cwr(tp);
		}

		tp->snd_cwnd_cnt = 0;
		tcp_set_ca_state(tp, TCP_CA_Recovery);
	}

	if (is_dupack || tcp_head_timedout(sk, tp))
		tcp_update_scoreboard(sk, tp);
	tcp_cwnd_down(tp);
	tcp_xmit_retransmit_queue(sk);
}
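/* Editor's illustrative sketch (not part of tcp_input.c): the classic Reno
 * duplicate-ACK heuristic that tcp_fastretrans_alert() above generalizes
 * with the SACK/FACK scoreboard and the undo machinery. The names
 * "mini_reno", "dupacks" (standing in for sacked_out on a SACK-less
 * connection) and DUPACK_THRESHOLD (standing in for tp->reordering,
 * default 3) are assumptions made only for this sketch.
 */
#include <stdbool.h>

#define DUPACK_THRESHOLD 3

struct mini_reno {
	unsigned int dupacks;	/* counted duplicate ACKs (~ sacked_out) */
	bool in_recovery;	/* ~ ca_state == TCP_CA_Recovery */
};

/* Called for every ACK; new_data_acked means snd_una advanced. */
static bool mini_reno_on_ack(struct mini_reno *s, bool new_data_acked)
{
	if (new_data_acked) {
		/* A forward ACK ends the duplicate-ACK run
		 * (classic Reno also leaves fast recovery here). */
		s->dupacks = 0;
		s->in_recovery = false;
		return false;
	}
	if (++s->dupacks >= DUPACK_THRESHOLD)
		s->in_recovery = true;	/* trigger fast retransmit/recovery */
	return s->in_recovery;
}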
/* Read draft-ietf-tcplw-high-performance before mucking
 * with this code. (Supersedes RFC1323)
 */
static void tcp_ack_saw_tstamp(struct tcp_opt *tp, int flag)
{
	__u32 seq_rtt;

	/* RTTM Rule: A TSecr value received in a segment is used to
	 * update the averaged RTT measurement only if the segment
	 * acknowledges some new data, i.e., only if it advances the
	 * left edge of the send window.
	 *
	 * See draft-ietf-tcplw-high-performance-00, section 3.3.
	 * 1998/04/10 Andrey V. Savochkin <saw@msu.ru>
	 *
	 * Changed: reset backoff as soon as we see the first valid sample.
	 * If we do not, we get a strongly overestimated rto. With timestamps,
	 * samples are accepted even from very old segments: e.g., when rtt=1
	 * increases to 8, we retransmit 5 times, and when the delayed answer
	 * arrives after 8 seconds, rto becomes 120 seconds! If at least one
	 * of the segments in the window is lost... Voila. --ANK (010210)
	 */
	seq_rtt = tcp_time_stamp - tp->rcv_tsecr;
	tcp_rtt_estimator(tp, seq_rtt);
	tcp_set_rto(tp);
	tp->backoff = 0;
	tcp_bound_rto(tp);
}

static void tcp_ack_no_tstamp(struct tcp_opt *tp, u32 seq_rtt, int flag)
{
	/* We don't have a timestamp. Can only use
	 * packets that are not retransmitted to determine
	 * rtt estimates. Also, we must not reset the
	 * backoff for rto until we get a non-retransmitted
	 * packet. This allows us to deal with a situation
	 * where the network delay has increased suddenly.
	 * I.e. Karn's algorithm. (SIGCOMM '87, p5.)
	 */
	if (flag & FLAG_RETRANS_DATA_ACKED)
		return;

	tcp_rtt_estimator(tp, seq_rtt);
	tcp_set_rto(tp);
	tp->backoff = 0;
	tcp_bound_rto(tp);
}

static __inline__ void
tcp_ack_update_rtt(struct tcp_opt *tp, int flag, s32 seq_rtt)
{
	/* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */
	if (tp->saw_tstamp && tp->rcv_tsecr)
		tcp_ack_saw_tstamp(tp, flag);
	else if (seq_rtt >= 0)
		tcp_ack_no_tstamp(tp, seq_rtt, flag);
}
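/* Editor's illustrative sketch (not part of tcp_input.c): a plain
 * floating-point version of the Jacobson/Karels RTT estimator that
 * tcp_rtt_estimator(), tcp_set_rto() and tcp_bound_rto() implement with
 * scaled fixed-point arithmetic. The struct name and the RTO_MIN/RTO_MAX
 * values are simplified assumptions for this sketch; they only mirror the
 * role of the kernel's TCP_RTO_MIN/TCP_RTO_MAX clamping.
 */
#include <math.h>

struct rtt_est {
	double srtt;	/* smoothed RTT estimate (seconds) */
	double rttvar;	/* mean deviation of RTT (seconds) */
	double rto;	/* retransmission timeout (seconds) */
};

#define RTO_MIN 0.2	/* assumed lower clamp, cf. TCP_RTO_MIN */
#define RTO_MAX 120.0	/* assumed upper clamp, cf. TCP_RTO_MAX */

static void rtt_sample(struct rtt_est *e, double m /* measured RTT, seconds */)
{
	if (e->srtt == 0) {
		/* First measurement seeds the estimator. */
		e->srtt = m;
		e->rttvar = m / 2;
	} else {
		/* rttvar += 1/4 * (|m - srtt| - rttvar), using the old srtt;
		 * srtt   += 1/8 * (m - srtt). */
		e->rttvar += 0.25 * (fabs(m - e->srtt) - e->rttvar);
		e->srtt += 0.125 * (m - e->srtt);
	}
	/* rto = srtt + 4 * rttvar, then clamp: roughly what
	 * tcp_set_rto()/tcp_bound_rto() do. */
	e->rto = e->srtt + 4 * e->rttvar;
	if (e->rto < RTO_MIN)
		e->rto = RTO_MIN;
	if (e->rto > RTO_MAX)
		e->rto = RTO_MAX;
}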
/*
 * Compute congestion window to use.
 *
 * This is from the implementation of BICTCP in
 * Lisong Xu, Khaled Harfoush, and Injong Rhee.
 *  "Binary Increase Congestion Control for Fast, Long Distance
 *  Networks" in IEEE INFOCOM 2004.
 * Available from:
 *  http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf
 *
 * Unless BIC is enabled and the congestion window is large,
 * this behaves the same as the original Reno.
 */
static inline __u32 bictcp_cwnd(struct tcp_opt *tp)
{
	/* original Reno behaviour */
	if (!tcp_is_bic(tp))
		return tp->snd_cwnd;

	if (tp->bictcp.last_cwnd == tp->snd_cwnd &&
	    (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5))
		return tp->bictcp.cnt;

	tp->bictcp.last_cwnd = tp->snd_cwnd;
	tp->bictcp.last_stamp = tcp_time_stamp;

	/* start off normal */
	if (tp->snd_cwnd <= sysctl_tcp_bic_low_window)
		tp->bictcp.cnt = tp->snd_cwnd;

	/* binary increase */
	else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) {
		__u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd)
			/ BICTCP_B;

		if (dist > BICTCP_MAX_INCREMENT)
			/* linear increase */
			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
		else if (dist <= 1U)
			/* binary search increase */
			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
				/ BICTCP_B;
		else
			/* binary search increase */
			tp->bictcp.cnt = tp->snd_cwnd / dist;
	} else {
		/* slow start and linear increase */
		if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B)
			/* slow start */
			tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR
				/ BICTCP_B;
		else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd
					+ BICTCP_MAX_INCREMENT*(BICTCP_B-1))
			/* slow start */
			tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1)
				/ (tp->snd_cwnd - tp->bictcp.last_max_cwnd);
		else
			/* linear increase */
			tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT;
	}
	return tp->bictcp.cnt;
}

/* This is Jacobson's slow start and congestion avoidance.
 * SIGCOMM '88, p. 328.
 */
static __inline__ void reno_cong_avoid(struct tcp_opt *tp)
{
	if (tp->snd_cwnd <= tp->snd_ssthresh) {
		/* In "safe" area, increase. */
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
			tp->snd_cwnd++;
	} else {
		/* In dangerous area, increase slowly.
		 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
		 */
		if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) {
			if (tp->snd_cwnd < tp->snd_cwnd_clamp)
				tp->snd_cwnd++;
			tp->snd_cwnd_cnt = 0;
		} else
			tp->snd_cwnd_cnt++;
	}
	tp->snd_cwnd_stamp = tcp_time_stamp;
}

/* This is based on the congestion detection/avoidance scheme described in
 *    Lawrence S. Brakmo and Larry L. Peterson.
 *    "TCP Vegas: End to end congestion avoidance on a global internet."
 *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
 *    October 1995. Available from:
 *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
 *
 * See http://www.cs.arizona.edu/xkernel/ for their implementation.
 * The main aspects that distinguish this implementation from the
 * Arizona Vegas implementation are:
 *   o We do not change the loss detection or recovery mechanisms of
 *     Linux in any way. Linux already recovers from losses quite well,
 *     using fine-grained timers, NewReno, and FACK.
 *   o To avoid the performance penalty imposed by increasing cwnd
 *     only every-other RTT during slow start, we increase during
 *     every RTT during slow start, just like Reno.
 *   o Largely to allow continuous cwnd growth during slow start,
 *     we use the rate at which ACKs come back as the "actual"
 *     rate, rather than the rate at which data is sent.
 *   o To speed convergence to the right rate, we set the cwnd
 *     to achieve the right ("actual") rate when we exit slow start.
 */
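/* Editor's illustrative sketch (not part of tcp_input.c): the core Vegas
 * congestion-avoidance decision from the Brakmo/Peterson paper cited above,
 * written as a self-contained user-space function. The names
 * vegas_cwnd_update, base_rtt_us and rtt_us, and the VEGAS_ALPHA/VEGAS_BETA
 * values, are assumptions for this sketch only; they approximate the roles
 * of the per-connection Vegas state the Linux implementation keeps.
 */
#include <stdint.h>

#define VEGAS_ALPHA 2	/* assumed lower bound on queued segments */
#define VEGAS_BETA  4	/* assumed upper bound on queued segments */

static uint32_t vegas_cwnd_update(uint32_t cwnd, uint32_t base_rtt_us,
				  uint32_t rtt_us)
{
	uint32_t diff;

	if (rtt_us <= base_rtt_us)
		return cwnd + 1;	/* no measurable queueing delay: grow */

	/* Segments estimated to be sitting in router queues:
	 * (expected - actual) * base_rtt = cwnd * (rtt - base_rtt) / rtt */
	diff = (uint32_t)((uint64_t)cwnd * (rtt_us - base_rtt_us) / rtt_us);

	if (diff < VEGAS_ALPHA)
		cwnd++;			/* too little queueing: probe for more */
	else if (diff > VEGAS_BETA && cwnd > 2)
		cwnd--;			/* too much queueing: back off */
	/* otherwise leave cwnd unchanged: inside the target band */
	return cwnd;
}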