📄 tcp_input.c

📁 linux 内核源代码
💻 C
📖 第 1 页 / 共 5 页
字号:
	struct tcp_sock *tp = tcp_sk(sk);	unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);	hint = min(hint, tp->rcv_wnd/2);	hint = min(hint, TCP_MIN_RCVMSS);	hint = max(hint, TCP_MIN_MSS);	inet_csk(sk)->icsk_ack.rcv_mss = hint;}/* Receiver "autotuning" code. * * The algorithm for RTT estimation w/o timestamps is based on * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. * <http://www.lanl.gov/radiant/website/pubs/drs/lacsi2001.ps> * * More detail on this code can be found at * <http://www.psc.edu/~jheffner/senior_thesis.ps>, * though this reference is out of date.  A new paper * is pending. */static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep){	u32 new_sample = tp->rcv_rtt_est.rtt;	long m = sample;	if (m == 0)		m = 1;	if (new_sample != 0) {		/* If we sample in larger samples in the non-timestamp		 * case, we could grossly overestimate the RTT especially		 * with chatty applications or bulk transfer apps which		 * are stalled on filesystem I/O.		 *		 * Also, since we are only going for a minimum in the		 * non-timestamp case, we do not smooth things out		 * else with timestamps disabled convergence takes too		 * long.		 */		if (!win_dep) {			m -= (new_sample >> 3);			new_sample += m;		} else if (m < new_sample)			new_sample = m << 3;	} else {		/* No previous measure. */		new_sample = m << 3;	}	if (tp->rcv_rtt_est.rtt != new_sample)		tp->rcv_rtt_est.rtt = new_sample;}static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp){	if (tp->rcv_rtt_est.time == 0)		goto new_measure;	if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))		return;	tcp_rcv_rtt_update(tp,			   jiffies - tp->rcv_rtt_est.time,			   1);new_measure:	tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;	tp->rcv_rtt_est.time = tcp_time_stamp;}static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, const struct sk_buff *skb){	struct tcp_sock *tp = tcp_sk(sk);	if (tp->rx_opt.rcv_tsecr &&	    (TCP_SKB_CB(skb)->end_seq -	     TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss))		tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0);}/* * This function should be called every time data is copied to user space. * It calculates the appropriate TCP receive buffer space. */void tcp_rcv_space_adjust(struct sock *sk){	struct tcp_sock *tp = tcp_sk(sk);	int time;	int space;	if (tp->rcvq_space.time == 0)		goto new_measure;	time = tcp_time_stamp - tp->rcvq_space.time;	if (time < (tp->rcv_rtt_est.rtt >> 3) ||	    tp->rcv_rtt_est.rtt == 0)		return;	space = 2 * (tp->copied_seq - tp->rcvq_space.seq);	space = max(tp->rcvq_space.space, space);	if (tp->rcvq_space.space != space) {		int rcvmem;		tp->rcvq_space.space = space;		if (sysctl_tcp_moderate_rcvbuf &&		    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {			int new_clamp = space;			/* Receive space grows, normalize in order to			 * take into account packet headers and sk_buff			 * structure overhead.			 */			space /= tp->advmss;			if (!space)				space = 1;			rcvmem = (tp->advmss + MAX_TCP_HEADER +				  16 + sizeof(struct sk_buff));			while (tcp_win_from_space(rcvmem) < tp->advmss)				rcvmem += 128;			space *= rcvmem;			space = min(space, sysctl_tcp_rmem[2]);			if (space > sk->sk_rcvbuf) {				sk->sk_rcvbuf = space;				/* Make the window clamp follow along.  */				tp->window_clamp = new_clamp;			}		}	}new_measure:	tp->rcvq_space.seq = tp->copied_seq;	tp->rcvq_space.time = tcp_time_stamp;}/* There is something which you must keep in mind when you analyze the * behavior of the tp->ato delayed ack timeout interval.  When a * connection starts up, we want to ack as quickly as possible.  The * problem is that "good" TCP's do slow start at the beginning of data * transmission.  The means that until we send the first few ACK's the * sender will sit on his end and only queue most of his data, because * he can only send snd_cwnd unacked packets at any given time.  For * each ACK we send, he increments snd_cwnd and transmits more of his * queue.  -DaveM */static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb){	struct tcp_sock *tp = tcp_sk(sk);	struct inet_connection_sock *icsk = inet_csk(sk);	u32 now;	inet_csk_schedule_ack(sk);	tcp_measure_rcv_mss(sk, skb);	tcp_rcv_rtt_measure(tp);	now = tcp_time_stamp;	if (!icsk->icsk_ack.ato) {		/* The _first_ data packet received, initialize		 * delayed ACK engine.		 */		tcp_incr_quickack(sk);		icsk->icsk_ack.ato = TCP_ATO_MIN;	} else {		int m = now - icsk->icsk_ack.lrcvtime;		if (m <= TCP_ATO_MIN/2) {			/* The fastest case is the first. */			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + TCP_ATO_MIN / 2;		} else if (m < icsk->icsk_ack.ato) {			icsk->icsk_ack.ato = (icsk->icsk_ack.ato >> 1) + m;			if (icsk->icsk_ack.ato > icsk->icsk_rto)				icsk->icsk_ack.ato = icsk->icsk_rto;		} else if (m > icsk->icsk_rto) {			/* Too long gap. Apparently sender failed to			 * restart window, so that we send ACKs quickly.			 */			tcp_incr_quickack(sk);			sk_stream_mem_reclaim(sk);		}	}	icsk->icsk_ack.lrcvtime = now;	TCP_ECN_check_ce(tp, skb);	if (skb->len >= 128)		tcp_grow_window(sk, skb);}static u32 tcp_rto_min(struct sock *sk){	struct dst_entry *dst = __sk_dst_get(sk);	u32 rto_min = TCP_RTO_MIN;	if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))		rto_min = dst->metrics[RTAX_RTO_MIN-1];	return rto_min;}/* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 * piece by Van Jacobson. * NOTE: the next three routines used to be one big routine. * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt){	struct tcp_sock *tp = tcp_sk(sk);	long m = mrtt; /* RTT */	/*	The following amusing code comes from Jacobson's	 *	article in SIGCOMM '88.  Note that rtt and mdev	 *	are scaled versions of rtt and mean deviation.	 *	This is designed to be as fast as possible	 *	m stands for "measurement".	 *	 *	On a 1990 paper the rto value is changed to:	 *	RTO = rtt + 4 * mdev	 *	 * Funny. This algorithm seems to be very broken.	 * These formulae increase RTO, when it should be decreased, increase	 * too slowly, when it should be increased quickly, decrease too quickly	 * etc. I guess in BSD RTO takes ONE value, so that it is absolutely	 * does not matter how to _calculate_ it. Seems, it was trap	 * that VJ failed to avoid. 8)	 */	if (m == 0)		m = 1;	if (tp->srtt != 0) {		m -= (tp->srtt >> 3);	/* m is now error in rtt est */		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */		if (m < 0) {			m = -m;		/* m is now abs(error) */			m -= (tp->mdev >> 2);   /* similar update on mdev */			/* This is similar to one of Eifel findings.			 * Eifel blocks mdev updates when rtt decreases.			 * This solution is a bit different: we use finer gain			 * for mdev in this case (alpha*beta).			 * Like Eifel it also prevents growth of rto,			 * but also it limits too fast rto decreases,			 * happening in pure Eifel.			 */			if (m > 0)				m >>= 3;		} else {			m -= (tp->mdev >> 2);   /* similar update on mdev */		}		tp->mdev += m;	    	/* mdev = 3/4 mdev + 1/4 new */		if (tp->mdev > tp->mdev_max) {			tp->mdev_max = tp->mdev;			if (tp->mdev_max > tp->rttvar)				tp->rttvar = tp->mdev_max;		}		if (after(tp->snd_una, tp->rtt_seq)) {			if (tp->mdev_max < tp->rttvar)				tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2;			tp->rtt_seq = tp->snd_nxt;			tp->mdev_max = tcp_rto_min(sk);		}	} else {		/* no previous measure. */		tp->srtt = m<<3;	/* take the measured time to be rtt */		tp->mdev = m<<1;	/* make sure rto = 3*rtt */		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));		tp->rtt_seq = tp->snd_nxt;	}}/* Calculate rto without backoff.  This is the second half of Van Jacobson's * routine referred to above. */static inline void tcp_set_rto(struct sock *sk){	const struct tcp_sock *tp = tcp_sk(sk);	/* Old crap is replaced with new one. 8)	 *	 * More seriously:	 * 1. If rtt variance happened to be less 50msec, it is hallucination.	 *    It cannot be less due to utterly erratic ACK generation made	 *    at least by solaris and freebsd. "Erratic ACKs" has _nothing_	 *    to do with delayed acks, because at cwnd>2 true delack timeout	 *    is invisible. Actually, Linux-2.4 also generates erratic	 *    ACKs in some circumstances.	 */	inet_csk(sk)->icsk_rto = (tp->srtt >> 3) + tp->rttvar;	/* 2. Fixups made earlier cannot be right.	 *    If we do not estimate RTO correctly without them,	 *    all the algo is pure shit and should be replaced	 *    with correct one. It is exactly, which we pretend to do.	 */}/* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */static inline void tcp_bound_rto(struct sock *sk){	if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)		inet_csk(sk)->icsk_rto = TCP_RTO_MAX;}/* Save metrics learned by this TCP session.   This function is called only, when TCP finishes successfully   i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. */void tcp_update_metrics(struct sock *sk){	struct tcp_sock *tp = tcp_sk(sk);	struct dst_entry *dst = __sk_dst_get(sk);	if (sysctl_tcp_nometrics_save)		return;	dst_confirm(dst);	if (dst && (dst->flags&DST_HOST)) {		const struct inet_connection_sock *icsk = inet_csk(sk);		int m;		if (icsk->icsk_backoff || !tp->srtt) {			/* This session failed to estimate rtt. Why?			 * Probably, no packets returned in time.			 * Reset our results.			 */			if (!(dst_metric_locked(dst, RTAX_RTT)))				dst->metrics[RTAX_RTT-1] = 0;			return;		}		m = dst_metric(dst, RTAX_RTT) - tp->srtt;		/* If newly calculated rtt larger than stored one,		 * store new one. Otherwise, use EWMA. Remember,		 * rtt overestimation is always better than underestimation.		 */		if (!(dst_metric_locked(dst, RTAX_RTT))) {			if (m <= 0)				dst->metrics[RTAX_RTT-1] = tp->srtt;			else				dst->metrics[RTAX_RTT-1] -= (m>>3);		}		if (!(dst_metric_locked(dst, RTAX_RTTVAR))) {			if (m < 0)				m = -m;			/* Scale deviation to rttvar fixed point */			m >>= 1;			if (m < tp->mdev)				m = tp->mdev;			if (m >= dst_metric(dst, RTAX_RTTVAR))				dst->metrics[RTAX_RTTVAR-1] = m;			else				dst->metrics[RTAX_RTTVAR-1] -=					(dst->metrics[RTAX_RTTVAR-1] - m)>>2;		}		if (tp->snd_ssthresh >= 0xFFFF) {			/* Slow start still did not finish. */			if (dst_metric(dst, RTAX_SSTHRESH) &&			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&			    (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH))				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1;			if (!dst_metric_locked(dst, RTAX_CWND) &&			    tp->snd_cwnd > dst_metric(dst, RTAX_CWND))				dst->metrics[RTAX_CWND-1] = tp->snd_cwnd;		} else if (tp->snd_cwnd > tp->snd_ssthresh &&			   icsk->icsk_ca_state == TCP_CA_Open) {			/* Cong. avoidance phase, cwnd is reliable. */			if (!dst_metric_locked(dst, RTAX_SSTHRESH))				dst->metrics[RTAX_SSTHRESH-1] =					max(tp->snd_cwnd >> 1, tp->snd_ssthresh);			if (!dst_metric_locked(dst, RTAX_CWND))				dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1;		} else {			/* Else slow start did not finish, cwnd is non-sense,			   ssthresh may be also invalid.			 */			if (!dst_metric_locked(dst, RTAX_CWND))				dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1;			if (dst->metrics[RTAX_SSTHRESH-1] &&			    !dst_metric_locked(dst, RTAX_SSTHRESH) &&			    tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1])				dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh;		}		if (!dst_metric_locked(dst, RTAX_REORDERING)) {			if (dst->metrics[RTAX_REORDERING-1] < tp->reordering &&			    tp->reordering != sysctl_tcp_reordering)				dst->metrics[RTAX_REORDERING-1] = tp->reordering;		}	}}/* Numbers are taken from RFC3390. * * John Heffner states: * *	The RFC specifies a window of no more than 4380 bytes *	unless 2*MSS > 4380.  Reading the pseudocode in the RFC *	is a bit misleading because they use a clamp at 4380 bytes *	rather than use a multiplier in the relevant range. */__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -