tcp_input.c
        if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
                return;

        tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);

new_measure:
        tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
        tp->rcv_rtt_est.time = tcp_time_stamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct tcp_opt *tp, struct sk_buff *skb)
{
        if (tp->rcv_tsecr &&
            (TCP_SKB_CB(skb)->end_seq -
             TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
                tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_tsecr, 0);
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        int time;
        int space;

        if (tp->rcvq_space.time == 0)
                goto new_measure;

        time = tcp_time_stamp - tp->rcvq_space.time;
        if (time < (tp->rcv_rtt_est.rtt >> 3) ||
            tp->rcv_rtt_est.rtt == 0)
                return;

        space = 2 * (tp->copied_seq - tp->rcvq_space.seq);

        space = max(tp->rcvq_space.space, space);

        if (tp->rcvq_space.space != space) {
                int rcvmem;

                tp->rcvq_space.space = space;

                if (sysctl_tcp_moderate_rcvbuf) {
                        int new_clamp = space;

                        /* Receive space grows, normalize in order to
                         * take into account packet headers and sk_buff
                         * structure overhead.
                         */
                        space /= tp->advmss;
                        if (!space)
                                space = 1;
                        rcvmem = (tp->advmss + MAX_TCP_HEADER +
                                  16 + sizeof(struct sk_buff));
                        while (tcp_win_from_space(rcvmem) < tp->advmss)
                                rcvmem += 128;
                        space *= rcvmem;
                        space = min(space, sysctl_tcp_rmem[2]);
                        if (space > sk->sk_rcvbuf) {
                                sk->sk_rcvbuf = space;

                                /* Make the window clamp follow along. */
                                tp->window_clamp = new_clamp;
                        }
                }
        }

new_measure:
        tp->rcvq_space.seq = tp->copied_seq;
        tp->rcvq_space.time = tcp_time_stamp;
}
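/*
 * Illustrative userspace sketch (not part of tcp_input.c): it mirrors the
 * buffer-sizing arithmetic of tcp_rcv_space_adjust() above.  The constants
 * below are hypothetical stand-ins for the kernel's MAX_TCP_HEADER,
 * sizeof(struct sk_buff) and sysctl_tcp_rmem[2], and the sketch omits the
 * tcp_win_from_space() adjustment of rcvmem.
 */
#include <stdio.h>

#define DEMO_MAX_TCP_HEADER  192            /* stand-in for MAX_TCP_HEADER      */
#define DEMO_SKB_OVERHEAD    192            /* stand-in for sizeof(sk_buff)     */
#define DEMO_RMEM_MAX        (4 << 20)      /* stand-in for sysctl_tcp_rmem[2]  */

/* Given bytes copied to user space during one RTT and the advertised MSS,
 * return the receive buffer size the code above would grow towards. */
static int demo_rcv_space(int copied_per_rtt, int advmss)
{
        int space = 2 * copied_per_rtt;     /* allow one extra RTT of slack  */
        int segs  = space / advmss;         /* normalize to full segments    */
        int rcvmem = advmss + DEMO_MAX_TCP_HEADER + 16 + DEMO_SKB_OVERHEAD;

        if (!segs)
                segs = 1;
        space = segs * rcvmem;              /* charge per-skb overhead       */
        if (space > DEMO_RMEM_MAX)
                space = DEMO_RMEM_MAX;      /* clamp to the rmem maximum     */
        return space;
}

int main(void)
{
        /* 1 Mbyte copied per RTT with a 1460-byte MSS */
        printf("suggested rcvbuf: %d bytes\n", demo_rcv_space(1 << 20, 1460));
        return 0;
}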
/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACKs the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_event_data_recv(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
        u32 now;

        tcp_schedule_ack(tp);

        tcp_measure_rcv_mss(tp, skb);

        tcp_rcv_rtt_measure(tp);

        now = tcp_time_stamp;

        if (!tp->ack.ato) {
                /* The _first_ data packet received, initialize
                 * delayed ACK engine.
                 */
                tcp_incr_quickack(tp);
                tp->ack.ato = TCP_ATO_MIN;
        } else {
                int m = now - tp->ack.lrcvtime;

                if (m <= TCP_ATO_MIN / 2) {
                        /* The fastest case is the first. */
                        tp->ack.ato = (tp->ack.ato >> 1) + TCP_ATO_MIN / 2;
                } else if (m < tp->ack.ato) {
                        tp->ack.ato = (tp->ack.ato >> 1) + m;
                        if (tp->ack.ato > tp->rto)
                                tp->ack.ato = tp->rto;
                } else if (m > tp->rto) {
                        /* Too long a gap.  Apparently the sender failed to
                         * restart the window, so we send ACKs quickly.
                         */
                        tcp_incr_quickack(tp);
                        sk_stream_mem_reclaim(sk);
                }
        }
        tp->ack.lrcvtime = now;

        TCP_ECN_check_ce(tp, skb);

        if (skb->len >= 128)
                tcp_grow_window(sk, tp, skb);
}

/* When starting a new connection, pin down the current choice of
 * congestion algorithm.
 */
void tcp_ca_init(struct tcp_opt *tp)
{
        if (sysctl_tcp_westwood)
                tp->adv_cong = TCP_WESTWOOD;
        else if (sysctl_tcp_bic)
                tp->adv_cong = TCP_BIC;
        else if (sysctl_tcp_vegas_cong_avoid) {
                tp->adv_cong = TCP_VEGAS;
                tp->vegas.baseRTT = 0x7fffffff;
                tcp_vegas_enable(tp);
        }
}

/* Do RTT sampling needed for Vegas.
 * Basically we:
 *   o min-filter RTT samples from within an RTT to get the current
 *     propagation delay + queuing delay (we are min-filtering to try to
 *     avoid the effects of delayed ACKs)
 *   o min-filter RTT samples from a much longer window (forever for now)
 *     to find the propagation delay (baseRTT)
 */
static inline void vegas_rtt_calc(struct tcp_opt *tp, __u32 rtt)
{
        __u32 vrtt = rtt + 1;   /* Never allow zero rtt or baseRTT */

        /* Filter to find propagation delay: */
        if (vrtt < tp->vegas.baseRTT)
                tp->vegas.baseRTT = vrtt;

        /* Find the min RTT during the last RTT to find
         * the current prop. delay + queuing delay:
         */
        tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
        tp->vegas.cntRTT++;
}

/* Called to compute a smoothed rtt estimate.  The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87].  The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
static void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
{
        long m = mrtt;  /* RTT */

        if (tcp_vegas_enabled(tp))
                vegas_rtt_calc(tp, mrtt);

        /* The following amusing code comes from Jacobson's
         * article in SIGCOMM '88.  Note that rtt and mdev
         * are scaled versions of rtt and mean deviation.
         * This is designed to be as fast as possible.
         * m stands for "measurement".
         *
         * In a 1990 paper the rto value is changed to:
         *      RTO = rtt + 4 * mdev
         *
         * Funny.  This algorithm seems to be very broken.
         * These formulae increase RTO when it should be decreased, increase
         * it too slowly when it should be increased quickly, decrease it
         * too quickly, etc.  I guess in BSD RTO takes ONE value, so it
         * absolutely does not matter how it is _calculated_.  It seems to
         * be a trap that VJ failed to avoid. 8)
         */
        if (m == 0)
                m = 1;
        if (tp->srtt != 0) {
                m -= (tp->srtt >> 3);   /* m is now error in rtt est */
                tp->srtt += m;          /* rtt = 7/8 rtt + 1/8 new */
                if (m < 0) {
                        m = -m;         /* m is now abs(error) */
                        m -= (tp->mdev >> 2);   /* similar update on mdev */
                        /* This is similar to one of the Eifel findings.
                         * Eifel blocks mdev updates when rtt decreases.
                         * This solution is a bit different: we use finer gain
                         * for mdev in this case (alpha*beta).
                         * Like Eifel it also prevents growth of rto,
                         * but it also limits too fast rto decreases,
                         * which happen in pure Eifel.
                         */
                        if (m > 0)
                                m >>= 3;
                } else {
                        m -= (tp->mdev >> 2);   /* similar update on mdev */
                }
                tp->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
                if (tp->mdev > tp->mdev_max) {
                        tp->mdev_max = tp->mdev;
                        if (tp->mdev_max > tp->rttvar)
                                tp->rttvar = tp->mdev_max;
                }
                if (after(tp->snd_una, tp->rtt_seq)) {
                        if (tp->mdev_max < tp->rttvar)
                                tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
                        tp->rtt_seq = tp->snd_nxt;
                        tp->mdev_max = TCP_RTO_MIN;
                }
        } else {
                /* no previous measure. */
                tp->srtt = m << 3;      /* take the measured time to be rtt */
                tp->mdev = m << 1;      /* make sure rto = 3*rtt */
                tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
                tp->rtt_seq = tp->snd_nxt;
        }

        tcp_westwood_update_rtt(tp, tp->srtt >> 3);
}
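/*
 * Illustrative userspace sketch (not part of tcp_input.c): the fixed-point
 * smoothing used by tcp_rtt_estimator() above.  srtt is kept scaled by 8
 * (so srtt >> 3 is the smoothed RTT) and mdev scaled by 4, which makes
 * (srtt >> 3) + mdev roughly Jacobson's RTT + 4 * mdev.  The struct and
 * function names are hypothetical, and the sketch omits the kernel's
 * Eifel-style damping of mdev and the rttvar/mdev_max tracking.
 */
#include <stdio.h>

struct demo_rtt {
        long srtt;      /* smoothed RTT, scaled by 8 */
        long mdev;      /* mean deviation, scaled by 4 */
};

static void demo_rtt_sample(struct demo_rtt *e, long m /* measured RTT */)
{
        if (e->srtt == 0) {             /* first sample */
                e->srtt = m << 3;
                e->mdev = m << 1;       /* start with rto = 3 * rtt */
                return;
        }
        m -= e->srtt >> 3;              /* error against current estimate */
        e->srtt += m;                   /* srtt = 7/8 srtt + 1/8 new */
        if (m < 0)
                m = -m;
        m -= e->mdev >> 2;
        e->mdev += m;                   /* mdev = 3/4 mdev + 1/4 |err| */
}

int main(void)
{
        struct demo_rtt e = { 0, 0 };
        long samples[] = { 100, 120, 80, 300 };  /* RTT samples in ms */
        unsigned i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                demo_rtt_sample(&e, samples[i]);
                printf("sample %ld -> srtt %ld ms, rto approx %ld ms\n",
                       samples[i], e.srtt >> 3, (e.srtt >> 3) + e.mdev);
        }
        return 0;
}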
"Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some curcumstances. */ tp->rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exaclty, which we pretend to do. */}/* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */static __inline__ void tcp_bound_rto(struct tcp_opt *tp){ if (tp->rto > TCP_RTO_MAX) tp->rto = TCP_RTO_MAX;}/* Save metrics learned by this TCP session. This function is called only, when TCP finishes successfully i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. */void tcp_update_metrics(struct sock *sk){ struct tcp_opt *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); if (sysctl_tcp_nometrics_save) return; dst_confirm(dst); if (dst && (dst->flags&DST_HOST)) { int m; if (tp->backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. * Reset our results. */ if (!(dst_metric_locked(dst, RTAX_RTT))) dst->metrics[RTAX_RTT-1] = 0; return; } m = dst_metric(dst, RTAX_RTT) - tp->srtt; /* If newly calculated rtt larger than stored one, * store new one. Otherwise, use EWMA. Remember, * rtt overestimation is always better than underestimation. */ if (!(dst_metric_locked(dst, RTAX_RTT))) { if (m <= 0) dst->metrics[RTAX_RTT-1] = tp->srtt; else dst->metrics[RTAX_RTT-1] -= (m>>3); } if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { if (m < 0) m = -m; /* Scale deviation to rttvar fixed point */ m >>= 1; if (m < tp->mdev) m = tp->mdev; if (m >= dst_metric(dst, RTAX_RTTVAR)) dst->metrics[RTAX_RTTVAR-1] = m; else dst->metrics[RTAX_RTTVAR-1] -= (dst->metrics[RTAX_RTTVAR-1] - m)>>2; } if (tp->snd_ssthresh >= 0xFFFF) { /* Slow start still did not finish. */ if (dst_metric(dst, RTAX_SSTHRESH) && !dst_metric_locked(dst, RTAX_SSTHRESH) && (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; if (!dst_metric_locked(dst, RTAX_CWND) && tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; } else if (tp->snd_cwnd > tp->snd_ssthresh && tp->ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!dst_metric_locked(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = max(tp->snd_cwnd >> 1, tp->snd_ssthresh); if (!dst_metric_locked(dst, RTAX_CWND)) dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1; } else { /* Else slow start did not finish, cwnd is non-sense, ssthresh may be also invalid. */ if (!dst_metric_locked(dst, RTAX_CWND)) dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1; if (dst->metrics[RTAX_SSTHRESH-1] && !dst_metric_locked(dst, RTAX_SSTHRESH) && tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1]) dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; } if (!dst_metric_locked(dst, RTAX_REORDERING)) { if (dst->metrics[RTAX_REORDERING-1] < tp->reordering && tp->reordering != sysctl_tcp_reordering) dst->metrics[RTAX_REORDERING-1] = tp->reordering; } }}/* Numbers are taken from RFC2414. */__u32 tcp_init_cwnd(struct tcp_opt *tp, struct dst_entry *dst){ __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { if (tp->mss_cache_std > 1460) cwnd = 2; else cwnd = (tp->mss_cache_std > 1095) ? 
/* Initialize metrics on socket. */
static void tcp_init_metrics(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);

        if (dst == NULL)
                goto reset;

        dst_confirm(dst);

        if (dst_metric_locked(dst, RTAX_CWND))
                tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
        if (dst_metric(dst, RTAX_SSTHRESH)) {
                tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
                if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
                        tp->snd_ssthresh = tp->snd_cwnd_clamp;
        }
        if (dst_metric(dst, RTAX_REORDERING) &&
            tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
                tp->sack_ok &= ~2;
                tp->reordering = dst_metric(dst, RTAX_REORDERING);
        }

        if (dst_metric(dst, RTAX_RTT) == 0)
                goto reset;

        if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
                goto reset;

        /* Initial rtt is determined from SYN,SYN-ACK.
         * The segment is small and rtt may appear much
         * less than the real one.  Use per-dst memory
         * to make it more realistic.
         *
         * A bit of theory.  RTT is the time passed after a "normal" sized
         * packet is sent until it is ACKed.  In normal circumstances sending
         * small packets forces the peer to delay ACKs and the calculation is
         * correct too.  The algorithm is adaptive and, provided we follow
         * specs, it NEVER underestimates RTT.  BUT! If the peer tries some
         * clever tricks, sort of "quick acks", for time long enough to decrease RTT