tcp_output.c
static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std)
{
        if (skb->len <= mss_std) {
                /* Avoid the costly divide in the normal
                 * non-TSO case.
                 */
                skb_shinfo(skb)->tso_segs = 1;
                skb_shinfo(skb)->tso_size = 0;
        } else {
                unsigned int factor;

                factor = skb->len + (mss_std - 1);
                factor /= mss_std;
                skb_shinfo(skb)->tso_segs = factor;
                skb_shinfo(skb)->tso_size = mss_std;
        }
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *buff;
        int nsize = skb->len - len;
        u16 flags;

        if (skb_cloned(skb) &&
            skb_is_nonlinear(skb) &&
            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                return -ENOMEM;

        /* Get a new skb... force flag on. */
        buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
        if (buff == NULL)
                return -ENOMEM; /* We'll just try again later. */
        sk_charge_skb(sk, buff);

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
        TCP_SKB_CB(buff)->flags = flags;
        TCP_SKB_CB(buff)->sacked =
                (TCP_SKB_CB(skb)->sacked &
                 (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
        TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;

        if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
                /* Copy and checksum data tail into the new buffer. */
                buff->csum = csum_partial_copy_nocheck(skb->data + len,
                                                       skb_put(buff, nsize),
                                                       nsize, 0);

                skb_trim(skb, len);

                skb->csum = csum_block_sub(skb->csum, buff->csum, len);
        } else {
                skb->ip_summed = CHECKSUM_HW;
                skb_split(skb, buff, len);
        }

        buff->ip_summed = skb->ip_summed;

        /* Looks stupid, but our code really uses the when field of
         * skbs which it never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

        if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
                tcp_dec_pcount(&tp->lost_out, skb);
                tcp_dec_pcount(&tp->left_out, skb);
        }

        /* Fix up tso_factor for both original and new SKB. */
        tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
        tcp_set_skb_tso_segs(buff, tp->mss_cache_std);

        if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
                tcp_inc_pcount(&tp->lost_out, skb);
                tcp_inc_pcount(&tp->left_out, skb);
        }

        if (TCP_SKB_CB(buff)->sacked & TCPCB_LOST) {
                tcp_inc_pcount(&tp->lost_out, buff);
                tcp_inc_pcount(&tp->left_out, buff);
        }

        /* Link BUFF into the send queue. */
        __skb_append(skb, buff);

        return 0;
}
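/*
 * Illustration only -- not part of tcp_output.c.  A minimal standalone
 * user-space sketch (compile it separately) of the arithmetic used
 * above: tso_segs is the ceiling of len/mss, computed without a divide
 * in the common single-segment case, and a split at `len` bytes leaves
 * the first skb with [seq, seq+len) and gives the new one
 * [seq+len, end_seq).  All numeric values are made up.
 */
#include <stdio.h>

static unsigned int tso_segs_for(unsigned int len, unsigned int mss)
{
        if (len <= mss)
                return 1;                       /* cheap path, no divide */
        return (len + mss - 1) / mss;           /* ceil(len / mss)       */
}

int main(void)
{
        unsigned int seq = 1000, end_seq = 5000, mss = 1460;
        unsigned int len = end_seq - seq;       /* 4000 bytes of payload */
        unsigned int cut = mss;                 /* split at one full MSS */

        printf("tso_segs(%u, %u) = %u\n", len, mss, tso_segs_for(len, mss));
        printf("first : seq=%u end_seq=%u\n", seq, seq + cut);
        printf("second: seq=%u end_seq=%u\n", seq + cut, end_seq);
        return 0;
}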
/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
 * eventually).  The difference is that pulled data is not copied, but
 * immediately discarded.
 */
static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
{
        int i, k, eat;

        eat = len;
        k = 0;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                if (skb_shinfo(skb)->frags[i].size <= eat) {
                        put_page(skb_shinfo(skb)->frags[i].page);
                        eat -= skb_shinfo(skb)->frags[i].size;
                } else {
                        skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
                        if (eat) {
                                skb_shinfo(skb)->frags[k].page_offset += eat;
                                skb_shinfo(skb)->frags[k].size -= eat;
                                eat = 0;
                        }
                        k++;
                }
        }
        skb_shinfo(skb)->nr_frags = k;

        skb->tail = skb->data;
        skb->data_len -= len;
        skb->len = skb->data_len;
        return skb->tail;
}

int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
        struct tcp_opt *tp = tcp_sk(sk);

        if (skb_cloned(skb) &&
            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                return -ENOMEM;

        if (len <= skb_headlen(skb)) {
                __skb_pull(skb, len);
        } else {
                if (__pskb_trim_head(skb, len - skb_headlen(skb)) == NULL)
                        return -ENOMEM;
        }

        TCP_SKB_CB(skb)->seq += len;
        skb->ip_summed = CHECKSUM_HW;

        skb->truesize -= len;
        sk->sk_queue_shrunk = 1;
        sk->sk_wmem_queued -= len;
        sk->sk_forward_alloc += len;

        /* Any change of skb->len requires recalculation of tso
         * factor and mss.
         */
        tcp_set_skb_tso_segs(skb, tp->mss_cache_std);

        return 0;
}

/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT count
   for TCP options, but includes only bare TCP header.

   tp->mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.                       --ANK (980731)
 */
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        int mss_now;

        if (dst && dst->ops->get_mss)
                pmtu = dst->ops->get_mss(dst, pmtu);

        /* Calculate base mss without TCP options:
           It is MMS_S - sizeof(tcphdr) of rfc1122
         */
        mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

        /* Clamp it (mss_clamp does not include tcp options) */
        if (mss_now > tp->mss_clamp)
                mss_now = tp->mss_clamp;

        /* Now subtract optional transport overhead */
        mss_now -= tp->ext_header_len + tp->ext2_header_len;

        /* Then reserve room for full set of TCP options and 8 bytes of data */
        if (mss_now < 48)
                mss_now = 48;

        /* Now subtract TCP options size, not including SACKs */
        mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

        /* Bound mss with half of window */
        if (tp->max_window && mss_now > (tp->max_window >> 1))
                mss_now = max((tp->max_window >> 1), 68U - tp->tcp_header_len);

        /* And store cached results */
        tp->pmtu_cookie = pmtu;
        tp->mss_cache = tp->mss_cache_std = mss_now;

        return mss_now;
}
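/*
 * Illustration only -- not part of tcp_output.c.  A standalone
 * user-space sketch of the arithmetic in tcp_sync_mss() above for an
 * assumed IPv4 connection with no extension headers and TCP timestamps
 * enabled; the 68-byte floor used by the real code when clamping to
 * half the window is omitted for brevity.  With a 1500-byte PMTU:
 * 1500 - 20 (IP) - 20 (TCP) = 1460, clamped to the negotiated 1460,
 * minus 12 bytes of timestamp options = 1448 bytes of payload.
 */
#include <stdio.h>

int main(void)
{
        int pmtu           = 1500;      /* path MTU                       */
        int net_header_len = 20;        /* IPv4 header, no IP options     */
        int tcp_header_len = 20 + 12;   /* base TCP header + timestamps   */
        int mss_clamp      = 1460;      /* negotiated at connection setup */
        int max_window     = 65535;     /* largest window seen from peer  */
        int mss_now;

        mss_now = pmtu - net_header_len - 20;   /* MMS_S - sizeof(tcphdr) */
        if (mss_now > mss_clamp)
                mss_now = mss_clamp;
        if (mss_now < 48)                       /* room for options + 8B  */
                mss_now = 48;
        mss_now -= tcp_header_len - 20;         /* non-SACK TCP options   */
        if (max_window && mss_now > (max_window >> 1))
                mss_now = max_window >> 1;      /* bound by half window   */

        printf("effective sending mss = %d\n", mss_now);       /* 1448 */
        return 0;
}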
/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 *
 * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
 * cannot be large. However, taking into account rare use of URG, this
 * is not a big flaw.
 */
unsigned int tcp_current_mss(struct sock *sk, int large)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        unsigned int do_large, mss_now;

        mss_now = tp->mss_cache_std;
        if (dst) {
                u32 mtu = dst_pmtu(dst);
                if (mtu != tp->pmtu_cookie ||
                    tp->ext2_header_len != dst->header_len)
                        mss_now = tcp_sync_mss(sk, mtu);
        }

        do_large = (large &&
                    (sk->sk_route_caps & NETIF_F_TSO) &&
                    !tp->urg_mode);

        if (do_large) {
                unsigned int large_mss, factor, limit;

                large_mss = 65535 - tp->af_specific->net_header_len -
                        tp->ext_header_len - tp->ext2_header_len -
                        tp->tcp_header_len;

                if (tp->max_window && large_mss > (tp->max_window >> 1))
                        large_mss = max((tp->max_window >> 1),
                                        68U - tp->tcp_header_len);

                factor = large_mss / mss_now;

                /* Always keep large mss multiple of real mss, but
                 * do not exceed 1/tso_win_divisor of the congestion window
                 * so we can keep the ACK clock ticking and minimize
                 * bursting.
                 */
                limit = tp->snd_cwnd;
                if (sysctl_tcp_tso_win_divisor)
                        limit /= sysctl_tcp_tso_win_divisor;
                limit = max(1U, limit);
                if (factor > limit)
                        factor = limit;

                tp->mss_cache = mss_now * factor;

                mss_now = tp->mss_cache;
        }

        if (tp->eff_sacks)
                mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
                            (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
        return mss_now;
}

/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
int tcp_write_xmit(struct sock *sk, int nonagle)
{
        struct tcp_opt *tp = tcp_sk(sk);
        unsigned int mss_now;

        /* If we are closed, the bytes will have to remain here.
         * In time closedown will finish, we empty the write queue and all
         * will be happy.
         */
        if (sk->sk_state != TCP_CLOSE) {
                struct sk_buff *skb;
                int sent_pkts = 0;

                /* Account for SACKS, we may need to fragment due to this.
                 * It is just like the real MSS changing on us midstream.
                 * We also handle things correctly when the user adds some
                 * IP options mid-stream.  Silly to do, but cover it.
                 */
                mss_now = tcp_current_mss(sk, 1);

                while ((skb = sk->sk_send_head) &&
                       tcp_snd_test(tp, skb, mss_now,
                                    tcp_skb_is_last(sk, skb) ? nonagle :
                                                               TCP_NAGLE_PUSH)) {
                        if (skb->len > mss_now) {
                                if (tcp_fragment(sk, skb, mss_now))
                                        break;
                        }

                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
                                break;

                        /* Advance the send_head.  This one is sent out.
                         * This call will increment packets_out.
                         */
                        update_send_head(sk, tp, skb);

                        tcp_minshall_update(tp, mss_now, skb);
                        sent_pkts = 1;
                }

                if (sent_pkts) {
                        tcp_cwnd_validate(sk, tp);
                        return 0;
                }

                return !tcp_get_pcount(&tp->packets_out) && sk->sk_send_head;
        }
        return 0;
}
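/*
 * Illustration only -- not part of tcp_output.c.  A standalone
 * user-space sketch of how tcp_current_mss() above sizes a TSO super
 * segment: take as many real-MSS chunks as fit under 64 KB, then cap
 * the chunk count at snd_cwnd / tso_win_divisor so a single burst
 * never consumes more than that fraction of the congestion window.
 * The cwnd and divisor values below are made-up examples, not defaults.
 */
#include <stdio.h>

int main(void)
{
        unsigned int mss_now         = 1448;            /* per-segment payload    */
        unsigned int large_mss       = 65535 - 20 - 32; /* 64K - IP - TCP w/ TS   */
        unsigned int snd_cwnd        = 10;              /* cwnd, in packets       */
        unsigned int tso_win_divisor = 3;               /* example sysctl setting */
        unsigned int factor, limit;

        factor = large_mss / mss_now;           /* 45 full-size chunks fit */
        limit = snd_cwnd;
        if (tso_win_divisor)
                limit /= tso_win_divisor;       /* at most cwnd/divisor    */
        if (limit < 1)
                limit = 1;
        if (factor > limit)
                factor = limit;

        printf("TSO segment: %u x %u = %u bytes\n",
               factor, mss_now, factor * mss_now);      /* 3 x 1448 = 4344 */
        return 0;
}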
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria.  The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *      If the free space is less than the 1/4 of the maximum
 *      space available and the free space is less than 1/2 mss,
 *      then set the window to 0.
 *      [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *      Otherwise, just prevent the window from shrinking
 *      and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        /* MSS for the peer's data.  Previous versions used mss_clamp
         * here.  I don't know if the value based on our guesses
         * of peer's MSS is better for the performance.  It's more correct
         * but may be worse for the performance because of rcv_mss
         * fluctuations.  --SAW  1998/11/1
         */
        int mss = tp->ack.rcv_mss;
        int free_space = tcp_space(sk);
        int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
        int window;

        if (mss > full_space)
                mss = full_space;

        if (free_space < full_space/2) {
                tp->ack.quick = 0;

                if (tcp_memory_pressure)
                        tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);

                if (free_space < mss)
                        return 0;
        }

        if (free_space > tp->rcv_ssthresh)
                free_space = tp->rcv_ssthresh;

        /* Don't do rounding if we are using window scaling, since the
         * scaled window will not line up with the MSS boundary anyway.
         */
        window = tp->rcv_wnd;
        if (tp->rcv_wscale) {
                window = free_space;