📄 tcp_output.c

📁 linux 内核源代码
💻 C
📖 第 1 页 / 共 5 页
字号:
	TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;	TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;	TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;	TCP_SKB_CB(nskb)->sacked = 0;	nskb->csum = 0;	nskb->ip_summed = skb->ip_summed;	len = 0;	while (len < probe_size) {		next = tcp_write_queue_next(sk, skb);		copy = min_t(int, skb->len, probe_size - len);		if (nskb->ip_summed)			skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);		else			nskb->csum = skb_copy_and_csum_bits(skb, 0,					 skb_put(nskb, copy), copy, nskb->csum);		if (skb->len <= copy) {			/* We've eaten all the data from this skb.			 * Throw it away. */			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;			tcp_unlink_write_queue(skb, sk);			sk_stream_free_skb(sk, skb);		} else {			TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &						   ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);			if (!skb_shinfo(skb)->nr_frags) {				skb_pull(skb, copy);				if (skb->ip_summed != CHECKSUM_PARTIAL)					skb->csum = csum_partial(skb->data, skb->len, 0);			} else {				__pskb_trim_head(skb, copy);				tcp_set_skb_tso_segs(sk, skb, mss_now);			}			TCP_SKB_CB(skb)->seq += copy;		}		len += copy;		skb = next;	}	tcp_init_tso_segs(sk, nskb, nskb->len);	/* We're ready to send.  If this fails, the probe will	 * be resegmented into mss-sized pieces by tcp_write_xmit(). */	TCP_SKB_CB(nskb)->when = tcp_time_stamp;	if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {		/* Decrement cwnd here because we are sending		* effectively two packets. */		tp->snd_cwnd--;		update_send_head(sk, nskb);		icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);		tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;		tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;		return 1;	}	return -1;}/* This routine writes packets to the network.  It advances the * send_head.  This happens as incoming acks open up the remote * window for us. 
* * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle){	struct tcp_sock *tp = tcp_sk(sk);	struct sk_buff *skb;	unsigned int tso_segs, sent_pkts;	int cwnd_quota;	int result;	/* If we are closed, the bytes will have to remain here.	 * In time closedown will finish, we empty the write queue and all	 * will be happy.	 */	if (unlikely(sk->sk_state == TCP_CLOSE))		return 0;	sent_pkts = 0;	/* Do MTU probing. */	if ((result = tcp_mtu_probe(sk)) == 0) {		return 0;	} else if (result > 0) {		sent_pkts = 1;	}	while ((skb = tcp_send_head(sk))) {		unsigned int limit;		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);		BUG_ON(!tso_segs);		cwnd_quota = tcp_cwnd_test(tp, skb);		if (!cwnd_quota)			break;		if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now)))			break;		if (tso_segs == 1) {			if (unlikely(!tcp_nagle_test(tp, skb, mss_now,						     (tcp_skb_is_last(sk, skb) ?						      nonagle : TCP_NAGLE_PUSH))))				break;		} else {			if (tcp_tso_should_defer(sk, skb))				break;		}		limit = mss_now;		if (tso_segs > 1) {			limit = tcp_window_allows(tp, skb,						  mss_now, cwnd_quota);			if (skb->len < limit) {				unsigned int trim = skb->len % mss_now;				if (trim)					limit = skb->len - trim;			}		}		if (skb->len > limit &&		    unlikely(tso_fragment(sk, skb, limit, mss_now)))			break;		TCP_SKB_CB(skb)->when = tcp_time_stamp;		if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))			break;		/* Advance the send_head.  This one is sent out.		 * This call will increment packets_out.		 */		update_send_head(sk, skb);		tcp_minshall_update(tp, mss_now, skb);		sent_pkts++;	}	if (likely(sent_pkts)) {		tcp_cwnd_validate(sk);		return 0;	}	return !tp->packets_out && tcp_send_head(sk);}/* Push out any pending frames which were held back due to * TCP_CORK or attempt at coalescing tiny packets. * The socket must be locked by the caller. 
*/void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,			       int nonagle){	struct sk_buff *skb = tcp_send_head(sk);	if (skb) {		if (tcp_write_xmit(sk, cur_mss, nonagle))			tcp_check_probe_timer(sk);	}}/* Send _single_ skb sitting at the send head. This function requires * true push pending frames to setup probe timer etc. */void tcp_push_one(struct sock *sk, unsigned int mss_now){	struct tcp_sock *tp = tcp_sk(sk);	struct sk_buff *skb = tcp_send_head(sk);	unsigned int tso_segs, cwnd_quota;	BUG_ON(!skb || skb->len < mss_now);	tso_segs = tcp_init_tso_segs(sk, skb, mss_now);	cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH);	if (likely(cwnd_quota)) {		unsigned int limit;		BUG_ON(!tso_segs);		limit = mss_now;		if (tso_segs > 1) {			limit = tcp_window_allows(tp, skb,						  mss_now, cwnd_quota);			if (skb->len < limit) {				unsigned int trim = skb->len % mss_now;				if (trim)					limit = skb->len - trim;			}		}		if (skb->len > limit &&		    unlikely(tso_fragment(sk, skb, limit, mss_now)))			return;		/* Send it out now. */		TCP_SKB_CB(skb)->when = tcp_time_stamp;		if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {			update_send_head(sk, skb);			tcp_cwnd_validate(sk);			return;		}	}}/* This function returns the amount that we can raise the * usable window based on the following constraints * * 1. The window can never be shrunk once it is offered (RFC 793) * 2. We limit memory per socket * * RFC 1122: * "the suggested [SWS] avoidance algorithm for the receiver is to keep *  RECV.NEXT + RCV.WIN fixed until: *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" * * i.e. don't raise the right edge of the window until you can raise * it at least MSS bytes. * * Unfortunately, the recommended algorithm breaks header prediction, * since header prediction assumes th->window stays fixed. * * Strictly speaking, keeping th->window fixed violates the receiver * side SWS prevention criteria. 
The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
/* Returns the window to offer: 0 when free space is below both half of the
 * full space and one mss; otherwise a value derived from the free space
 * (capped at rcv_ssthresh) as described step by step below.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	/* MSS for the peer's data.  Previous versions used mss_clamp
	 * here.  I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance.  It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations.  --SAW  1998/11/1
	 */
	int mss = icsk->icsk_ack.rcv_mss;
	int free_space = tcp_space(sk);
	/* Full space is bounded by the window clamp. */
	int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
	int window;

	/* Never work with an mss larger than the whole available space. */
	if (mss > full_space)
		mss = full_space;

	/* Less than half of the full space is free. */
	if (free_space < full_space/2) {
		/* Reset the quick-ack counter. */
		icsk->icsk_ack.quick = 0;

		/* Under memory pressure, clamp rcv_ssthresh to at most
		 * four times the advertised mss.
		 */
		if (tcp_memory_pressure)
			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);

		/* Not even one mss of room: offer a zero window. */
		if (free_space < mss)
			return 0;
	}

	/* The offered space never exceeds rcv_ssthresh. */
	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Don't do rounding if we are using window scaling, since the
	 * scaled window will not line up with the MSS boundary anyway.
	 */
	window = tp->rcv_wnd;
	if (tp->rx_opt.rcv_wscale) {
		window = free_space;

		/* Advertise enough space so that it won't get scaled away.
		 * Important case: prevent zero window announcement if
		 * 1<<rcv_wscale > mss.
		 *
		 * If the low rcv_wscale bits are not all zero, round the
		 * window up to the next multiple of 1<<rcv_wscale.
		 */
		if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window)
			window = (((window >> tp->rx_opt.rcv_wscale) + 1)
				  << tp->rx_opt.rcv_wscale);
	} else {
		/* Get the largest window that is a nice multiple of mss.
		 * Window clamp already applied above.
		 * If our current window offering is within 1 mss of the
		 * free space we just keep it. This prevents the divide
		 * and multiply from happening most of the time.
		 * We also don't do any window rounding when the free space
		 * is too small.
		 */
		if (window <= free_space - mss || window > free_space)
			window = (free_space/mss)*mss;
		/* mss was clamped to full_space above: take the free space
		 * as is once it exceeds the current window by more than
		 * half of the full space.
		 */
		else if (mss == full_space &&
			 free_space > window + full_space/2)
			window = free_space;
	}

	return window;
}

/* Attempt to collapse two adjacent SKB's during retransmission.
 *
 * @sk:      socket owning the write queue
 * @skb:     skb that absorbs the data of the skb following it in the
 *           write queue
 * @mss_now: upper bound on the combined payload length
 *
 * Does nothing unless both skbs are unshared, the next skb is neither
 * SACKed nor beyond the send window, and its data fits in skb's tailroom
 * without exceeding mss_now.  On success the next skb is unlinked and
 * freed and the per-socket packet counters are adjusted for it.
 */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);

	/* The first test we must make is that neither of these two
	 * SKB's are still referenced by someone else.
	 */
	if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
		/* Lengths are captured before skb grows; skb_size is later
		 * used as the checksum offset of the appended data.
		 */
		int skb_size = skb->len, next_skb_size = next_skb->len;
		u16 flags = TCP_SKB_CB(skb)->flags;

		/* Also punt if next skb has been SACK'd. */
		if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
			return;

		/* Next skb is out of window. */
		if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
			return;

		/* Punt if not enough space exists in the first SKB for
		 * the data in the second, or the total combined payload
		 * would exceed the MSS.
		 */
		if ((next_skb_size > skb_tailroom(skb)) ||
		    ((skb_size + next_skb_size) > mss_now))
			return;

		/* Both skbs must count as exactly one packet each. */
		BUG_ON(tcp_skb_pcount(skb) != 1 ||
		       tcp_skb_pcount(next_skb) != 1);

		/* Warn and refuse when SACK is in use, something has been
		 * SACKed and next_skb starts at tp->highest_sack.
		 */
		if (WARN_ON(tcp_is_sack(tp) && tp->sacked_out &&
		    (TCP_SKB_CB(next_skb)->seq == tp->highest_sack)))
			return;

		/* Ok.	We will be able to collapse the packet. */
		tcp_unlink_write_queue(next_skb, sk);

		/* Append next_skb's linear data to the tail of skb. */
		skb_copy_from_linear_data(next_skb,
					  skb_put(skb, next_skb_size),
					  next_skb_size);

		/* CHECKSUM_PARTIAL on next_skb carries over to the merged
		 * skb; otherwise fold next_skb's checksum in at the offset
		 * where its data was appended.
		 */
		if (next_skb->ip_summed == CHECKSUM_PARTIAL)
			skb->ip_summed = CHECKSUM_PARTIAL;

		if (skb->ip_summed != CHECKSUM_PARTIAL)
			skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);

		/* Update sequence range on original skb. */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		/* Merge over control information. */
		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
		TCP_SKB_CB(skb)->flags = flags;

		/* All done, get rid of second SKB and account for it so
		 * packet counting does not break.
		 *
		 * Only the EVER_RETRANS and AT_TAIL bits are inherited from
		 * next_skb; its RETRANS and LOST state is removed from the
		 * per-socket counters instead.
		 */
		TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
			tp->retrans_out -= tcp_skb_pcount(next_skb);
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST)
			tp->lost_out -= tcp_skb_pcount(next_skb);
		/* Reno case is special. Sigh... */
		if (tcp_is_reno(tp) && tp->sacked_out)
			tcp_dec_pcount_approx(&tp->sacked_out, next_skb);

		tcp_adjust_fackets_out(tp, next_skb, tcp_skb_pcount(next_skb));
		tp->packets_out -= tcp_skb_pcount(next_skb);

		/* changed transmit queue under us so clear hints */
		tcp_clear_retrans_hints_partial(tp);
		/* manually tune sacktag skb hint: if it pointed at the skb
		 * being freed, repoint it at the merged skb and lower the
		 * count hint by that skb's packet count.
		 */
		if (tp->fastpath_skb_hint == next_skb) {
			tp->fastpath_skb_hint = skb;
			tp->fastpath_cnt_hint -= tcp_skb_pcount(skb);
		}

		sk_stream_free_skb(sk, next_skb);
	}
}

/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer. This is used for path mtu discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	unsigned int mss = tcp_current_mss(sk, 0);
	int lost = 0;

	tcp_for_write_queue(skb, sk) {
		if (skb == tcp_send_head(sk))
			break;
		if (skb->len > mss &&
		    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
			if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out -= tcp_skb_pcount(skb);
			}
			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				tp->lost_out += tcp_skb_pcount(skb);
				lost = 1;
			}
		}
	}

	tcp_clear_all_retrans_hints(tp);

	if (!lost)
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -