📄 tcp_input.c

📁 linux 内核源代码
💻 C
📖 第 1 页 / 共 5 页
字号:
{	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);	if (!cwnd) {		if (tp->mss_cache > 1460)			cwnd = 2;		else			cwnd = (tp->mss_cache > 1095) ? 3 : 4;	}	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);}/* Set slow start threshold and cwnd not falling to slow start */void tcp_enter_cwr(struct sock *sk, const int set_ssthresh){	struct tcp_sock *tp = tcp_sk(sk);	const struct inet_connection_sock *icsk = inet_csk(sk);	tp->prior_ssthresh = 0;	tp->bytes_acked = 0;	if (icsk->icsk_ca_state < TCP_CA_CWR) {		tp->undo_marker = 0;		if (set_ssthresh)			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);		tp->snd_cwnd = min(tp->snd_cwnd,				   tcp_packets_in_flight(tp) + 1U);		tp->snd_cwnd_cnt = 0;		tp->high_seq = tp->snd_nxt;		tp->snd_cwnd_stamp = tcp_time_stamp;		TCP_ECN_queue_cwr(tp);		tcp_set_ca_state(sk, TCP_CA_CWR);	}}/* * Packet counting of FACK is based on in-order assumptions, therefore TCP * disables it when reordering is detected */static void tcp_disable_fack(struct tcp_sock *tp){	tp->rx_opt.sack_ok &= ~2;}/* Take a notice that peer is sending D-SACKs */static void tcp_dsack_seen(struct tcp_sock *tp){	tp->rx_opt.sack_ok |= 4;}/* Initialize metrics on socket. */static void tcp_init_metrics(struct sock *sk){	struct tcp_sock *tp = tcp_sk(sk);	struct dst_entry *dst = __sk_dst_get(sk);	if (dst == NULL)		goto reset;	dst_confirm(dst);	if (dst_metric_locked(dst, RTAX_CWND))		tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);	if (dst_metric(dst, RTAX_SSTHRESH)) {		tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)			tp->snd_ssthresh = tp->snd_cwnd_clamp;	}	if (dst_metric(dst, RTAX_REORDERING) &&	    tp->reordering != dst_metric(dst, RTAX_REORDERING)) {		tcp_disable_fack(tp);		tp->reordering = dst_metric(dst, RTAX_REORDERING);	}	if (dst_metric(dst, RTAX_RTT) == 0)		goto reset;	if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))		goto reset;	/* Initial rtt is determined from SYN,SYN-ACK.	 
* The segment is small and rtt may appear much	 * less than real one. Use per-dst memory	 * to make it more realistic.	 *	 * A bit of theory. RTT is time passed after "normal" sized packet	 * is sent until it is ACKed. In normal circumstances sending small	 * packets force peer to delay ACKs and calculation is correct too.	 * The algorithm is adaptive and, provided we follow specs, it	 * NEVER underestimate RTT. BUT! If peer tries to make some clever	 * tricks sort of "quick acks" for time long enough to decrease RTT	 * to low value, and then abruptly stops to do it and starts to delay	 * ACKs, wait for troubles.	 */	if (dst_metric(dst, RTAX_RTT) > tp->srtt) {		tp->srtt = dst_metric(dst, RTAX_RTT);		tp->rtt_seq = tp->snd_nxt;	}	if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) {		tp->mdev = dst_metric(dst, RTAX_RTTVAR);		tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));	}	tcp_set_rto(sk);	tcp_bound_rto(sk);	if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp)		goto reset;	tp->snd_cwnd = tcp_init_cwnd(tp, dst);	tp->snd_cwnd_stamp = tcp_time_stamp;	return;reset:	/* Play conservative. If timestamps are not	 * supported, TCP will fail to recalculate correct	 * rtt, if initial rto is too small. FORGET ALL AND RESET!	 */	if (!tp->rx_opt.saw_tstamp && tp->srtt) {		tp->srtt = 0;		tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT;		inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;	}}static void tcp_update_reordering(struct sock *sk, const int metric,				  const int ts){	struct tcp_sock *tp = tcp_sk(sk);	if (metric > tp->reordering) {		tp->reordering = min(TCP_MAX_REORDERING, metric);		/* This exciting event is worth to be remembered. 
8) */		if (ts)			NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER);		else if (tcp_is_reno(tp))			NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER);		else if (tcp_is_fack(tp))			NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER);		else			NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);#if FASTRETRANS_DEBUG > 1		printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n",		       tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state,		       tp->reordering,		       tp->fackets_out,		       tp->sacked_out,		       tp->undo_marker ? tp->undo_retrans : 0);#endif		tcp_disable_fack(tp);	}}/* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). * Packets in queue with these bits set are counted in variables * sacked_out, retrans_out and lost_out, correspondingly. * * Valid combinations are: * Tag  InFlight	Description * 0	1		- orig segment is in flight. * S	0		- nothing flies, orig reached receiver. * L	0		- nothing flies, orig lost by net. * R	2		- both orig and retransmit are in flight. * L|R	1		- orig is lost, retransmit is in flight. * S|R  1		- orig reached receiver, retrans is still in flight. * (L|S|R is logically valid, it could occur when L|R is sacked, *  but it is equivalent to plain S and code short-curcuits it to S. *  L|S is logically invalid, it would mean -1 packet in flight 8)) * * These 6 states form finite state machine, controlled by the following events: * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) * 3. Loss detection event of one of three flavors: *	A. Scoreboard estimator decided the packet is lost. *	   A'. Reno "three dupacks" marks head of queue lost. *	   A''. Its FACK modfication, head until snd.fack is lost. *	B. SACK arrives sacking data transmitted after never retransmitted *	   hole was sent out. *	C. SACK arrives sacking SND.NXT at the moment, when the *	   segment was retransmitted. * 4. 
D-SACK added new rule: D-SACK changes any tag to S.
 *
 * It is pleasant to note that the state diagram turns out to be commutative,
 * so that we are allowed not to be bothered by order of our actions,
 * when multiple events arrive simultaneously. (see the function below).
 *
 * Reordering detection.
 * --------------------
 * Reordering metric is the maximal distance by which a packet can be
 * displaced in the packet stream. With SACKs we can estimate it:
 *
 * 1. SACK fills old hole and the corresponding segment was not
 *    ever retransmitted -> reordering. Alas, we cannot use it
 *    when segment was retransmitted.
 * 2. The last flaw is solved with D-SACK. D-SACK arrives
 *    for retransmitted and already SACKed segment -> reordering.
 * Both of these heuristics are not used in Loss state, when we cannot
 * account for retransmits accurately.
 *
 * SACK block validation.
 * ----------------------
 *
 * SACK block range validation checks that the received SACK block fits
 * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
 * Note that SND.UNA is not included in the range though being valid because
 * it means that the receiver is rather inconsistent with itself reporting
 * SACK reneging when it should advance SND.UNA. Such a SACK block is
 * perfectly valid, however, in light of RFC2018 which explicitly states
 * that "SACK block MUST reflect the newest segment.  Even if the newest
 * segment is going to be discarded ...", not that it looks very clever
 * in case of head skb. Due to potential receiver driven attacks, we
 * choose to avoid immediate execution of a walk in write queue due to
 * reneging and defer head skb's loss recovery to standard loss recovery
 * procedure that will eventually trigger (nothing forbids us doing this).
 *
 * Implements also blockage to start_seq wrap-around. Problem lies in the
 * fact that though start_seq (s) is before end_seq (i.e., not reversed),
 * there's no guarantee that it will be before snd_nxt (n). The problem
 * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
 * wrap (s_w):
 *
 *         <- outs wnd ->                          <- wrapzone ->
 *         u     e      n                         u_w   e_w  s n_w
 *         |     |      |                          |     |   |  |
 * |<------------+------+----- TCP seqno space --------------+---------->|
 * ...-- <2^31 ->|                                           |<--------...
 * ...---- >2^31 ------>|                                    |<--------...
 *
 * Current code wouldn't be vulnerable but it's better still to discard such
 * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
 * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
 * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
 * equal to the ideal case (infinite seqno space without wrap-caused issues).
 *
 * With D-SACK the lower bound is extended to cover sequence space below
 * SND.UNA down to undo_marker, which is the last point of interest. Yet
 * again, D-SACK block must not go across snd_una (for the same reason as
 * for the normal SACK blocks, explained above). But there all simplicity
 * ends, TCP might receive valid D-SACKs below that. As long as they reside
 * fully below undo_marker they do not affect behavior in any way and can
 * therefore be safely ignored. In rare cases (which are more or less
 * theoretical ones), the D-SACK will nicely cross that boundary due to skb
 * fragmentation and packet reordering past skb's retransmission. To consider
 * them correctly, the acceptable range must be extended even more though
 * the exact amount is rather hard to quantify. However, tp->max_window can
 * be used as an exaggerated estimate.
*/

/* Validate one received SACK (or D-SACK) block against the send state.
 *
 * @tp:        connection state; snd_una, snd_nxt, undo_marker and
 *             max_window are read, nothing is written
 * @is_dsack:  non-zero when the block is to be judged as a D-SACK, which
 *             extends the acceptable range below snd_una
 * @start_seq: left edge of the block
 * @end_seq:   right edge of the block
 *
 * Returns 1 when the block is acceptable and 0 when it must be ignored.
 */
static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
				  u32 start_seq, u32 end_seq)
{
	/* Too far in future, or reversed (interpretation is ambiguous) */
	if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
		return 0;

	/* Nasty start_seq wrap-around check (see comments above) */
	if (!before(start_seq, tp->snd_nxt))
		return 0;

	/* In outstanding window? ...This is valid exit for D-SACKs too.
	 * start_seq == snd_una is non-sensical (see comments above)
	 */
	if (after(start_seq, tp->snd_una))
		return 1;

	/* From here on start_seq <= snd_una, which only a D-SACK may do,
	 * and only while an undo episode is being tracked.
	 */
	if (!is_dsack || !tp->undo_marker)
		return 0;

	/* ...Then it's D-SACK, and must reside below snd_una completely.
	 * Reject the block when its right edge goes across snd_una; the
	 * test used to be inverted and threw away exactly the blocks that
	 * do lie below snd_una.
	 */
	if (after(end_seq, tp->snd_una))
		return 0;

	if (!before(start_seq, tp->undo_marker))
		return 1;

	/* Too old */
	if (!after(end_seq, tp->undo_marker))
		return 0;

	/* Undo_marker boundary crossing (overestimates a lot). Known already:
	 *   start_seq < undo_marker and end_seq >= undo_marker.
	 */
	return !before(start_seq, end_seq - tp->max_window);
}

/* Check for lost retransmit. This superb idea is borrowed from "ratehalving".
 * Event "C". Later note: FACK people cheated me again 8), we have to account
 * for reordering! Ugly, but should help.
 *
 * Search retransmitted skbs from write_queue that were sent when snd_nxt was
 * less than what is now known to be received by the other end (derived from
 * SACK blocks by the caller). Also calculate the lowest snd_nxt among the
 * remaining retransmitted skbs to avoid some costly processing per ACKs.
*/

/* Walk the write queue and re-mark retransmissions that appear to be lost.
 *
 * @sk:            socket whose write queue is scanned
 * @received_upto: highest sequence known to have reached the peer, as
 *                 supplied by the caller
 *
 * Returns FLAG_DATA_SACKED when at least one skb was newly tagged
 * TCPCB_LOST, otherwise 0. tp->lost_retrans_low is refreshed when
 * retransmissions remain outstanding afterwards.
 */
static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	int flag = 0;
	int cnt = 0;
	u32 new_low_seq = tp->snd_nxt;

	tcp_for_write_queue(skb, sk) {
		u32 ack_seq = TCP_SKB_CB(skb)->ack_seq;

		if (skb == tcp_send_head(sk))
			break;
		/* Every outstanding retransmission has been visited. */
		if (cnt == tp->retrans_out)
			break;
		if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
			continue;

		if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
			continue;

		/* The peer has received data beyond this skb's ack_seq.
		 * Without FACK, additionally require a margin of
		 * reordering * mss_cache before declaring the loss.
		 */
		if (after(received_upto, ack_seq) &&
		    (tcp_is_fack(tp) ||
		     !before(received_upto,
			     ack_seq + tp->reordering * tp->mss_cache))) {
			TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
			tp->retrans_out -= tcp_skb_pcount(skb);

			/* clear lost hint */
			tp->retransmit_skb_hint = NULL;

			if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) {
				tp->lost_out += tcp_skb_pcount(skb);
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				flag |= FLAG_DATA_SACKED;
				NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT);
			}
		} else {
			if (before(ack_seq, new_low_seq))
				new_low_seq = ack_seq;
			cnt += tcp_skb_pcount(skb);
		}
	}

	if (tp->retrans_out)
		tp->lost_retrans_low = new_low_seq;

	return flag;
}

/* Decide whether the first SACK block of an ACK is a D-SACK.
 *
 * @tp:            connection state
 * @ack_skb:       the incoming ACK; its ack_seq is compared with block 0
 * @sp:            SACK blocks in wire format (possibly unaligned)
 * @num_sacks:     number of blocks in @sp
 * @prior_snd_una: snd_una as supplied by the caller
 *
 * Block 0 counts as a D-SACK when it starts before the ACK's ack_seq, or
 * when a second block exists and fully contains block 0. Returns 1 for a
 * D-SACK and 0 otherwise.
 */
static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
			   struct tcp_sack_block_wire *sp, int num_sacks,
			   u32 prior_snd_una)
{
	u32 start_seq_0 = ntohl(get_unaligned(&sp[0].start_seq));
	u32 end_seq_0 = ntohl(get_unaligned(&sp[0].end_seq));
	int dup_sack = 0;

	if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
		dup_sack = 1;
		tcp_dsack_seen(tp);
		NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
	} else if (num_sacks > 1) {
		u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq));
		u32 start_seq_1 = ntohl(get_unaligned(&sp[1].start_seq));

		if (!after(end_seq_0, end_seq_1) &&
		    !before(start_seq_0, start_seq_1)) {
			dup_sack = 1;
			tcp_dsack_seen(tp);
			NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV);
		}
	}

	/* D-SACK for already forgotten data... Do dumb counting.
	 *
	 * Only count while an undo episode is open (undo_marker != 0) and
	 * retransmissions are still unaccounted for; otherwise the
	 * comparison against undo_marker is meaningless and the counter
	 * would be driven below zero.
	 */
	if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 &&
	    !after(end_seq_0, prior_snd_una) &&
	    after(end_seq_0, tp->undo_marker))
		tp->undo_retrans--;

	return dup_sack;
}

/* Check if skb is fully within the SACK block. In presence of GSO skbs,
 * the incoming SACK may not exactly match but we can find smaller MSS
 * aligned portion of it that matches. Therefore we might need to fragment
 * which may fail and creates some hassle (caller must handle error case
 * returns).
 *
 * Returns 1 when the skb lies inside [start_seq, end_seq), 0 when it does
 * not, and the negative value from tcp_fragment() when splitting fails.
 */
static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
				 u32 start_seq, u32 end_seq)
{
	int in_sack, err;
	unsigned int pkt_len;

	in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) &&
		  !before(end_seq, TCP_SKB_CB(skb)->end_seq);

	/* A multi-segment skb that only partly overlaps the block is split
	 * at the block edge that falls inside it.
	 */
	if (tcp_skb_pcount(skb) > 1 && !in_sack &&
	    after(TCP_SKB_CB(skb)->end_seq, start_seq)) {

		in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq);

		if (!in_sack)
			pkt_len = start_seq - TCP_SKB_CB(skb)->seq;
		else
			pkt_len = end_seq - TCP_SKB_CB(skb)->seq;
		err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size);
		if (err < 0)
			return err;
	}

	return in_sack;
}

static int
tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -