📄 tcp_input.c
字号:
{ __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { if (tp->mss_cache > 1460) cwnd = 2; else cwnd = (tp->mss_cache > 1095) ? 3 : 4; } return min_t(__u32, cwnd, tp->snd_cwnd_clamp);}/* Set slow start threshold and cwnd not falling to slow start */void tcp_enter_cwr(struct sock *sk, const int set_ssthresh){ struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); tp->prior_ssthresh = 0; tp->bytes_acked = 0; if (icsk->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; if (set_ssthresh) tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1U); tp->snd_cwnd_cnt = 0; tp->high_seq = tp->snd_nxt; tp->snd_cwnd_stamp = tcp_time_stamp; TCP_ECN_queue_cwr(tp); tcp_set_ca_state(sk, TCP_CA_CWR); }}/* * Packet counting of FACK is based on in-order assumptions, therefore TCP * disables it when reordering is detected */static void tcp_disable_fack(struct tcp_sock *tp){ tp->rx_opt.sack_ok &= ~2;}/* Take a notice that peer is sending D-SACKs */static void tcp_dsack_seen(struct tcp_sock *tp){ tp->rx_opt.sack_ok |= 4;}/* Initialize metrics on socket. */static void tcp_init_metrics(struct sock *sk){ struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); if (dst == NULL) goto reset; dst_confirm(dst); if (dst_metric_locked(dst, RTAX_CWND)) tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); if (dst_metric(dst, RTAX_SSTHRESH)) { tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); if (tp->snd_ssthresh > tp->snd_cwnd_clamp) tp->snd_ssthresh = tp->snd_cwnd_clamp; } if (dst_metric(dst, RTAX_REORDERING) && tp->reordering != dst_metric(dst, RTAX_REORDERING)) { tcp_disable_fack(tp); tp->reordering = dst_metric(dst, RTAX_REORDERING); } if (dst_metric(dst, RTAX_RTT) == 0) goto reset; if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) goto reset; /* Initial rtt is determined from SYN,SYN-ACK. * The segment is small and rtt may appear much * less than real one. Use per-dst memory * to make it more realistic. * * A bit of theory. RTT is time passed after "normal" sized packet * is sent until it is ACKed. In normal circumstances sending small * packets force peer to delay ACKs and calculation is correct too. * The algorithm is adaptive and, provided we follow specs, it * NEVER underestimate RTT. BUT! If peer tries to make some clever * tricks sort of "quick acks" for time long enough to decrease RTT * to low value, and then abruptly stops to do it and starts to delay * ACKs, wait for troubles. */ if (dst_metric(dst, RTAX_RTT) > tp->srtt) { tp->srtt = dst_metric(dst, RTAX_RTT); tp->rtt_seq = tp->snd_nxt; } if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) { tp->mdev = dst_metric(dst, RTAX_RTTVAR); tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); } tcp_set_rto(sk); tcp_bound_rto(sk); if (inet_csk(sk)->icsk_rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) goto reset; tp->snd_cwnd = tcp_init_cwnd(tp, dst); tp->snd_cwnd_stamp = tcp_time_stamp; return;reset: /* Play conservative. If timestamps are not * supported, TCP will fail to recalculate correct * rtt, if initial rto is too small. FORGET ALL AND RESET! */ if (!tp->rx_opt.saw_tstamp && tp->srtt) { tp->srtt = 0; tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT; }}static void tcp_update_reordering(struct sock *sk, const int metric, const int ts){ struct tcp_sock *tp = tcp_sk(sk); if (metric > tp->reordering) { tp->reordering = min(TCP_MAX_REORDERING, metric); /* This exciting event is worth to be remembered. 8) */ if (ts) NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); else if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); else if (tcp_is_fack(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER);#if FASTRETRANS_DEBUG > 1 printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", tp->rx_opt.sack_ok, inet_csk(sk)->icsk_ca_state, tp->reordering, tp->fackets_out, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0);#endif tcp_disable_fack(tp); }}/* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). * Packets in queue with these bits set are counted in variables * sacked_out, retrans_out and lost_out, correspondingly. * * Valid combinations are: * Tag InFlight Description * 0 1 - orig segment is in flight. * S 0 - nothing flies, orig reached receiver. * L 0 - nothing flies, orig lost by net. * R 2 - both orig and retransmit are in flight. * L|R 1 - orig is lost, retransmit is in flight. * S|R 1 - orig reached receiver, retrans is still in flight. * (L|S|R is logically valid, it could occur when L|R is sacked, * but it is equivalent to plain S and code short-curcuits it to S. * L|S is logically invalid, it would mean -1 packet in flight 8)) * * These 6 states form finite state machine, controlled by the following events: * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) * 3. Loss detection event of one of three flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. * A''. Its FACK modfication, head until snd.fack is lost. * B. SACK arrives sacking data transmitted after never retransmitted * hole was sent out. * C. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. * * It is pleasant to note, that state diagram turns out to be commutative, * so that we are allowed not to be bothered by order of our actions, * when multiple events arrive simultaneously. (see the function below). * * Reordering detection. * -------------------- * Reordering metric is maximal distance, which a packet can be displaced * in packet stream. With SACKs we can estimate it: * * 1. SACK fills old hole and the corresponding segment was not * ever retransmitted -> reordering. Alas, we cannot use it * when segment was retransmitted. * 2. The last flaw is solved with D-SACK. D-SACK arrives * for retransmitted and already SACKed segment -> reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. * * SACK block validation. * ---------------------- * * SACK block range validation checks that the received SACK block fits to * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. * Note that SND.UNA is not included to the range though being valid because * it means that the receiver is rather inconsistent with itself reporting * SACK reneging when it should advance SND.UNA. Such SACK block this is * perfectly valid, however, in light of RFC2018 which explicitly states * that "SACK block MUST reflect the newest segment. Even if the newest * segment is going to be discarded ...", not that it looks very clever * in case of head skb. Due to potentional receiver driven attacks, we * choose to avoid immediate execution of a walk in write queue due to * reneging and defer head skb's loss recovery to standard loss recovery * procedure that will eventually trigger (nothing forbids us doing this). * * Implements also blockage to start_seq wrap-around. Problem lies in the * fact that though start_seq (s) is before end_seq (i.e., not reversed), * there's no guarantee that it will be before snd_nxt (n). The problem * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt * wrap (s_w): * * <- outs wnd -> <- wrapzone -> * u e n u_w e_w s n_w * | | | | | | | * |<------------+------+----- TCP seqno space --------------+---------->| * ...-- <2^31 ->| |<--------... * ...---- >2^31 ------>| |<--------... * * Current code wouldn't be vulnerable but it's better still to discard such * crazy SACK blocks. Doing this check for start_seq alone closes somewhat * similar case (end_seq after snd_nxt wrap) as earlier reversed check in * snd_nxt wrap -> snd_una region will then become "well defined", i.e., * equal to the ideal case (infinite seqno space without wrap caused issues). * * With D-SACK the lower bound is extended to cover sequence space below * SND.UNA down to undo_marker, which is the last point of interest. Yet * again, D-SACK block must not to go across snd_una (for the same reason as * for the normal SACK blocks, explained above). But there all simplicity * ends, TCP might receive valid D-SACKs below that. As long as they reside * fully below undo_marker they do not affect behavior in anyway and can * therefore be safely ignored. In rare cases (which are more or less * theoretical ones), the D-SACK will nicely cross that boundary due to skb * fragmentation and packet reordering past skb's retransmission. To consider * them correctly, the acceptable range must be extended even more though * the exact amount is rather hard to quantify. However, tp->max_window can * be used as an exaggerated estimate. */static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, u32 start_seq, u32 end_seq){ /* Too far in future, or reversed (interpretation is ambiguous) */ if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) return 0; /* Nasty start_seq wrap-around check (see comments above) */ if (!before(start_seq, tp->snd_nxt)) return 0; /* In outstanding window? ...This is valid exit for D-SACKs too. * start_seq == snd_una is non-sensical (see comments above) */ if (after(start_seq, tp->snd_una)) return 1; if (!is_dsack || !tp->undo_marker) return 0; /* ...Then it's D-SACK, and must reside below snd_una completely */ if (!after(end_seq, tp->snd_una)) return 0; if (!before(start_seq, tp->undo_marker)) return 1; /* Too old */ if (!after(end_seq, tp->undo_marker)) return 0; /* Undo_marker boundary crossing (overestimates a lot). Known already: * start_seq < undo_marker and end_seq >= undo_marker. */ return !before(start_seq, end_seq - tp->max_window);}/* Check for lost retransmit. This superb idea is borrowed from "ratehalving". * Event "C". Later note: FACK people cheated me again 8), we have to account * for reordering! Ugly, but should help. * * Search retransmitted skbs from write_queue that were sent when snd_nxt was * less than what is now known to be received by the other end (derived from * SACK blocks by the caller). Also calculate the lowest snd_nxt among the * remaining retransmitted skbs to avoid some costly processing per ACKs. */static int tcp_mark_lost_retrans(struct sock *sk, u32 received_upto){ struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int flag = 0; int cnt = 0; u32 new_low_seq = tp->snd_nxt; tcp_for_write_queue(skb, sk) { u32 ack_seq = TCP_SKB_CB(skb)->ack_seq; if (skb == tcp_send_head(sk)) break; if (cnt == tp->retrans_out) break; if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) continue; if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) continue; if (after(received_upto, ack_seq) && (tcp_is_fack(tp) || !before(received_upto, ack_seq + tp->reordering * tp->mss_cache))) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); /* clear lost hint */ tp->retransmit_skb_hint = NULL; if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_LOST|TCPCB_SACKED_ACKED))) { tp->lost_out += tcp_skb_pcount(skb); TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; flag |= FLAG_DATA_SACKED; NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); } } else { if (before(ack_seq, new_low_seq)) new_low_seq = ack_seq; cnt += tcp_skb_pcount(skb); } } if (tp->retrans_out) tp->lost_retrans_low = new_low_seq; return flag;}static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una){ u32 start_seq_0 = ntohl(get_unaligned(&sp[0].start_seq)); u32 end_seq_0 = ntohl(get_unaligned(&sp[0].end_seq)); int dup_sack = 0; if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { dup_sack = 1; tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); } else if (num_sacks > 1) { u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq)); u32 start_seq_1 = ntohl(get_unaligned(&sp[1].start_seq)); if (!after(end_seq_0, end_seq_1) && !before(start_seq_0, start_seq_1)) { dup_sack = 1; tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); } } /* D-SACK for already forgotten data... Do dumb counting. */ if (dup_sack && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; return dup_sack;}/* Check if skb is fully within the SACK block. In presence of GSO skbs, * the incoming SACK may not exactly match but we can find smaller MSS * aligned portion of it that matches. Therefore we might need to fragment * which may fail and creates some hassle (caller must handle error case * returns). */static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, u32 start_seq, u32 end_seq){ int in_sack, err; unsigned int pkt_len; in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && !before(end_seq, TCP_SKB_CB(skb)->end_seq); if (tcp_skb_pcount(skb) > 1 && !in_sack && after(TCP_SKB_CB(skb)->end_seq, start_seq)) { in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq); if (!in_sack) pkt_len = start_seq - TCP_SKB_CB(skb)->seq; else pkt_len = end_seq - TCP_SKB_CB(skb)->seq; err = tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size); if (err < 0) return err; } return in_sack;}static inttcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una){ const struct inet_connection_sock *icsk = inet_csk(sk);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -