tcp_input.c
        if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
                return;

        tcp_rcv_rtt_update(tp, jiffies - tp->rcv_rtt_est.time, 1);

new_measure:
        tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd;
        tp->rcv_rtt_est.time = tcp_time_stamp;
}

static inline void tcp_rcv_rtt_measure_ts(struct tcp_opt *tp, struct sk_buff *skb)
{
        if (tp->rcv_tsecr &&
            (TCP_SKB_CB(skb)->end_seq -
             TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss))
                tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_tsecr, 0);
}

/*
 * This function should be called every time data is copied to user space.
 * It calculates the appropriate TCP receive buffer space.
 */
void tcp_rcv_space_adjust(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        int time;
        int space;

        if (tp->rcvq_space.time == 0)
                goto new_measure;

        time = tcp_time_stamp - tp->rcvq_space.time;
        if (time < (tp->rcv_rtt_est.rtt >> 3) ||
            tp->rcv_rtt_est.rtt == 0)
                return;

        space = 2 * (tp->copied_seq - tp->rcvq_space.seq);

        space = max(tp->rcvq_space.space, space);

        if (tp->rcvq_space.space != space) {
                int rcvmem;

                tp->rcvq_space.space = space;

                if (sysctl_tcp_moderate_rcvbuf) {
                        int new_clamp = space;

                        /* Receive space grows, normalize in order to
                         * take into account packet headers and sk_buff
                         * structure overhead.
                         */
                        space /= tp->advmss;
                        if (!space)
                                space = 1;
                        rcvmem = (tp->advmss + MAX_TCP_HEADER +
                                  16 + sizeof(struct sk_buff));
                        while (tcp_win_from_space(rcvmem) < tp->advmss)
                                rcvmem += 128;
                        space *= rcvmem;
                        space = min(space, sysctl_tcp_rmem[2]);
                        if (space > sk->sk_rcvbuf) {
                                sk->sk_rcvbuf = space;

                                /* Make the window clamp follow along. */
                                tp->window_clamp = new_clamp;
                        }
                }
        }

new_measure:
        tp->rcvq_space.seq = tp->copied_seq;
        tp->rcvq_space.time = tcp_time_stamp;
}
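/*
 * Illustrative userspace sketch (not part of tcp_input.c): it mirrors the
 * buffer-sizing arithmetic of tcp_rcv_space_adjust() above.  The constants
 * below are hypothetical stand-ins for the kernel's MAX_TCP_HEADER,
 * sizeof(struct sk_buff) and sysctl_tcp_rmem[2], and the sketch omits the
 * tcp_win_from_space() adjustment of rcvmem.
 */
#include <stdio.h>

#define DEMO_MAX_TCP_HEADER  192            /* stand-in for MAX_TCP_HEADER      */
#define DEMO_SKB_OVERHEAD    192            /* stand-in for sizeof(sk_buff)     */
#define DEMO_RMEM_MAX        (4 << 20)      /* stand-in for sysctl_tcp_rmem[2]  */

/* Given bytes copied to user space during one RTT and the advertised MSS,
 * return the receive buffer size the code above would grow towards. */
static int demo_rcv_space(int copied_per_rtt, int advmss)
{
        int space = 2 * copied_per_rtt;     /* allow one extra RTT of slack  */
        int segs  = space / advmss;         /* normalize to full segments    */
        int rcvmem = advmss + DEMO_MAX_TCP_HEADER + 16 + DEMO_SKB_OVERHEAD;

        if (!segs)
                segs = 1;
        space = segs * rcvmem;              /* charge per-skb overhead       */
        if (space > DEMO_RMEM_MAX)
                space = DEMO_RMEM_MAX;      /* clamp to the rmem maximum     */
        return space;
}

int main(void)
{
        /* 1 Mbyte copied per RTT with a 1460-byte MSS */
        printf("suggested rcvbuf: %d bytes\n", demo_rcv_space(1 << 20, 1460));
        return 0;
}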
/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACKs the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_event_data_recv(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
        u32 now;

        tcp_schedule_ack(tp);

        tcp_measure_rcv_mss(tp, skb);

        tcp_rcv_rtt_measure(tp);

        now = tcp_time_stamp;

        if (!tp->ack.ato) {
                /* The _first_ data packet received, initialize
                 * delayed ACK engine.
                 */
                tcp_incr_quickack(tp);
                tp->ack.ato = TCP_ATO_MIN;
        } else {
                int m = now - tp->ack.lrcvtime;

                if (m <= TCP_ATO_MIN / 2) {
                        /* The fastest case is the first. */
                        tp->ack.ato = (tp->ack.ato >> 1) + TCP_ATO_MIN / 2;
                } else if (m < tp->ack.ato) {
                        tp->ack.ato = (tp->ack.ato >> 1) + m;
                        if (tp->ack.ato > tp->rto)
                                tp->ack.ato = tp->rto;
                } else if (m > tp->rto) {
                        /* Too long a gap.  Apparently the sender failed to
                         * restart the window, so we send ACKs quickly.
                         */
                        tcp_incr_quickack(tp);
                        sk_stream_mem_reclaim(sk);
                }
        }
        tp->ack.lrcvtime = now;

        TCP_ECN_check_ce(tp, skb);

        if (skb->len >= 128)
                tcp_grow_window(sk, tp, skb);
}

/* When starting a new connection, pin down the current choice of
 * congestion algorithm.
 */
void tcp_ca_init(struct tcp_opt *tp)
{
        if (sysctl_tcp_westwood)
                tp->adv_cong = TCP_WESTWOOD;
        else if (sysctl_tcp_bic)
                tp->adv_cong = TCP_BIC;
        else if (sysctl_tcp_vegas_cong_avoid) {
                tp->adv_cong = TCP_VEGAS;
                tp->vegas.baseRTT = 0x7fffffff;
                tcp_vegas_enable(tp);
        }
}

/* Do RTT sampling needed for Vegas.
 * Basically we:
 *   o min-filter RTT samples from within an RTT to get the current
 *     propagation delay + queuing delay (we are min-filtering to try to
 *     avoid the effects of delayed ACKs)
 *   o min-filter RTT samples from a much longer window (forever for now)
 *     to find the propagation delay (baseRTT)
 */
static inline void vegas_rtt_calc(struct tcp_opt *tp, __u32 rtt)
{
        __u32 vrtt = rtt + 1;   /* Never allow zero rtt or baseRTT */

        /* Filter to find propagation delay: */
        if (vrtt < tp->vegas.baseRTT)
                tp->vegas.baseRTT = vrtt;

        /* Find the min RTT during the last RTT to find
         * the current prop. delay + queuing delay:
         */
        tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
        tp->vegas.cntRTT++;
}

/* Called to compute a smoothed rtt estimate.  The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87].  The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
static void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
{
        long m = mrtt;  /* RTT */

        if (tcp_vegas_enabled(tp))
                vegas_rtt_calc(tp, mrtt);

        /* The following amusing code comes from Jacobson's
         * article in SIGCOMM '88.  Note that rtt and mdev
         * are scaled versions of rtt and mean deviation.
         * This is designed to be as fast as possible.
         * m stands for "measurement".
         *
         * In a 1990 paper the rto value is changed to:
         *      RTO = rtt + 4 * mdev
         *
         * Funny.  This algorithm seems to be very broken.
         * These formulae increase RTO when it should be decreased, increase
         * it too slowly when it should be increased quickly, decrease it
         * too quickly, etc.  I guess in BSD RTO takes ONE value, so it
         * absolutely does not matter how it is _calculated_.  It seems to
         * be a trap that VJ failed to avoid. 8)
         */
        if (m == 0)
                m = 1;
        if (tp->srtt != 0) {
                m -= (tp->srtt >> 3);   /* m is now error in rtt est */
                tp->srtt += m;          /* rtt = 7/8 rtt + 1/8 new */
                if (m < 0) {
                        m = -m;         /* m is now abs(error) */
                        m -= (tp->mdev >> 2);   /* similar update on mdev */
                        /* This is similar to one of the Eifel findings.
                         * Eifel blocks mdev updates when rtt decreases.
                         * This solution is a bit different: we use finer gain
                         * for mdev in this case (alpha*beta).
                         * Like Eifel it also prevents growth of rto,
                         * but it also limits too fast rto decreases,
                         * which happen in pure Eifel.
                         */
                        if (m > 0)
                                m >>= 3;
                } else {
                        m -= (tp->mdev >> 2);   /* similar update on mdev */
                }
                tp->mdev += m;          /* mdev = 3/4 mdev + 1/4 new */
                if (tp->mdev > tp->mdev_max) {
                        tp->mdev_max = tp->mdev;
                        if (tp->mdev_max > tp->rttvar)
                                tp->rttvar = tp->mdev_max;
                }
                if (after(tp->snd_una, tp->rtt_seq)) {
                        if (tp->mdev_max < tp->rttvar)
                                tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
                        tp->rtt_seq = tp->snd_nxt;
                        tp->mdev_max = TCP_RTO_MIN;
                }
        } else {
                /* no previous measure. */
                tp->srtt = m << 3;      /* take the measured time to be rtt */
                tp->mdev = m << 1;      /* make sure rto = 3*rtt */
                tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN);
                tp->rtt_seq = tp->snd_nxt;
        }

        tcp_westwood_update_rtt(tp, tp->srtt >> 3);
}
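/*
 * Illustrative userspace sketch (not part of tcp_input.c): the fixed-point
 * smoothing used by tcp_rtt_estimator() above.  srtt is kept scaled by 8
 * (so srtt >> 3 is the smoothed RTT) and mdev scaled by 4, which makes
 * (srtt >> 3) + mdev roughly Jacobson's RTT + 4 * mdev.  The struct and
 * function names are hypothetical, and the sketch omits the kernel's
 * Eifel-style damping of mdev and the rttvar/mdev_max tracking.
 */
#include <stdio.h>

struct demo_rtt {
        long srtt;      /* smoothed RTT, scaled by 8 */
        long mdev;      /* mean deviation, scaled by 4 */
};

static void demo_rtt_sample(struct demo_rtt *e, long m /* measured RTT */)
{
        if (e->srtt == 0) {             /* first sample */
                e->srtt = m << 3;
                e->mdev = m << 1;       /* start with rto = 3 * rtt */
                return;
        }
        m -= e->srtt >> 3;              /* error against current estimate */
        e->srtt += m;                   /* srtt = 7/8 srtt + 1/8 new */
        if (m < 0)
                m = -m;
        m -= e->mdev >> 2;
        e->mdev += m;                   /* mdev = 3/4 mdev + 1/4 |err| */
}

int main(void)
{
        struct demo_rtt e = { 0, 0 };
        long samples[] = { 100, 120, 80, 300 };  /* RTT samples in ms */
        unsigned i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                demo_rtt_sample(&e, samples[i]);
                printf("sample %ld -> srtt %ld ms, rto approx %ld ms\n",
                       samples[i], e.srtt >> 3, (e.srtt >> 3) + e.mdev);
        }
        return 0;
}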
"Erratic ACKs" has _nothing_ * to do with delayed acks, because at cwnd>2 true delack timeout * is invisible. Actually, Linux-2.4 also generates erratic * ACKs in some curcumstances. */ tp->rto = (tp->srtt >> 3) + tp->rttvar; /* 2. Fixups made earlier cannot be right. * If we do not estimate RTO correctly without them, * all the algo is pure shit and should be replaced * with correct one. It is exaclty, which we pretend to do. */}/* NOTE: clamping at TCP_RTO_MIN is not required, current algo * guarantees that rto is higher. */static __inline__ void tcp_bound_rto(struct tcp_opt *tp){ if (tp->rto > TCP_RTO_MAX) tp->rto = TCP_RTO_MAX;}/* Save metrics learned by this TCP session. This function is called only, when TCP finishes successfully i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. */void tcp_update_metrics(struct sock *sk){ struct tcp_opt *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); if (sysctl_tcp_nometrics_save) return; dst_confirm(dst); if (dst && (dst->flags&DST_HOST)) { int m; if (tp->backoff || !tp->srtt) { /* This session failed to estimate rtt. Why? * Probably, no packets returned in time. * Reset our results. */ if (!(dst_metric_locked(dst, RTAX_RTT))) dst->metrics[RTAX_RTT-1] = 0; return; } m = dst_metric(dst, RTAX_RTT) - tp->srtt; /* If newly calculated rtt larger than stored one, * store new one. Otherwise, use EWMA. Remember, * rtt overestimation is always better than underestimation. */ if (!(dst_metric_locked(dst, RTAX_RTT))) { if (m <= 0) dst->metrics[RTAX_RTT-1] = tp->srtt; else dst->metrics[RTAX_RTT-1] -= (m>>3); } if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { if (m < 0) m = -m; /* Scale deviation to rttvar fixed point */ m >>= 1; if (m < tp->mdev) m = tp->mdev; if (m >= dst_metric(dst, RTAX_RTTVAR)) dst->metrics[RTAX_RTTVAR-1] = m; else dst->metrics[RTAX_RTTVAR-1] -= (dst->metrics[RTAX_RTTVAR-1] - m)>>2; } if (tp->snd_ssthresh >= 0xFFFF) { /* Slow start still did not finish. */ if (dst_metric(dst, RTAX_SSTHRESH) && !dst_metric_locked(dst, RTAX_SSTHRESH) && (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; if (!dst_metric_locked(dst, RTAX_CWND) && tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; } else if (tp->snd_cwnd > tp->snd_ssthresh && tp->ca_state == TCP_CA_Open) { /* Cong. avoidance phase, cwnd is reliable. */ if (!dst_metric_locked(dst, RTAX_SSTHRESH)) dst->metrics[RTAX_SSTHRESH-1] = max(tp->snd_cwnd >> 1, tp->snd_ssthresh); if (!dst_metric_locked(dst, RTAX_CWND)) dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1; } else { /* Else slow start did not finish, cwnd is non-sense, ssthresh may be also invalid. */ if (!dst_metric_locked(dst, RTAX_CWND)) dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1; if (dst->metrics[RTAX_SSTHRESH-1] && !dst_metric_locked(dst, RTAX_SSTHRESH) && tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1]) dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; } if (!dst_metric_locked(dst, RTAX_REORDERING)) { if (dst->metrics[RTAX_REORDERING-1] < tp->reordering && tp->reordering != sysctl_tcp_reordering) dst->metrics[RTAX_REORDERING-1] = tp->reordering; } }}/* Numbers are taken from RFC2414. */__u32 tcp_init_cwnd(struct tcp_opt *tp, struct dst_entry *dst){ __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { if (tp->mss_cache_std > 1460) cwnd = 2; else cwnd = (tp->mss_cache_std > 1095) ? 
/* Initialize metrics on socket. */
static void tcp_init_metrics(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);

        if (dst == NULL)
                goto reset;

        dst_confirm(dst);

        if (dst_metric_locked(dst, RTAX_CWND))
                tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND);
        if (dst_metric(dst, RTAX_SSTHRESH)) {
                tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH);
                if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
                        tp->snd_ssthresh = tp->snd_cwnd_clamp;
        }
        if (dst_metric(dst, RTAX_REORDERING) &&
            tp->reordering != dst_metric(dst, RTAX_REORDERING)) {
                tp->sack_ok &= ~2;
                tp->reordering = dst_metric(dst, RTAX_REORDERING);
        }

        if (dst_metric(dst, RTAX_RTT) == 0)
                goto reset;

        if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3))
                goto reset;

        /* Initial rtt is determined from SYN,SYN-ACK.
         * The segment is small and rtt may appear much
         * less than the real one.  Use per-dst memory
         * to make it more realistic.
         *
         * A bit of theory.  RTT is the time passed after a "normal" sized
         * packet is sent until it is ACKed.  In normal circumstances sending
         * small packets forces the peer to delay ACKs and the calculation is
         * correct too.  The algorithm is adaptive and, provided we follow
         * specs, it NEVER underestimates RTT.  BUT! If the peer tries some
         * clever tricks, sort of "quick acks", for time long enough to decrease RTT