tcp_output.c
static void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std)
{
        if (skb->len <= mss_std) {
                /* Avoid the costly divide in the normal
                 * non-TSO case.
                 */
                skb_shinfo(skb)->tso_segs = 1;
                skb_shinfo(skb)->tso_size = 0;
        } else {
                unsigned int factor;

                factor = skb->len + (mss_std - 1);
                factor /= mss_std;
                skb_shinfo(skb)->tso_segs = factor;
                skb_shinfo(skb)->tso_size = mss_std;
        }
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct sk_buff *buff;
        int nsize = skb->len - len;
        u16 flags;

        if (skb_cloned(skb) &&
            skb_is_nonlinear(skb) &&
            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                return -ENOMEM;

        /* Get a new skb... force flag on. */
        buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
        if (buff == NULL)
                return -ENOMEM; /* We'll just try again later. */
        sk_charge_skb(sk, buff);

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
        TCP_SKB_CB(buff)->flags = flags;
        TCP_SKB_CB(buff)->sacked =
                (TCP_SKB_CB(skb)->sacked &
                 (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
        TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;

        if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
                /* Copy and checksum data tail into the new buffer. */
                buff->csum = csum_partial_copy_nocheck(skb->data + len,
                                                       skb_put(buff, nsize),
                                                       nsize, 0);

                skb_trim(skb, len);

                skb->csum = csum_block_sub(skb->csum, buff->csum, len);
        } else {
                skb->ip_summed = CHECKSUM_HW;
                skb_split(skb, buff, len);
        }

        buff->ip_summed = skb->ip_summed;

        /* Looks stupid, but our code really uses the when field of
         * skbs which it never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

        if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
                tcp_dec_pcount(&tp->lost_out, skb);
                tcp_dec_pcount(&tp->left_out, skb);
        }

        /* Fix up tso_factor for both original and new SKB. */
        tcp_set_skb_tso_segs(skb, tp->mss_cache_std);
        tcp_set_skb_tso_segs(buff, tp->mss_cache_std);

        if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
                tcp_inc_pcount(&tp->lost_out, skb);
                tcp_inc_pcount(&tp->left_out, skb);
        }

        if (TCP_SKB_CB(buff)->sacked & TCPCB_LOST) {
                tcp_inc_pcount(&tp->lost_out, buff);
                tcp_inc_pcount(&tp->left_out, buff);
        }

        /* Link BUFF into the send queue. */
        __skb_append(skb, buff);

        return 0;
}
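/*
 * Illustration only -- not part of tcp_output.c.  A minimal standalone
 * user-space sketch (compile it separately) of the arithmetic used
 * above: tso_segs is the ceiling of len/mss, computed without a divide
 * in the common single-segment case, and a split at `len` bytes leaves
 * the first skb with [seq, seq+len) and gives the new one
 * [seq+len, end_seq).  All numeric values are made up.
 */
#include <stdio.h>

static unsigned int tso_segs_for(unsigned int len, unsigned int mss)
{
        if (len <= mss)
                return 1;                       /* cheap path, no divide */
        return (len + mss - 1) / mss;           /* ceil(len / mss)       */
}

int main(void)
{
        unsigned int seq = 1000, end_seq = 5000, mss = 1460;
        unsigned int len = end_seq - seq;       /* 4000 bytes of payload */
        unsigned int cut = mss;                 /* split at one full MSS */

        printf("tso_segs(%u, %u) = %u\n", len, mss, tso_segs_for(len, mss));
        printf("first : seq=%u end_seq=%u\n", seq, seq + cut);
        printf("second: seq=%u end_seq=%u\n", seq + cut, end_seq);
        return 0;
}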
/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c
 * eventually).  The difference is that pulled data is not copied, but
 * immediately discarded.
 */
static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
{
        int i, k, eat;

        eat = len;
        k = 0;
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                if (skb_shinfo(skb)->frags[i].size <= eat) {
                        put_page(skb_shinfo(skb)->frags[i].page);
                        eat -= skb_shinfo(skb)->frags[i].size;
                } else {
                        skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
                        if (eat) {
                                skb_shinfo(skb)->frags[k].page_offset += eat;
                                skb_shinfo(skb)->frags[k].size -= eat;
                                eat = 0;
                        }
                        k++;
                }
        }
        skb_shinfo(skb)->nr_frags = k;

        skb->tail = skb->data;
        skb->data_len -= len;
        skb->len = skb->data_len;
        return skb->tail;
}

int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
        struct tcp_opt *tp = tcp_sk(sk);

        if (skb_cloned(skb) &&
            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                return -ENOMEM;

        if (len <= skb_headlen(skb)) {
                __skb_pull(skb, len);
        } else {
                if (__pskb_trim_head(skb, len - skb_headlen(skb)) == NULL)
                        return -ENOMEM;
        }

        TCP_SKB_CB(skb)->seq += len;
        skb->ip_summed = CHECKSUM_HW;

        skb->truesize -= len;
        sk->sk_queue_shrunk = 1;
        sk->sk_wmem_queued -= len;
        sk->sk_forward_alloc += len;

        /* Any change of skb->len requires recalculation of tso
         * factor and mss.
         */
        tcp_set_skb_tso_segs(skb, tp->mss_cache_std);

        return 0;
}

/* This function synchronizes snd mss to current pmtu/exthdr set.

   tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT count
   for TCP options, but includes only bare TCP header.

   tp->mss_clamp is mss negotiated at connection setup.
   It is minimum of user_mss and mss received with SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is last pmtu, seen by this function.

   tp->mss_cache is current effective sending mss, including
   all tcp options except for SACKs. It is evaluated,
   taking into account current pmtu, but never exceeds
   tp->mss_clamp.

   NOTE1. rfc1122 clearly states that advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
   this function.                       --ANK (980731)
 */
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        int mss_now;

        if (dst && dst->ops->get_mss)
                pmtu = dst->ops->get_mss(dst, pmtu);

        /* Calculate base mss without TCP options:
           It is MMS_S - sizeof(tcphdr) of rfc1122
         */
        mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

        /* Clamp it (mss_clamp does not include tcp options) */
        if (mss_now > tp->mss_clamp)
                mss_now = tp->mss_clamp;

        /* Now subtract optional transport overhead */
        mss_now -= tp->ext_header_len + tp->ext2_header_len;

        /* Then reserve room for full set of TCP options and 8 bytes of data */
        if (mss_now < 48)
                mss_now = 48;

        /* Now subtract TCP options size, not including SACKs */
        mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

        /* Bound mss with half of window */
        if (tp->max_window && mss_now > (tp->max_window >> 1))
                mss_now = max((tp->max_window >> 1), 68U - tp->tcp_header_len);

        /* And store cached results */
        tp->pmtu_cookie = pmtu;
        tp->mss_cache = tp->mss_cache_std = mss_now;

        return mss_now;
}
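/*
 * Illustration only -- not part of tcp_output.c.  A standalone
 * user-space sketch of the arithmetic in tcp_sync_mss() above for an
 * assumed IPv4 connection with no extension headers and TCP timestamps
 * enabled; the 68-byte floor used by the real code when clamping to
 * half the window is omitted for brevity.  With a 1500-byte PMTU:
 * 1500 - 20 (IP) - 20 (TCP) = 1460, clamped to the negotiated 1460,
 * minus 12 bytes of timestamp options = 1448 bytes of payload.
 */
#include <stdio.h>

int main(void)
{
        int pmtu           = 1500;      /* path MTU                       */
        int net_header_len = 20;        /* IPv4 header, no IP options     */
        int tcp_header_len = 20 + 12;   /* base TCP header + timestamps   */
        int mss_clamp      = 1460;      /* negotiated at connection setup */
        int max_window     = 65535;     /* largest window seen from peer  */
        int mss_now;

        mss_now = pmtu - net_header_len - 20;   /* MMS_S - sizeof(tcphdr) */
        if (mss_now > mss_clamp)
                mss_now = mss_clamp;
        if (mss_now < 48)                       /* room for options + 8B  */
                mss_now = 48;
        mss_now -= tcp_header_len - 20;         /* non-SACK TCP options   */
        if (max_window && mss_now > (max_window >> 1))
                mss_now = max_window >> 1;      /* bound by half window   */

        printf("effective sending mss = %d\n", mss_now);       /* 1448 */
        return 0;
}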
/* Compute the current effective MSS, taking SACKs and IP options,
 * and even PMTU discovery events into account.
 *
 * LARGESEND note: !urg_mode is overkill, only frames up to snd_up
 * cannot be large. However, taking into account rare use of URG, this
 * is not a big flaw.
 */
unsigned int tcp_current_mss(struct sock *sk, int large)
{
        struct tcp_opt *tp = tcp_sk(sk);
        struct dst_entry *dst = __sk_dst_get(sk);
        unsigned int do_large, mss_now;

        mss_now = tp->mss_cache_std;
        if (dst) {
                u32 mtu = dst_pmtu(dst);
                if (mtu != tp->pmtu_cookie ||
                    tp->ext2_header_len != dst->header_len)
                        mss_now = tcp_sync_mss(sk, mtu);
        }

        do_large = (large &&
                    (sk->sk_route_caps & NETIF_F_TSO) &&
                    !tp->urg_mode);

        if (do_large) {
                unsigned int large_mss, factor, limit;

                large_mss = 65535 - tp->af_specific->net_header_len -
                        tp->ext_header_len - tp->ext2_header_len -
                        tp->tcp_header_len;

                if (tp->max_window && large_mss > (tp->max_window >> 1))
                        large_mss = max((tp->max_window >> 1),
                                        68U - tp->tcp_header_len);

                factor = large_mss / mss_now;

                /* Always keep large mss multiple of real mss, but
                 * do not exceed 1/tso_win_divisor of the congestion window
                 * so we can keep the ACK clock ticking and minimize
                 * bursting.
                 */
                limit = tp->snd_cwnd;
                if (sysctl_tcp_tso_win_divisor)
                        limit /= sysctl_tcp_tso_win_divisor;
                limit = max(1U, limit);
                if (factor > limit)
                        factor = limit;

                tp->mss_cache = mss_now * factor;

                mss_now = tp->mss_cache;
        }

        if (tp->eff_sacks)
                mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
                            (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
        return mss_now;
}

/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 *
 * Returns 1, if no segments are in flight and we have queued segments, but
 * cannot send anything now because of SWS or another problem.
 */
int tcp_write_xmit(struct sock *sk, int nonagle)
{
        struct tcp_opt *tp = tcp_sk(sk);
        unsigned int mss_now;

        /* If we are closed, the bytes will have to remain here.
         * In time closedown will finish, we empty the write queue and all
         * will be happy.
         */
        if (sk->sk_state != TCP_CLOSE) {
                struct sk_buff *skb;
                int sent_pkts = 0;

                /* Account for SACKS, we may need to fragment due to this.
                 * It is just like the real MSS changing on us midstream.
                 * We also handle things correctly when the user adds some
                 * IP options mid-stream.  Silly to do, but cover it.
                 */
                mss_now = tcp_current_mss(sk, 1);

                while ((skb = sk->sk_send_head) &&
                       tcp_snd_test(tp, skb, mss_now,
                                    tcp_skb_is_last(sk, skb) ? nonagle :
                                                               TCP_NAGLE_PUSH)) {
                        if (skb->len > mss_now) {
                                if (tcp_fragment(sk, skb, mss_now))
                                        break;
                        }

                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
                                break;

                        /* Advance the send_head.  This one is sent out.
                         * This call will increment packets_out.
                         */
                        update_send_head(sk, tp, skb);

                        tcp_minshall_update(tp, mss_now, skb);
                        sent_pkts = 1;
                }

                if (sent_pkts) {
                        tcp_cwnd_validate(sk, tp);
                        return 0;
                }

                return !tcp_get_pcount(&tp->packets_out) && sk->sk_send_head;
        }
        return 0;
}
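/*
 * Illustration only -- not part of tcp_output.c.  A standalone
 * user-space sketch of how tcp_current_mss() above sizes a TSO super
 * segment: take as many real-MSS chunks as fit under 64 KB, then cap
 * the chunk count at snd_cwnd / tso_win_divisor so a single burst
 * never consumes more than that fraction of the congestion window.
 * The cwnd and divisor values below are made-up examples, not defaults.
 */
#include <stdio.h>

int main(void)
{
        unsigned int mss_now         = 1448;            /* per-segment payload    */
        unsigned int large_mss       = 65535 - 20 - 32; /* 64K - IP - TCP w/ TS   */
        unsigned int snd_cwnd        = 10;              /* cwnd, in packets       */
        unsigned int tso_win_divisor = 3;               /* example sysctl setting */
        unsigned int factor, limit;

        factor = large_mss / mss_now;           /* 45 full-size chunks fit */
        limit = snd_cwnd;
        if (tso_win_divisor)
                limit /= tso_win_divisor;       /* at most cwnd/divisor    */
        if (limit < 1)
                limit = 1;
        if (factor > limit)
                factor = limit;

        printf("TSO segment: %u x %u = %u bytes\n",
               factor, mss_now, factor * mss_now);      /* 3 x 1448 = 4344 */
        return 0;
}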
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria.  The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *      If the free space is less than the 1/4 of the maximum
 *      space available and the free space is less than 1/2 mss,
 *      then set the window to 0.
 *      [ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *      Otherwise, just prevent the window from shrinking
 *      and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue. It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
        struct tcp_opt *tp = tcp_sk(sk);
        /* MSS for the peer's data.  Previous versions used mss_clamp
         * here.  I don't know if the value based on our guesses
         * of peer's MSS is better for the performance.  It's more correct
         * but may be worse for the performance because of rcv_mss
         * fluctuations.  --SAW  1998/11/1
         */
        int mss = tp->ack.rcv_mss;
        int free_space = tcp_space(sk);
        int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
        int window;

        if (mss > full_space)
                mss = full_space;

        if (free_space < full_space/2) {
                tp->ack.quick = 0;

                if (tcp_memory_pressure)
                        tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);

                if (free_space < mss)
                        return 0;
        }

        if (free_space > tp->rcv_ssthresh)
                free_space = tp->rcv_ssthresh;

        /* Don't do rounding if we are using window scaling, since the
         * scaled window will not line up with the MSS boundary anyway.
         */
        window = tp->rcv_wnd;
        if (tp->rcv_wscale) {
                window = free_space;