tcp_output.c
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int mss_now;

	/* If we are closed, the bytes will have to remain here.
	 * In time closedown will finish, we empty the write queue and all
	 * will be happy.
	 */
	if (sk->state != TCP_CLOSE) {
		struct sk_buff *skb;
		int sent_pkts = 0;

		/* Account for SACKS, we may need to fragment due to this.
		 * It is just like the real MSS changing on us midstream.
		 * We also handle things correctly when the user adds some
		 * IP options mid-stream.  Silly to do, but cover it.
		 */
		mss_now = tcp_current_mss(sk);

		while ((skb = tp->send_head) &&
		       tcp_snd_test(tp, skb, mss_now,
				    tcp_skb_is_last(sk, skb) ? tp->nonagle : 1)) {
			if (skb->len > mss_now) {
				if (tcp_fragment(sk, skb, mss_now))
					break;
			}

			TCP_SKB_CB(skb)->when = tcp_time_stamp;
			if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)))
				break;

			/* Advance the send_head.  This one is sent out. */
			update_send_head(sk, tp, skb);

			tcp_minshall_update(tp, mss_now, skb);
			sent_pkts = 1;
		}

		if (sent_pkts) {
			tcp_cwnd_validate(sk, tp);
			return 0;
		}

		return !tp->packets_out && tp->send_head;
	}
	return 0;
}
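/* Illustrative sketch (editor's note, not part of tcp_output.c): a
 * simplified userspace model of the checks that gate the transmit loop
 * above.  The real tcp_snd_test() also handles FIN, urgent data and the
 * Minshall variant of Nagle; this only captures the three core
 * conditions.  All names below are hypothetical, and plain comparisons
 * stand in for the kernel's wraparound-safe before()/after() macros.
 */
#include <stdbool.h>

struct example_tx_state {
	unsigned int snd_una;		/* oldest unacknowledged sequence */
	unsigned int snd_nxt;		/* next sequence to be sent */
	unsigned int snd_wnd;		/* receiver's advertised window */
	unsigned int snd_cwnd;		/* congestion window, in packets */
	unsigned int packets_out;	/* packets currently in flight */
	bool nonagle;			/* true if Nagle is disabled */
};

static bool example_may_transmit(const struct example_tx_state *s,
				 unsigned int seg_len, unsigned int mss)
{
	/* 1. Receiver window: the segment must end at or before
	 *    snd_una + snd_wnd.
	 */
	if (s->snd_nxt + seg_len > s->snd_una + s->snd_wnd)
		return false;

	/* 2. Congestion window: no more than snd_cwnd packets in flight. */
	if (s->packets_out >= s->snd_cwnd)
		return false;

	/* 3. Nagle: hold back a sub-MSS segment while earlier data is
	 *    still unacknowledged, unless Nagle is disabled.
	 */
	if (seg_len < mss && s->packets_out && !s->nonagle)
		return false;

	return true;
}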
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.
 *
 * Strictly speaking, keeping th->window fixed violates the receiver
 * side SWS prevention criteria.  The problem is that under this rule
 * a stream of single byte packets will cause the right side of the
 * window to always advance by a single byte.
 *
 * Of course, if the sender implements sender side SWS prevention
 * then this will not be a problem.
 *
 * BSD seems to make the following compromise:
 *
 *	If the free space is less than the 1/4 of the maximum
 *	space available and the free space is less than 1/2 mss,
 *	then set the window to 0.
 *	[ Actually, bsd uses MSS and 1/4 of maximal _window_ ]
 *	Otherwise, just prevent the window from shrinking
 *	and from being larger than the largest representable value.
 *
 * This prevents incremental opening of the window in the regime
 * where TCP is limited by the speed of the reader side taking
 * data out of the TCP receive queue.  It does nothing about
 * those cases where the window is constrained on the sender side
 * because the pipeline is full.
 *
 * BSD also seems to "accidentally" limit itself to windows that are a
 * multiple of MSS, at least until the free space gets quite small.
 * This would appear to be a side effect of the mbuf implementation.
 * Combining these two algorithms results in the observed behavior
 * of having a fixed window size at almost all times.
 *
 * Below we obtain similar behavior by forcing the offered window to
 * a multiple of the mss when it is feasible to do so.
 *
 * Note, we don't "adjust" for TIMESTAMP or SACK option bytes.
 * Regular options like TIMESTAMP are taken into account.
 */
u32 __tcp_select_window(struct sock *sk)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	/* MSS for the peer's data.  Previous versions used mss_clamp
	 * here.  I don't know if the value based on our guesses
	 * of peer's MSS is better for the performance.  It's more correct
	 * but may be worse for the performance because of rcv_mss
	 * fluctuations.  --SAW  1998/11/1
	 */
	unsigned int mss = tp->ack.rcv_mss;
	int free_space;
	u32 window;

	/* Sometimes free_space can be < 0. */
	free_space = tcp_space(sk);
	if (tp->window_clamp < mss)
		mss = tp->window_clamp;

	if (free_space < (int)min(tp->window_clamp, tcp_full_space(sk)) / 2) {
		tp->ack.quick = 0;

		if (tcp_memory_pressure)
			tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4*tp->advmss);

		if (free_space < ((int)mss))
			return 0;
	}

	if (free_space > tp->rcv_ssthresh)
		free_space = tp->rcv_ssthresh;

	/* Get the largest window that is a nice multiple of mss.
	 * Window clamp already applied above.
	 * If our current window offering is within 1 mss of the
	 * free space we just keep it.  This prevents the divide
	 * and multiply from happening most of the time.
	 * We also don't do any window rounding when the free space
	 * is too small.
	 */
	window = tp->rcv_wnd;
	if ((((int) window) <= (free_space - ((int) mss))) ||
	    (((int) window) > free_space))
		window = (((unsigned int) free_space)/mss)*mss;

	return window;
}
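/* Illustrative sketch (editor's note, not part of tcp_output.c): the
 * rounding rule at the end of __tcp_select_window() above, shown as a
 * standalone calculation.  Keeping the offered window a multiple of the
 * MSS is what gives the near-constant advertised window described in
 * the long comment; the offer is only recomputed when it drifts more
 * than one MSS away from the free space.  Function and variable names
 * here are hypothetical, and the early-return guard is a simplification
 * of the memory-pressure checks the kernel does first.
 */
static unsigned int example_select_window(unsigned int cur_window,
					  int free_space, unsigned int mss)
{
	/* Too little free space: offer nothing rather than a tiny window. */
	if (free_space < (int)mss)
		return 0;

	/* Keep the current offer if it is within one MSS of the free
	 * space; this avoids the divide/multiply on the common path and
	 * keeps the advertised window stable for header prediction.
	 */
	if ((int)cur_window > free_space - (int)mss &&
	    (int)cur_window <= free_space)
		return cur_window;

	/* Otherwise round the free space down to a multiple of the MSS.
	 * E.g. free_space = 10000, mss = 1460  ->  window = 8760 (6 * 1460).
	 */
	return ((unsigned int)free_space / mss) * mss;
}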
/* Attempt to collapse two adjacent SKB's during retransmission. */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct sk_buff *next_skb = skb->next;

	/* The first test we must make is that neither of these two
	 * SKB's is still referenced by someone else.
	 */
	if (!skb_cloned(skb) && !skb_cloned(next_skb)) {
		int skb_size = skb->len, next_skb_size = next_skb->len;
		u16 flags = TCP_SKB_CB(skb)->flags;

		/* Also punt if next skb has been SACK'd. */
		if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED)
			return;

		/* Next skb is out of window. */
		if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd))
			return;

		/* Punt if not enough space exists in the first SKB for
		 * the data in the second, or the total combined payload
		 * would exceed the MSS.
		 */
		if ((next_skb_size > skb_tailroom(skb)) ||
		    ((skb_size + next_skb_size) > mss_now))
			return;

		/* Ok.  We will be able to collapse the packet. */
		__skb_unlink(next_skb, next_skb->list);

		if (skb->len % 4) {
			/* Must copy and rechecksum all data. */
			memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
			skb->csum = csum_partial(skb->data, skb->len, 0);
		} else {
			/* Optimize: actually we could also combine next_skb->csum
			 * into skb->csum using a single add w/carry operation too.
			 */
			skb->csum = csum_partial_copy_nocheck(next_skb->data,
							      skb_put(skb, next_skb_size),
							      next_skb_size, skb->csum);
		}

		/* Update sequence range on original skb. */
		TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;

		/* Merge over control information. */
		flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
		TCP_SKB_CB(skb)->flags = flags;

		/* All done, get rid of second SKB and account for it so
		 * packet counting does not break.
		 */
		TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS)
			tp->retrans_out--;
		if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) {
			tp->lost_out--;
			tp->left_out--;
		}
		if (!tp->sack_ok && tp->sacked_out) {
			/* Reno case is special. Sigh... */
			tp->sacked_out--;
			tp->left_out--;
		}
		/* Not quite right: it can be > snd.fack, but
		 * it is better to underestimate fackets.
		 */
		if (tp->fackets_out)
			tp->fackets_out--;
		tcp_free_skb(sk, next_skb);
		tp->packets_out--;
	}
}

/* Do a simple retransmit without using the backoff mechanisms in
 * tcp_timer.  This is used for path MTU discovery.
 * The socket is already locked here.
 */
void tcp_simple_retransmit(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;
	unsigned int mss = tcp_current_mss(sk);
	int lost = 0;

	for_retrans_queue(skb, sk, tp) {
		if (skb->len > mss &&
		    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
			if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
				TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
				tp->retrans_out--;
			}
			if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) {
				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
				tp->lost_out++;
				lost = 1;
			}
		}
	}

	if (!lost)
		return;

	tp->left_out = tp->sacked_out + tp->lost_out;

	/* Don't muck with the congestion window here.
	 * Reason is that we do not increase amount of _data_
	 * in network, but units changed and effective
	 * cwnd/ssthresh really reduced now.
	 */
	if (tp->ca_state != TCP_CA_Loss) {
		tp->high_seq = tp->snd_nxt;
		tp->snd_ssthresh = tcp_current_ssthresh(tp);
		tp->prior_ssthresh = 0;
		tp->undo_marker = 0;
		tp->ca_state = TCP_CA_Loss;
	}
	tcp_xmit_retransmit_queue(sk);
}

/* This retransmits one SKB.  Policy decisions and retransmit queue
 * state updates are done by the caller.  Returns non-zero if an
 * error occurred which prevented the send.
 */
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	unsigned int cur_mss = tcp_current_mss(sk);
	int err;

	/* Do not send more than we queued.  1/4 is reserved for possible
	 * copying overhead: fragmentation, tunneling, mangling etc.
	 */
	if (atomic_read(&sk->wmem_alloc) > min(sk->wmem_queued+(sk->wmem_queued>>2),sk->sndbuf))
		return -EAGAIN;

	if (skb->len > cur_mss) {
		if (tcp_fragment(sk, skb, cur_mss))
			return -ENOMEM; /* We'll try again later. */

		/* New SKB created, account for it. */
		tp->packets_out++;
	}

	/* Collapse two adjacent packets if worthwhile and we can. */
	if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) &&
	    (skb->len < (cur_mss >> 1)) &&
	    (skb->next != tp->send_head) &&
	    (skb->next != (struct sk_buff *)&sk->write_queue) &&
	    (sysctl_tcp_retrans_collapse != 0))
		tcp_retrans_try_collapse(sk, skb, cur_mss);

	if (tp->af_specific->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	/* Some Solaris stacks overoptimize and ignore the FIN on a
	 * retransmit when old data is attached.  So strip it off
	 * since it is cheap to do so and saves bytes on the network.
	 */
	if (skb->len > 0 &&
	    (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
	    tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
		TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
		skb_trim(skb, 0);
		skb->csum = 0;
	}

	/* Make a copy, if the first transmission SKB clone we made
	 * is still in somebody's hands, else make a clone.
	 */
	TCP_SKB_CB(skb)->when = tcp_time_stamp;

	err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
				    skb_copy(skb, GFP_ATOMIC) :
				    skb_clone(skb, GFP_ATOMIC)));

	if (err == 0) {
		/* Update global TCP statistics. */
		TCP_INC_STATS(TcpRetransSegs);

#if FASTRETRANS_DEBUG > 0
		if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
			if (net_ratelimit())
				printk(KERN_DEBUG "retrans_out leaked.\n");
		}
#endif
		TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS;
		tp->retrans_out++;

		/* Save stamp of the first retransmit. */
		if (!tp->retrans_stamp)
			tp->retrans_stamp = TCP_SKB_CB(skb)->when;

		tp->undo_retrans++;

		/* snd_nxt is stored to detect loss of retransmitted segment,
		 * see tcp_input.c tcp_sacktag_write_queue().
		 */
		TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt;
	}
	return err;
}
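/* Illustrative sketch (editor's note, not part of tcp_output.c): the
 * conditions under which a retransmitted segment and its successor can
 * be merged, distilled from tcp_retransmit_skb() and
 * tcp_retrans_try_collapse() above.  The flat struct and all names are
 * hypothetical; the real code works on struct sk_buff and the TCP
 * control block, and uses after() for wraparound-safe sequence
 * comparison where a plain comparison is used here.
 */
#include <stdbool.h>

struct example_seg {
	unsigned int len;		/* payload bytes in this segment */
	unsigned int tailroom;		/* spare bytes after the payload */
	unsigned int end_seq;		/* sequence number past the last byte */
	bool shared;			/* still referenced elsewhere (cloned) */
	bool sacked;			/* already SACKed by the receiver */
};

static bool example_can_collapse(const struct example_seg *seg,
				 const struct example_seg *next,
				 unsigned int mss,
				 unsigned int snd_una, unsigned int snd_wnd)
{
	if (seg->shared || next->shared)	/* data still referenced elsewhere */
		return false;
	if (next->sacked)			/* receiver already has it */
		return false;
	if (next->end_seq > snd_una + snd_wnd)	/* next segment is outside the send window */
		return false;
	if (next->len > seg->tailroom)		/* no room to copy the payload */
		return false;
	if (seg->len + next->len > mss)		/* merged segment would exceed MSS */
		return false;
	return true;
}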
/* This gets called after a retransmit timeout, and the initially
 * retransmitted data is acknowledged.  It tries to continue
 * resending the rest of the retransmit queue, until either
 * we've sent it all or the congestion window limit is reached.
 * If doing SACK, the first ACK which comes back for a timeout
 * based retransmit packet might feed us FACK information again.
 * If so, we use it to avoid unnecessary retransmissions.
 */
void tcp_xmit_retransmit_queue(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb;
	int packet_cnt = tp->lost_out;

	/* First pass: retransmit lost packets. */
	if (packet_cnt) {
		for_retrans_queue(skb, sk, tp) {
			__u8 sacked = TCP_SKB_CB(skb)->sacked;

			if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
				return;

			if (sacked&TCPCB_LOST) {
				if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
					if (tcp_retransmit_skb(sk, skb))
						return;
					if (tp->ca_state != TCP_CA_Loss)
						NET_INC_STATS_BH(TCPFastRetrans);
					else
						NET_INC_STATS_BH(TCPSlowStartRetrans);

					if (skb == skb_peek(&sk->write_queue))
						tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
				}

				if (--packet_cnt <= 0)
					break;
			}
		}
	}

	/* OK, demanded retransmission is finished. */

	/* Forward retransmissions are possible only during Recovery. */
	if (tp->ca_state != TCP_CA_Recovery)
		return;

	/* No forward retransmissions in Reno are possible. */
	if (!tp->sack_ok)
		return;

	/* Yeah, we have to make a difficult choice between forward transmission
	 * and retransmission...  Both ways have their merits...
	 *
	 * For now we do not retransmit anything, while we have some new
	 * segments to send.
	 */
	if (tcp_may_send_now(sk, tp))
		return;

	packet_cnt = 0;

	for_retrans_queue(skb, sk, tp) {
		if (++packet_cnt > tp->fackets_out)
			break;

		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
			break;

		if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS)
			continue;

		/* Ok, retransmit it. */
		if (tcp_retransmit_skb(sk, skb))
			break;

		if (skb == skb_peek(&sk->write_queue))
			tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);

		NET_INC_STATS_BH(TCPForwardRetrans);
	}
}

/* Send a FIN.  The caller locks the socket for us.  This cannot be
 * allowed to fail queueing a FIN frame under any circumstances.
 */
void tcp_send_fin(struct sock *sk)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = skb_peek_tail(&sk->write_queue);
	unsigned int mss_now;

	/* Optimization, tack on the FIN if we have a queue of
	 * unsent frames.  But be careful about outgoing SACKS
	 * and IP options.
	 */
	mss_now = tcp_current_mss(sk);

	/* Please, find seven differences of 2.3.33 and look
	 * what I broke here. 8) --ANK
	 */
	if (tp->send_head != NULL) {
		/* tcp_write_xmit() takes care of the rest. */
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
		TCP_SKB_CB(skb)->end_seq++;
		tp->write_seq++;

		/* Special case to avoid Nagle bogosity.  If this
		 * segment is the last segment, and it was queued
		 * due to Nagle/SWS-avoidance, send it out now.
		 */
		if (tp->send_head == skb &&
		    !after(tp->write_seq, tp->snd_una + tp->snd_wnd)) {