📄 tcp_output.c
字号:
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.144 2001/11/06 22:21:08 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 *
 */

#include <net/tcp.h>

#include <linux/smp_lock.h>

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;

/* Book-keeping after SKB has been handed to the transmit path:
 * move the send head to the next queued buffer (or to NULL when SKB
 * was the last one on the write queue), advance SND.NXT past SKB and
 * start the retransmit timer if nothing was in flight before.
 */
static __inline__
void update_send_head(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
{
	struct sk_buff *queue_end = (struct sk_buff *) &sk->write_queue;
	struct sk_buff *following = skb->next;
	int was_idle;

	/* The list head itself terminates the write queue. */
	tp->send_head = (following != queue_end) ? following : NULL;

	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;

	/* Count the packet first, then arm the timer for the 0 -> 1 edge. */
	was_idle = (tp->packets_out == 0);
	tp->packets_out++;
	if (was_idle)
		tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
}

/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window.
:-( * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. OK, let's make this for now: */static __inline__ __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_opt *tp){ if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) return tp->snd_nxt; else return tp->snd_una+tp->snd_wnd;}/* Calculate mss to advertise in SYN segment. * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: * * 1. It is independent of path mtu. * 2. Ideally, it is maximal possible segment size i.e. 65535-40. * 3. For IPv4 it is reasonable to calculate it from maximal MTU of * attached devices, because some buggy hosts are confused by * large MSS. * 4. We do not make 3, we advertise MSS, calculated from first * hop device mtu, but allow to raise it to ip_rt_min_advmss. * This may be overriden via information stored in routing table. * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, * probably even Jumbo". */static __u16 tcp_advertise_mss(struct sock *sk){ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; if (dst && dst->advmss < mss) { mss = dst->advmss; tp->advmss = mss; } return (__u16)mss;}/* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. 
*/static void tcp_cwnd_restart(struct tcp_opt *tp){ s32 delta = tcp_time_stamp - tp->lsndtime; u32 restart_cwnd = tcp_init_cwnd(tp); u32 cwnd = tp->snd_cwnd; tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tp->snd_cwnd = max(cwnd, restart_cwnd); tp->snd_cwnd_stamp = tcp_time_stamp; tp->snd_cwnd_used = 0;}static __inline__ void tcp_event_data_sent(struct tcp_opt *tp, struct sk_buff *skb){ u32 now = tcp_time_stamp; if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) tcp_cwnd_restart(tp); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, enter pingpong mode. */ if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) tp->ack.pingpong = 1;}static __inline__ void tcp_event_ack_sent(struct sock *sk){ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); tcp_dec_quickack_mode(tp); tcp_clear_xmit_timer(sk, TCP_TIME_DACK);}/* Chose a new window to advertise, update state in tcp_opt for the * socket, and return result with RFC1323 scaling applied. The return * value can be stuffed directly into th->window for an outgoing * frame. */static __inline__ u16 tcp_select_window(struct sock *sk){ struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp); u32 cur_win = tcp_receive_window(tp); u32 new_win = __tcp_select_window(sk); /* Never shrink the offered window */ if(new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else * we will not be able to advertise a zero * window in time. --DaveM * * Relax Will Robinson. */ new_win = cur_win; } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; /* RFC1323 scaling applied */ new_win >>= tp->rcv_wscale; /* If we advertise zero window, disable fast path. */ if (new_win == 0) tp->pred_flags = 0; return new_win;}/* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. 
* All SKB's seen here are completely headerless. It is our * job to build the TCP header, and pass the packet down to * IP so it can do the same plus pass the packet off to the * device. * * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */
/* Return value:
 *   -ENOBUFS  when SKB is NULL (callers in this file pass the result of
 *             skb_clone() straight in, so a failed clone ends up here);
 *   otherwise the result of tp->af_specific->queue_xmit(), except that a
 *   positive NET_XMIT_CN is reported as 0.  Any positive result first
 *   triggers tcp_enter_cwr().
 */
int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
	if(skb != NULL) {
		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
		struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
		/* Default header size for this socket; recomputed below for
		 * SYN segments and enlarged when SACK blocks are pending.
		 */
		int tcp_header_size = tp->tcp_header_len;
		struct tcphdr *th;
		int sysctl_flags;
		int err;

/* Function-local bits remembering which options the SYN will carry;
 * undefined again at the end of the function.
 */
#define SYSCTL_FLAG_TSTAMPS	0x1
#define SYSCTL_FLAG_WSCALE	0x2
#define SYSCTL_FLAG_SACK	0x4

		sysctl_flags = 0;
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* SYN: size the header from the sysctl-enabled options
			 * instead of the per-socket default.
			 */
			tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
			if(sysctl_tcp_timestamps) {
				tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
			}
			if(sysctl_tcp_window_scaling) {
				tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
				sysctl_flags |= SYSCTL_FLAG_WSCALE;
			}
			if(sysctl_tcp_sack) {
				sysctl_flags |= SYSCTL_FLAG_SACK;
				/* Extra room for SACK-permitted is reserved only
				 * when timestamps are not being sent.
				 */
				if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
					tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
			}
		} else if (tp->eff_sacks) {
			/* A SACK is 2 pad bytes, a 2 byte header, plus
			 * 2 32-bit sequence numbers for each SACK block.
			 */
			tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
					    (tp->eff_sacks * TCPOLEN_SACK_PERBLOCK));
		}
		/* Make room for the header in front of the payload. */
		th = (struct tcphdr *) skb_push(skb, tcp_header_size);
		skb->h.th = th;
		skb_set_owner_w(skb, sk);

		/* Build TCP header and checksum it. */
		th->source		= sk->sport;
		th->dest		= sk->dport;
		th->seq			= htonl(tcb->seq);
		th->ack_seq		= htonl(tp->rcv_nxt);
		/* Seventh 16-bit word of the header: header length in 32-bit
		 * words in the top four bits, OR-ed with the flags from the
		 * control block.
		 */
		*(((__u16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) | tcb->flags);
		if (tcb->flags & TCPCB_FLAG_SYN) {
			/* RFC1323: The window in SYN & SYN/ACK segments
			 * is never scaled.
			 */
			th->window	= htons(tp->rcv_wnd);
		} else {
			th->window	= htons(tcp_select_window(sk));
		}
		th->check		= 0;
		th->urg_ptr		= 0;

		/* Urgent pointer is sent only while in urgent mode and only
		 * if snd_up lies within [seq+1, seq+0xFFFF] of this segment.
		 */
		if (tp->urg_mode &&
		    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
			th->urg_ptr		= htons(tp->snd_up-tcb->seq);
			th->urg			= 1;
		}

		/* Options are written directly after the fixed header. */
		if (tcb->flags & TCPCB_FLAG_SYN) {
			tcp_syn_build_options((__u32 *)(th + 1),
					      tcp_advertise_mss(sk),
					      (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
					      (sysctl_flags & SYSCTL_FLAG_SACK),
					      (sysctl_flags & SYSCTL_FLAG_WSCALE),
					      tp->rcv_wscale,
					      tcb->when,
					      tp->ts_recent);
		} else {
			tcp_build_and_update_options((__u32 *)(th + 1),
						     tp, tcb->when);

			TCP_ECN_send(sk, tp, skb, tcp_header_size);
		}
		/* Address-family specific checksum over the finished segment. */
		tp->af_specific->send_check(sk, th, skb->len, skb);

		if (tcb->flags & TCPCB_FLAG_ACK)
			tcp_event_ack_sent(sk);

		/* More than a bare header means the segment carries payload. */
		if (skb->len != tcp_header_size)
			tcp_event_data_sent(tp, skb);

		TCP_INC_STATS(TcpOutSegs);

		err = tp->af_specific->queue_xmit(skb);
		if (err <= 0)
			return err;

		/* Positive result from the lower layer: back off. */
		tcp_enter_cwr(tp);

		/* NET_XMIT_CN is special. It does not guarantee,
		 * that this packet is lost. It tells that device
		 * is about to start to drop packets or already
		 * drops some packets of the same priority and
		 * invokes us to send less aggressively.
		 */
		return err == NET_XMIT_CN ? 0 : err;
	}
	return -ENOBUFS;
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}

/* This is the main buffer sending routine. We queue the buffer
 * and decide whether to queue or transmit now.
 *
 * sk          - socket the buffer belongs to
 * skb         - buffer to append to sk->write_queue
 * force_queue - non-zero: never transmit from here, only queue
 * cur_mss     - passed through to tcp_snd_test() and tcp_minshall_update()
 *
 * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
 * otherwise socket can stall.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue, unsigned cur_mss)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

	/* Advance write_seq and place onto the write_queue. */
	tp->write_seq = TCP_SKB_CB(skb)->end_seq;
	__skb_queue_tail(&sk->write_queue, skb);
	tcp_charge_skb(sk, skb);

	/* Immediate transmit is attempted only when nothing is already
	 * waiting at the send head and tcp_snd_test() allows it.
	 */
	if (!force_queue && tp->send_head == NULL && tcp_snd_test(tp, skb, cur_mss, tp->nonagle)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		/* A clone is sent; the original stays on the write queue.
		 * A failed clone (NULL) makes tcp_transmit_skb() return
		 * -ENOBUFS, so we fall through and leave the skb queued.
		 */
		if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			tcp_minshall_update(tp, cur_mss, skb);
			/* First packet in flight starts the retransmit timer. */
			if (tp->packets_out++ == 0)
				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
			return;
		}
	}
	/* Queue it, remembering where we must start sending. */
	if (tp->send_head == NULL)
		tp->send_head = skb;
}

/* Send _single_ skb sitting at the send head. This function requires
 * true push pending frames to setup probe timer etc.
 *
 * NOTE(review): tp->send_head is dereferenced without a NULL check and is
 * reset to NULL (not to the next skb) after a successful send, so callers
 * presumably guarantee exactly one unsent skb is queued -- confirm.
 */
void tcp_push_one(struct sock *sk, unsigned cur_mss)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	struct sk_buff *skb = tp->send_head;

	/* Last argument is a constant 1 here, where tcp_send_skb()
	 * passes tp->nonagle.
	 */
	if (tcp_snd_test(tp, skb, cur_mss, 1)) {
		/* Send it out now. */
		TCP_SKB_CB(skb)->when = tcp_time_stamp;
		if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) {
			tp->send_head = NULL;
			tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
			/* First packet in flight starts the retransmit timer. */
			if (tp->packets_out++ == 0)
				tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
			return;
		}
	}
}

/* Split fragmented skb to two parts at length len.
 *
 * skb  - buffer to shorten to len bytes
 * skb1 - buffer receiving everything from offset len onwards
 * len  - split offset, in bytes from the start of skb's data
 *
 * No checksum is computed here.
 */
static void skb_split(struct sk_buff *skb, struct sk_buff *skb1, u32 len)
{
	int i;
	/* Bytes not accounted for in data_len, i.e. the part the comments
	 * below call the header.
	 */
	int pos = skb->len - skb->data_len;

	if (len < pos) {
		/* Split line is inside header. */
		memcpy(skb_put(skb1, pos-len), skb->data + len, pos-len);

		/* And move data appendix as is. */
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];

		skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
		skb_shinfo(skb)->nr_frags = 0;

		/* skb1->len already holds the copied header tail (skb_put
		 * above); add the fragment bytes it just took over.
		 */
		skb1->data_len = skb->data_len;
		skb1->len += skb1->data_len;
		skb->data_len = 0;
		skb->len = len;
		skb->tail = skb->data+len;
	} else {
		int k = 0;	/* number of fragments placed in skb1 so far */
		int nfrags = skb_shinfo(skb)->nr_frags;

		/* Second chunk has no header, nothing to copy. */

		/* skb's fragment count is rebuilt from zero by the loop. */
		skb_shinfo(skb)->nr_frags = 0;
		skb1->len = skb1->data_len = skb->len - len;
		skb->len = len;
		skb->data_len = len - pos;

		/* pos tracks the offset at which fragment i begins. */
		for (i=0; i<nfrags; i++) {
			int size = skb_shinfo(skb)->frags[i].size;
			if (pos + size > len) {
				/* Fragment reaches past the split point. */
				skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];

				if (pos < len) {
					/* Split frag.
					 * We have two variants in this case:
					 * 1. Move all the frag to the second
					 *    part, if it is possible. F.e.
					 *    this approach is mandatory for TUX,
					 *    where splitting is expensive.
					 * 2. Split it accurately. We make this.
					 */
					/* The page is now referenced from both skbs. */
					get_page(skb_shinfo(skb)->frags[i].page);
					/* Index 0 equals k here: pos < len means no
					 * earlier fragment reached past len, so this
					 * is the first one copied into skb1.
					 */
					skb_shinfo(skb1)->frags[0].page_offset += (len-pos);
					skb_shinfo(skb1)->frags[0].size -= (len-pos);
					skb_shinfo(skb)->frags[i].size = len-pos;
					skb_shinfo(skb)->nr_frags++;
				}
				k++;
			} else {
				/* Fragment lies entirely before the split: stays. */
				skb_shinfo(skb)->nr_frags++;
			}
			pos += size;
		}
		skb_shinfo(skb1)->nr_frags = k;
	}
}

/* Function to create two new TCP segments. Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list. This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 *
 * sk  - owning socket
 * skb - segment to shrink to len bytes
 * len - number of bytes that stay in skb
 *
 * Returns 0 on success, -ENOMEM if pskb_expand_head() or the allocation
 * of the second buffer fails.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
	struct sk_buff *buff;
	int nsize = skb->len - len;	/* bytes that move to the new segment */
	u16 flags;

	/* A cloned, non-linear skb is given a private head via
	 * pskb_expand_head() before being modified; give up on failure.
	 */
	if (skb_cloned(skb) &&
	    skb_is_nonlinear(skb) &&
	    pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
		return -ENOMEM;

	/* Get a new skb... force flag on. */
	buff = tcp_alloc_skb(sk, nsize, GFP_ATOMIC);
	if (buff == NULL)
		return -ENOMEM; /* We'll just try again later. */
	tcp_charge_skb(sk, buff);

	/* Correct the sequence numbers. */
	TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
	TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
	TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;

	/* PSH and FIN should only be set in the second packet. */
	flags = TCP_SKB_CB(skb)->flags;
	TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
	TCP_SKB_CB(buff)->flags = flags;
	/* The new segment inherits only these three scoreboard bits. */
	TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_EVER_RETRANS|TCPCB_AT_TAIL);
	if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
		/* One lost segment became two: keep the counters in step. */
		tp->lost_out++;
		tp->left_out++;
	}
	/* Only the second half can still be "at tail". */
	TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;

	if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
		/* Copy and checksum data tail into the new buffer. */
		buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
						       nsize, 0);

		skb_trim(skb, len);

		/* Take the tail's contribution out of the original checksum. */
		skb->csum = csum_block_sub(skb->csum, buff->csum, len);
	} else {
		/* Fragments present or checksum already CHECKSUM_HW: mark
		 * CHECKSUM_HW and split without computing a checksum.
		 */
		skb->ip_summed = CHECKSUM_HW;
		skb_split(skb, buff, len);
	}

	buff->ip_summed = skb->ip_summed;

	/* Looks stupid, but our code really uses when of
	 * skbs, which it never sent before. --ANK
	 */
	TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

	/* Link BUFF into the send queue. */
	__skb_append(skb, buff);

	return 0;
}

/* This function synchronize snd mss to current pmtu/exthdr set. tp->user_mss is mss set by user by TCP_MAXSEG. It does NOT counts for TCP options, but includes only bare TCP header.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -