tcp_output.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.108.2.1 1999/05/14 23:07:36 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *
 */

#include <net/tcp.h>

extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse = 1;

/* Get rid of any delayed acks, we sent one already.. */
static __inline__ void clear_delayed_acks(struct sock * sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        tp->delayed_acks = 0;
        if(tcp_in_quickack_mode(tp))
                tcp_exit_quickack_mode(tp);
        tcp_clear_xmit_timer(sk, TIME_DACK);
}

static __inline__ void update_send_head(struct sock *sk)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;

        tp->send_head = tp->send_head->next;
        if (tp->send_head == (struct sk_buff *) &sk->write_queue)
                tp->send_head = NULL;
}

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *
 * We are working here with either a clone of the original
 * SKB, or a fresh unique copy made by the retransmit engine.
 */
void tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
{
        if(skb != NULL) {
                struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
                struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
                int tcp_header_size = tp->tcp_header_len;
                struct tcphdr *th;
                int sysctl_flags;

#define SYSCTL_FLAG_TSTAMPS 0x1
#define SYSCTL_FLAG_WSCALE  0x2
#define SYSCTL_FLAG_SACK    0x4

                sysctl_flags = 0;
                if(tcb->flags & TCPCB_FLAG_SYN) {
                        tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
                        if(sysctl_tcp_timestamps) {
                                tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
                        }
                        if(sysctl_tcp_window_scaling) {
                                tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
                                sysctl_flags |= SYSCTL_FLAG_WSCALE;
                        }
                        if(sysctl_tcp_sack) {
                                sysctl_flags |= SYSCTL_FLAG_SACK;
                                if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
                                        tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
                        }
                } else if(tp->sack_ok && tp->num_sacks) {
                        /* A SACK is 2 pad bytes, a 2 byte header, plus
                         * 2 32-bit sequence numbers for each SACK block.
                         */
                        tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
                                            (tp->num_sacks * TCPOLEN_SACK_PERBLOCK));
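                        /* Worked example (illustrative, not from the original
                         * source): with tp->num_sacks == 3 this adds
                         * TCPOLEN_SACK_BASE_ALIGNED (4 = 2 pad bytes + 2 byte
                         * header) plus 3 * TCPOLEN_SACK_PERBLOCK (3 * 8 = 24),
                         * i.e. 28 bytes of option space on top of the basic
                         * header.
                         */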
                }
                th = (struct tcphdr *) skb_push(skb, tcp_header_size);
                skb->h.th = th;
                skb_set_owner_w(skb, sk);

                /* Build TCP header and checksum it. */
                th->source = sk->sport;
                th->dest = sk->dport;
                th->seq = htonl(TCP_SKB_CB(skb)->seq);
                th->ack_seq = htonl(tp->rcv_nxt);
                th->doff = (tcp_header_size >> 2);
                th->res1 = 0;
                *(((__u8 *)th) + 13) = tcb->flags;
                if(!(tcb->flags & TCPCB_FLAG_SYN))
                        th->window = htons(tcp_select_window(sk));
                th->check = 0;
                th->urg_ptr = ntohs(tcb->urg_ptr);
                if(tcb->flags & TCPCB_FLAG_SYN) {
                        /* RFC1323: The window in SYN & SYN/ACK segments
                         * is never scaled.
                         */
                        th->window = htons(tp->rcv_wnd);
                        tcp_syn_build_options((__u32 *)(th + 1), tp->mss_clamp,
                                              (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
                                              (sysctl_flags & SYSCTL_FLAG_SACK),
                                              (sysctl_flags & SYSCTL_FLAG_WSCALE),
                                              tp->rcv_wscale,
                                              TCP_SKB_CB(skb)->when,
                                              tp->ts_recent);
                } else {
                        tcp_build_and_update_options((__u32 *)(th + 1),
                                                     tp, TCP_SKB_CB(skb)->when);
                }
                tp->af_specific->send_check(sk, th, skb->len, skb);

                clear_delayed_acks(sk);
                tp->last_ack_sent = tp->rcv_nxt;
                tcp_statistics.TcpOutSegs++;
                tp->af_specific->queue_xmit(skb);
        }
#undef SYSCTL_FLAG_TSTAMPS
#undef SYSCTL_FLAG_WSCALE
#undef SYSCTL_FLAG_SACK
}

/* This is the main buffer sending routine.  We queue the buffer
 * and decide whether to queue or transmit now.
 */
void tcp_send_skb(struct sock *sk, struct sk_buff *skb, int force_queue)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

        /* Advance write_seq and place onto the write_queue. */
        tp->write_seq += (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq);
        __skb_queue_tail(&sk->write_queue, skb);

        if (!force_queue && tp->send_head == NULL && tcp_snd_test(sk, skb)) {
                /* Send it out now. */
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
                tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                tp->packets_out++;
                tcp_transmit_skb(sk, skb_clone(skb, GFP_KERNEL));
                if(!tcp_timer_is_set(sk, TIME_RETRANS))
                        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
        } else {
                /* Queue it, remembering where we must start sending. */
                if (tp->send_head == NULL)
                        tp->send_head = skb;
                if (!force_queue && tp->packets_out == 0 && !tp->pending) {
                        tp->pending = TIME_PROBE0;
                        tcp_reset_xmit_timer(sk, TIME_PROBE0, tp->rto);
                }
        }
}

/* Function to create two new TCP segments.  Shrinks the given segment
 * to the specified size and appends a new segment with the rest of the
 * packet to the list.  This won't be called frequently, I hope.
 * Remember, these are still headerless SKBs at this point.
 */
static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len)
{
        struct sk_buff *buff;
        int nsize = skb->len - len;
        u16 flags;

        /* Get a new skb... force flag on. */
        buff = sock_wmalloc(sk,
                            (nsize + MAX_HEADER + sk->prot->max_header),
                            1, GFP_ATOMIC);
        if (buff == NULL)
                return -1; /* We'll just try again later. */

        /* Reserve space for headers. */
        skb_reserve(buff, MAX_HEADER + sk->prot->max_header);

        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
        TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq;
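        /* Worked example (illustrative, not from the original source):
         * splitting a segment covering sequence numbers 1000..2000 at
         * len == 600 leaves skb with 1000..1600 and gives buff 1600..2000,
         * so the two halves can be sent and retransmitted independently.
         * The flag and urgent-pointer fixups follow below.
         */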
        /* PSH and FIN should only be set in the second packet. */
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH);
        if(flags & TCPCB_FLAG_URG) {
                u16 old_urg_ptr = TCP_SKB_CB(skb)->urg_ptr;

                /* Urgent data is always a pain in the ass. */
                if(old_urg_ptr > len) {
                        TCP_SKB_CB(skb)->flags &= ~(TCPCB_FLAG_URG);
                        TCP_SKB_CB(skb)->urg_ptr = 0;
                        TCP_SKB_CB(buff)->urg_ptr = old_urg_ptr - len;
                } else {
                        flags &= ~(TCPCB_FLAG_URG);
                }
        }
        if(!(flags & TCPCB_FLAG_URG))
                TCP_SKB_CB(buff)->urg_ptr = 0;
        TCP_SKB_CB(buff)->flags = flags;
        TCP_SKB_CB(buff)->sacked = 0;

        /* Copy and checksum data tail into the new buffer. */
        buff->csum = csum_partial_copy(skb->data + len,
                                       skb_put(buff, nsize),
                                       nsize, 0);

        /* This takes care of the FIN sequence number too. */
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq;
        skb_trim(skb, len);

        /* Rechecksum original buffer. */
        skb->csum = csum_partial(skb->data, skb->len, 0);

        /* Looks stupid, but our code really uses the 'when' field of
         * skbs it has never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;

        /* Link BUFF into the send queue. */
        __skb_append(skb, buff);

        return 0;
}

/* This function synchronizes snd mss to the current pmtu/exthdr set.

   tp->user_mss is the mss set by the user via TCP_MAXSEG.  It does NOT
   account for TCP options; it covers only the bare TCP header.

   tp->mss_clamp is the mss negotiated at connection setup.
   It is the minimum of user_mss and the mss received with the SYN.
   It also does not include TCP options.

   tp->pmtu_cookie is the last pmtu seen by this function.

   tp->mss_cache is the current effective sending mss, including
   all tcp options except SACKs.  It is evaluated taking the current
   pmtu into account, but never exceeds tp->mss_clamp.

   NOTE1. rfc1122 clearly states that the advertised MSS
   DOES NOT include either tcp or ip options.

   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY
   outside this function. --ANK (980731)
 */
int tcp_sync_mss(struct sock *sk, u32 pmtu)
{
        struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
        int mss_now;

        /* Calculate base mss without TCP options:
           It is MMS_S - sizeof(tcphdr) of rfc1122.
         */
        mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);

        /* Clamp it (mss_clamp does not include tcp options) */
        if (mss_now > tp->mss_clamp)
                mss_now = tp->mss_clamp;

        /* Now subtract TCP options size, not including SACKs */
        mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);

        /* Now subtract optional transport overhead */
        mss_now -= tp->ext_header_len;

        /* If we end up with too small (or even a negative) value,
           clamp it to 8 from below.  Why 8?  Well, it could be 1 with
           the same success, but if IP accepted a segment of length 1,
           it would love 8 even more 8)  --ANK (980731)
         */
        if (mss_now < 8)
                mss_now = 8;

        /* And store cached results */
        tp->pmtu_cookie = pmtu;
        tp->mss_cache = mss_now;

        return mss_now;
}
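/* Illustrative sketch, not part of the original file: the same
 * arithmetic tcp_sync_mss() performs, specialized to the common case
 * of IPv4 (20 byte network header) with the RFC 1323 timestamp option
 * enabled (TCPOLEN_TSTAMP_ALIGNED == 12) and no extension headers.
 * The function name and the bare integer constants are assumptions
 * made only for this example.
 */
static int example_ipv4_mss(int pmtu, int mss_clamp)
{
        int mss = pmtu - 20 - sizeof(struct tcphdr); /* strip IP + TCP headers */

        if (mss > mss_clamp)
                mss = mss_clamp;  /* never exceed the negotiated clamp */
        mss -= 12;                /* aligned timestamp option */
        if (mss < 8)
                mss = 8;          /* same floor as tcp_sync_mss() */
        return mss;               /* e.g. pmtu 1500 -> 1500 - 40 - 12 = 1448 */
}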
/* This routine writes packets to the network.  It advances the
 * send_head.  This happens as incoming acks open up the remote
 * window for us.
 */
void tcp_write_xmit(struct sock *sk)
{
        struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
        unsigned int mss_now;

        /* Account for SACKS, we may need to fragment due to this.
         * It is just like the real MSS changing on us midstream.
         * We also handle things correctly when the user adds some
         * IP options mid-stream.  Silly to do, but cover it.
         */
        mss_now = tcp_current_mss(sk);

        /* If we are zapped, the bytes will have to remain here.
         * In time closedown will empty the write queue and all
         * will be happy.
         */
        if(!sk->zapped) {
                struct sk_buff *skb;
                int sent_pkts = 0;

                /* Anything on the transmit queue that fits the window can
                 * be added providing we are:
                 *
                 * a) following SWS avoidance [and Nagle algorithm]
                 * b) not exceeding our congestion window.
                 * c) not retransmitting [Nagle]
                 */
                while((skb = tp->send_head) && tcp_snd_test(sk, skb)) {
                        if (skb->len > mss_now) {
                                if (tcp_fragment(sk, skb, mss_now))
                                        break;
                        }

                        /* Advance the send_head.  This one is going out. */
                        update_send_head(sk);
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
                        tp->packets_out++;
                        tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
                        sent_pkts = 1;
                }

                /* If we sent anything, make sure the retransmit
                 * timer is active.
                 */
                if (sent_pkts && !tcp_timer_is_set(sk, TIME_RETRANS))
                        tcp_reset_xmit_timer(sk, TIME_RETRANS, tp->rto);
        }
}
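/* Worked example (illustrative, not from the original source): with
 * mss_now == 1460 and a 4000 byte skb at the send head, the loop in
 * tcp_write_xmit() first splits it via tcp_fragment() into 1460 and
 * 2540 byte pieces, transmits the 1460 byte piece, then repeats on
 * the remainder for as long as tcp_snd_test() allows.
 */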
/* This function returns the amount that we can raise the
 * usable window based on the following constraints
 *
 * 1. The window can never be shrunk once it is offered (RFC 793)
 * 2. We limit memory per socket
 *
 * RFC 1122:
 * "the suggested [SWS] avoidance algorithm for the receiver is to keep
 *  RECV.NEXT + RCV.WIN fixed until:
 *  RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)"
 *
 * i.e. don't raise the right edge of the window until you can raise
 * it at least MSS bytes.
 *
 * Unfortunately, the recommended algorithm breaks header prediction,
 * since header prediction assumes th->window stays fixed.