📄 tcp_output.c
字号:
/* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $ * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> *//* * Changes: Pedro Roque : Retransmit queue handled by TCP. * : Fragmentation on mtu decrease * : Segment collapse on retransmit * : AF independence * * Linus Torvalds : send_delayed_ack * David S. Miller : Charge memory using the right skb * during syn/ack processing. * David S. Miller : Output engine completely rewritten. * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. * Cacophonix Gaul : draft-minshall-nagle-01 * J Hadi Salim : ECN support * */#include <net/tcp.h>#include <linux/compiler.h>#include <linux/module.h>/* People can turn this off for buggy TCP's found in printers etc. */int sysctl_tcp_retrans_collapse __read_mostly = 1;/* People can turn this on to work with those rare, broken TCPs that * interpret the window field as a signed quantity. */int sysctl_tcp_workaround_signed_windows __read_mostly = 0;/* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. */int sysctl_tcp_tso_win_divisor __read_mostly = 3;int sysctl_tcp_mtu_probing __read_mostly = 0;int sysctl_tcp_base_mss __read_mostly = 512;/* By default, RFC2861 behavior. */int sysctl_tcp_slow_start_after_idle __read_mostly = 1;static inline void tcp_packets_out_inc(struct sock *sk, const struct sk_buff *skb){ struct tcp_sock *tp = tcp_sk(sk); int orig = tp->packets_out; tp->packets_out += tcp_skb_pcount(skb); if (!orig) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX);}static void update_send_head(struct sock *sk, struct sk_buff *skb){ struct tcp_sock *tp = tcp_sk(sk); tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; tcp_packets_out_inc(sk, skb);}/* SND.NXT, if window was not shrunk. * If window has been shrunk, what should we make? It is not clear at all. * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( * Anything in between SND.UNA...SND.UNA+SND.WND also can be already * invalid. OK, let's make this for now: */static inline __u32 tcp_acceptable_seq(struct sock *sk){ struct tcp_sock *tp = tcp_sk(sk); if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) return tp->snd_nxt; else return tp->snd_una+tp->snd_wnd;}/* Calculate mss to advertise in SYN segment. * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: * * 1. It is independent of path mtu. * 2. Ideally, it is maximal possible segment size i.e. 65535-40. * 3. For IPv4 it is reasonable to calculate it from maximal MTU of * attached devices, because some buggy hosts are confused by * large MSS. * 4. We do not make 3, we advertise MSS, calculated from first * hop device mtu, but allow to raise it to ip_rt_min_advmss. * This may be overridden via information stored in routing table. * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, * probably even Jumbo". */static __u16 tcp_advertise_mss(struct sock *sk){ struct tcp_sock *tp = tcp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); int mss = tp->advmss; if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { mss = dst_metric(dst, RTAX_ADVMSS); tp->advmss = mss; } return (__u16)mss;}/* RFC2861. Reset CWND after idle period longer RTO to "restart window". * This is the first part of cwnd validation mechanism. */static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst){ struct tcp_sock *tp = tcp_sk(sk); s32 delta = tcp_time_stamp - tp->lsndtime; u32 restart_cwnd = tcp_init_cwnd(tp, dst); u32 cwnd = tp->snd_cwnd; tcp_ca_event(sk, CA_EVENT_CWND_RESTART); tp->snd_ssthresh = tcp_current_ssthresh(sk); restart_cwnd = min(restart_cwnd, cwnd); while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; tp->snd_cwnd = max(cwnd, restart_cwnd); tp->snd_cwnd_stamp = tcp_time_stamp; tp->snd_cwnd_used = 0;}static void tcp_event_data_sent(struct tcp_sock *tp, struct sk_buff *skb, struct sock *sk){ struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; if (sysctl_tcp_slow_start_after_idle && (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) tcp_cwnd_restart(sk, __sk_dst_get(sk)); tp->lsndtime = now; /* If it is a reply for ato after last received * packet, enter pingpong mode. */ if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato) icsk->icsk_ack.pingpong = 1;}static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts){ tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);}/* Determine a window scaling and initial window to offer. * Based on the assumption that the given amount of space * will be offered. Store the results in the tp structure. * NOTE: for smooth operation initial space offering should * be a multiple of mss if possible. We assume here that mss >= 1. * This MUST be enforced by all callers. */void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale){ unsigned int space = (__space < 0 ? 0 : __space); /* If no clamp set the clamp to the max possible scaled window */ if (*window_clamp == 0) (*window_clamp) = (65535 << 14); space = min(*window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) space = (space / mss) * mss; /* NOTE: offering an initial window larger than 32767 * will break some buggy TCP stacks. If the admin tells us * it is likely we could be speaking with such a buggy stack * we will truncate our initial window offering to 32K-1 * unless the remote has sent us a window scaling option, * which we interpret as a sign the remote TCP is not * misinterpreting the window field as a signed quantity. */ if (sysctl_tcp_workaround_signed_windows) (*rcv_wnd) = min(space, MAX_TCP_WINDOW); else (*rcv_wnd) = space; (*rcv_wscale) = 0; if (wscale_ok) { /* Set window scaling on max possible window * See RFC1323 for an explanation of the limit to 14 */ space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max); space = min_t(u32, space, *window_clamp); while (space > 65535 && (*rcv_wscale) < 14) { space >>= 1; (*rcv_wscale)++; } } /* Set initial window to value enough for senders, * following RFC2414. Senders, not following this RFC, * will be satisfied with 2. */ if (mss > (1<<*rcv_wscale)) { int init_cwnd = 4; if (mss > 1460*3) init_cwnd = 2; else if (mss > 1460) init_cwnd = 3; if (*rcv_wnd > init_cwnd*mss) *rcv_wnd = init_cwnd*mss; } /* Set the clamp no higher than max representable value */ (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);}/* Chose a new window to advertise, update state in tcp_sock for the * socket, and return result with RFC1323 scaling applied. The return * value can be stuffed directly into th->window for an outgoing * frame. */static u16 tcp_select_window(struct sock *sk){ struct tcp_sock *tp = tcp_sk(sk); u32 cur_win = tcp_receive_window(tp); u32 new_win = __tcp_select_window(sk); /* Never shrink the offered window */ if (new_win < cur_win) { /* Danger Will Robinson! * Don't update rcv_wup/rcv_wnd here or else * we will not be able to advertise a zero * window in time. --DaveM * * Relax Will Robinson. */ new_win = cur_win; } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; /* Make sure we do not exceed the maximum possible * scaled window. */ if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows) new_win = min(new_win, MAX_TCP_WINDOW); else new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; /* If we advertise zero window, disable fast path. */ if (new_win == 0) tp->pred_flags = 0; return new_win;}static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb){ TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; if (!(tp->ecn_flags&TCP_ECN_OK)) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;}static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb){ struct tcp_sock *tp = tcp_sk(sk); tp->ecn_flags = 0; if (sysctl_tcp_ecn) { TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR; tp->ecn_flags = TCP_ECN_OK; }}static __inline__ voidTCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th){ if (inet_rsk(req)->ecn_ok) th->ece = 1;}static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, int tcp_header_len){ struct tcp_sock *tp = tcp_sk(sk); if (tp->ecn_flags & TCP_ECN_OK) { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { INET_ECN_xmit(sk); if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; tcp_hdr(skb)->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } } else { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } if (tp->ecn_flags & TCP_ECN_DEMAND_CWR) tcp_hdr(skb)->ece = 1; }}static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, __u32 tstamp, __u8 **md5_hash){ if (tp->rx_opt.tstamp_ok) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); *ptr++ = htonl(tstamp); *ptr++ = htonl(tp->rx_opt.ts_recent); } if (tp->rx_opt.eff_sacks) { struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; int this_sack; *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK))); for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { *ptr++ = htonl(sp[this_sack].start_seq); *ptr++ = htonl(sp[this_sack].end_seq); } if (tp->rx_opt.dsack) { tp->rx_opt.dsack = 0; tp->rx_opt.eff_sacks--; } }#ifdef CONFIG_TCP_MD5SIG if (md5_hash) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); *md5_hash = (__u8 *)ptr; }#endif}/* Construct a tcp options header for a SYN or SYN_ACK packet. * If this is every changed make sure to change the definition of * MAX_SYN_SIZE to match the new maximum number of options that you * can generate. * * Note - that with the RFC2385 TCP option, we make room for the * 16 byte MD5 hash. This will be filled in later, so the pointer for the * location to be filled is passed back up. */static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, int offer_wscale, int wscale, __u32 tstamp, __u32 ts_recent, __u8 **md5_hash){ /* We always get an MSS option. * The option bytes which will be seen in normal data * packets should timestamps be used, must be in the MSS * advertised. But we subtract them from tp->mss_cache so * that calculations in tcp_sendmsg are simpler etc. * So account for this fact here if necessary. If we * don't do this correctly, as a receiver we won't * recognize data packets as being full sized when we * should, and thus we won't abide by the delayed ACK * rules correctly. * SACKs don't matter, we never delay an ACK when we * have any of those going out. */ *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); if (ts) { if (sack) *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); else *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); *ptr++ = htonl(tstamp); /* TSVAL */ *ptr++ = htonl(ts_recent); /* TSECR */ } else if (sack) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); if (offer_wscale) *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale));#ifdef CONFIG_TCP_MD5SIG /* * If MD5 is enabled, then we set the option, and include the size * (always 18). The actual MD5 hash is added just before the * packet is sent. */ if (md5_hash) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); *md5_hash = (__u8 *) ptr; }#endif}/* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial * transmission and possible later retransmissions. * All SKB's seen here are completely headerless. It is our * job to build the TCP header, and pass the packet down to * IP so it can do the same plus pass the packet off to the * device. *
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -