⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 tcp_output.c

📁 linux 内核源代码
💻 C
📖 第 1 页 / 共 5 页
字号:
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:	Pedro Roque	:	Retransmit queue handled by TCP.
 *				:	Fragmentation on mtu decrease
 *				:	Segment collapse on retransmit
 *				:	AF independence
 *
 *		Linus Torvalds	:	send_delayed_ack
 *		David S. Miller	:	Charge memory using the right skb
 *					during syn/ack processing.
 *		David S. Miller :	Output engine completely rewritten.
 *		Andrea Arcangeli:	SYNACK carry ts_recent in tsecr.
 *		Cacophonix Gaul :	draft-minshall-nagle-01
 *		J Hadi Salim	:	ECN support
 *
 */

#include <net/tcp.h>

#include <linux/compiler.h>
#include <linux/module.h>

/* People can turn this off for buggy TCP's found in printers etc. */
int sysctl_tcp_retrans_collapse __read_mostly = 1;

/* People can turn this on to  work with those rare, broken TCPs that
 * interpret the window field as a signed quantity.
 */
int sysctl_tcp_workaround_signed_windows __read_mostly = 0;

/* This limits the percentage of the congestion window which we
 * will allow a single TSO frame to consume.  Building TSO frames
 * which are too large can cause TCP streams to be bursty.
 */
int sysctl_tcp_tso_win_divisor __read_mostly = 3;

int sysctl_tcp_mtu_probing __read_mostly = 0;
int sysctl_tcp_base_mss __read_mostly = 512;

/* By default, RFC2861 behavior.  */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;

/* Charge newly sent segments against packets_out; if the pipe was
 * previously empty, (re)arm the retransmit timer.
 */
static inline void tcp_packets_out_inc(struct sock *sk,
				       const struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int orig = tp->packets_out;

	tp->packets_out += tcp_skb_pcount(skb);
	if (!orig)
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
}

/* Advance the send head past skb and account it as in flight. */
static void update_send_head(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_advance_send_head(sk, skb);
	tp->snd_nxt = TCP_SKB_CB(skb)->end_seq;
	tcp_packets_out_inc(sk, skb);
}

/* SND.NXT, if window was not shrunk.
 * If window has been shrunk, what should we make? It is not clear at all.
 * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-(
 * Anything in between SND.UNA...SND.UNA+SND.WND also can be already
 * invalid. OK, let's make this for now:
 */
static inline __u32 tcp_acceptable_seq(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt))
		return tp->snd_nxt;
	else
		return tp->snd_una+tp->snd_wnd;
}

/* Calculate mss to advertise in SYN segment.
 * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that:
 *
 * 1. It is independent of path mtu.
 * 2. Ideally, it is maximal possible segment size i.e. 65535-40.
 * 3. For IPv4 it is reasonable to calculate it from maximal MTU of
 *    attached devices, because some buggy hosts are confused by
 *    large MSS.
 * 4. We do not make 3, we advertise MSS, calculated from first
 *    hop device mtu, but allow to raise it to ip_rt_min_advmss.
 *    This may be overridden via information stored in routing table.
 * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible,
 *    probably even Jumbo".
 */
static __u16 tcp_advertise_mss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct dst_entry *dst = __sk_dst_get(sk);
	int mss = tp->advmss;

	/* Clamp the advertised MSS down to the route metric, if smaller. */
	if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) {
		mss = dst_metric(dst, RTAX_ADVMSS);
		tp->advmss = mss;
	}

	return (__u16)mss;
}

/* RFC2861. Reset CWND after idle period longer RTO to "restart window".
 * This is the first part of cwnd validation mechanism.
 */
static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
{
	struct tcp_sock *tp = tcp_sk(sk);
	s32 delta = tcp_time_stamp - tp->lsndtime;
	u32 restart_cwnd = tcp_init_cwnd(tp, dst);
	u32 cwnd = tp->snd_cwnd;

	tcp_ca_event(sk, CA_EVENT_CWND_RESTART);

	tp->snd_ssthresh = tcp_current_ssthresh(sk);
	restart_cwnd = min(restart_cwnd, cwnd);

	/* Halve cwnd once per full RTO spent idle, but never decay
	 * below the restart window.
	 */
	while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
		cwnd >>= 1;
	tp->snd_cwnd = max(cwnd, restart_cwnd);
	tp->snd_cwnd_stamp = tcp_time_stamp;
	tp->snd_cwnd_used = 0;
}

/* Bookkeeping after transmitting a data segment: possibly restart the
 * congestion window after an idle period, and record the send time.
 */
static void tcp_event_data_sent(struct tcp_sock *tp,
				struct sk_buff *skb, struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const u32 now = tcp_time_stamp;

	if (sysctl_tcp_slow_start_after_idle &&
	    (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
		tcp_cwnd_restart(sk, __sk_dst_get(sk));

	tp->lsndtime = now;

	/* If it is a reply for ato after last received
	 * packet, enter pingpong mode.
	 */
	if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
		icsk->icsk_ack.pingpong = 1;
}

/* An ACK went out: leave quickack mode accounting and cancel any
 * pending delayed-ACK timer.
 */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
	tcp_dec_quickack_mode(sk, pkts);
	inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}

/* Determine a window scaling and initial window to offer.
 * Based on the assumption that the given amount of space
 * will be offered. Store the results in the tp structure.
 * NOTE: for smooth operation initial space offering should
 * be a multiple of mss if possible. We assume here that mss >= 1.
 * This MUST be enforced by all callers.
 */
void tcp_select_initial_window(int __space, __u32 mss,
			       __u32 *rcv_wnd, __u32 *window_clamp,
			       int wscale_ok, __u8 *rcv_wscale)
{
	unsigned int space = (__space < 0 ? 0 : __space);

	/* If no clamp set the clamp to the max possible scaled window */
	if (*window_clamp == 0)
		(*window_clamp) = (65535 << 14);
	space = min(*window_clamp, space);

	/* Quantize space offering to a multiple of mss if possible. */
	if (space > mss)
		space = (space / mss) * mss;

	/* NOTE: offering an initial window larger than 32767
	 * will break some buggy TCP stacks. If the admin tells us
	 * it is likely we could be speaking with such a buggy stack
	 * we will truncate our initial window offering to 32K-1
	 * unless the remote has sent us a window scaling option,
	 * which we interpret as a sign the remote TCP is not
	 * misinterpreting the window field as a signed quantity.
	 */
	if (sysctl_tcp_workaround_signed_windows)
		(*rcv_wnd) = min(space, MAX_TCP_WINDOW);
	else
		(*rcv_wnd) = space;

	(*rcv_wscale) = 0;
	if (wscale_ok) {
		/* Set window scaling on max possible window
		 * See RFC1323 for an explanation of the limit to 14
		 */
		space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
		space = min_t(u32, space, *window_clamp);
		while (space > 65535 && (*rcv_wscale) < 14) {
			space >>= 1;
			(*rcv_wscale)++;
		}
	}

	/* Set initial window to value enough for senders,
	 * following RFC2414. Senders, not following this RFC,
	 * will be satisfied with 2.
	 */
	if (mss > (1<<*rcv_wscale)) {
		int init_cwnd = 4;
		if (mss > 1460*3)
			init_cwnd = 2;
		else if (mss > 1460)
			init_cwnd = 3;
		if (*rcv_wnd > init_cwnd*mss)
			*rcv_wnd = init_cwnd*mss;
	}

	/* Set the clamp no higher than max representable value */
	(*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp);
}

/* Chose a new window to advertise, update state in tcp_sock for the
 * socket, and return result with RFC1323 scaling applied.  The return
 * value can be stuffed directly into th->window for an outgoing
 * frame.
 */
static u16 tcp_select_window(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 cur_win = tcp_receive_window(tp);
	u32 new_win = __tcp_select_window(sk);

	/* Never shrink the offered window */
	if (new_win < cur_win) {
		/* Danger Will Robinson!
		 * Don't update rcv_wup/rcv_wnd here or else
		 * we will not be able to advertise a zero
		 * window in time.  --DaveM
		 *
		 * Relax Will Robinson.
		 */
		new_win = cur_win;
	}
	tp->rcv_wnd = new_win;
	tp->rcv_wup = tp->rcv_nxt;

	/* Make sure we do not exceed the maximum possible
	 * scaled window.
	 */
	if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
		new_win = min(new_win, MAX_TCP_WINDOW);
	else
		new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));

	/* RFC1323 scaling applied */
	new_win >>= tp->rx_opt.rcv_wscale;

	/* If we advertise zero window, disable fast path. */
	if (new_win == 0)
		tp->pred_flags = 0;

	return new_win;
}

/* On a SYN-ACK, clear CWR and drop ECE unless ECN was negotiated. */
static inline void TCP_ECN_send_synack(struct tcp_sock *tp,
				       struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
	if (!(tp->ecn_flags&TCP_ECN_OK))
		TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
}

/* On an outgoing SYN, offer ECN (ECE|CWR set) if the sysctl allows it. */
static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tp->ecn_flags = 0;
	if (sysctl_tcp_ecn) {
		TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR;
		tp->ecn_flags = TCP_ECN_OK;
	}
}

static __inline__ void
TCP_ECN_make_synack(struct request_sock *req, struct tcphdr *th)
{
	if (inet_rsk(req)->ecn_ok)
		th->ece = 1;
}

/* Set ECN-related header bits on an outgoing segment when ECN is
 * negotiated for this connection.
 */
static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
				int tcp_header_len)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (tp->ecn_flags & TCP_ECN_OK) {
		/* Not-retransmitted data segment: set ECT and inject CWR. */
		if (skb->len != tcp_header_len &&
		    !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
			INET_ECN_xmit(sk);
			if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) {
				tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
				tcp_hdr(skb)->cwr = 1;
				skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
			}
		} else {
			/* ACK or retransmitted segment: clear ECT|CE */
			INET_ECN_dontxmit(sk);
		}
		if (tp->ecn_flags & TCP_ECN_DEMAND_CWR)
			tcp_hdr(skb)->ece = 1;
	}
}

/* Write TCP options (timestamps, SACK blocks, MD5 placeholder) for an
 * established-state segment into the option area at ptr.
 */
static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp,
					 __u32 tstamp, __u8 **md5_hash)
{
	if (tp->rx_opt.tstamp_ok) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_TIMESTAMP << 8) |
			       TCPOLEN_TIMESTAMP);
		*ptr++ = htonl(tstamp);
		*ptr++ = htonl(tp->rx_opt.ts_recent);
	}
	if (tp->rx_opt.eff_sacks) {
		struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
		int this_sack;

		*ptr++ = htonl((TCPOPT_NOP  << 24) |
			       (TCPOPT_NOP  << 16) |
			       (TCPOPT_SACK <<  8) |
			       (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
						     TCPOLEN_SACK_PERBLOCK)));
		for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
			*ptr++ = htonl(sp[this_sack].start_seq);
			*ptr++ = htonl(sp[this_sack].end_seq);
		}
		/* A D-SACK is sent only once; drop it after it goes out. */
		if (tp->rx_opt.dsack) {
			tp->rx_opt.dsack = 0;
			tp->rx_opt.eff_sacks--;
		}
	}
#ifdef CONFIG_TCP_MD5SIG
	if (md5_hash) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_MD5SIG << 8) |
			       TCPOLEN_MD5SIG);
		*md5_hash = (__u8 *)ptr;
	}
#endif
}

/* Construct a tcp options header for a SYN or SYN_ACK packet.
 * If this is every changed make sure to change the definition of
 * MAX_SYN_SIZE to match the new maximum number of options that you
 * can generate.
 *
 * Note - that with the RFC2385 TCP option, we make room for the
 * 16 byte MD5 hash. This will be filled in later, so the pointer for the
 * location to be filled is passed back up.
 */
static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
				  int offer_wscale, int wscale, __u32 tstamp,
				  __u32 ts_recent, __u8 **md5_hash)
{
	/* We always get an MSS option.
	 * The option bytes which will be seen in normal data
	 * packets should timestamps be used, must be in the MSS
	 * advertised.  But we subtract them from tp->mss_cache so
	 * that calculations in tcp_sendmsg are simpler etc.
	 * So account for this fact here if necessary.  If we
	 * don't do this correctly, as a receiver we won't
	 * recognize data packets as being full sized when we
	 * should, and thus we won't abide by the delayed ACK
	 * rules correctly.
	 * SACKs don't matter, we never delay an ACK when we
	 * have any of those going out.
	 */
	*ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
	if (ts) {
		if (sack)
			*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
				       (TCPOLEN_SACK_PERM << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
		else
			*ptr++ = htonl((TCPOPT_NOP << 24) |
				       (TCPOPT_NOP << 16) |
				       (TCPOPT_TIMESTAMP << 8) |
				       TCPOLEN_TIMESTAMP);
		*ptr++ = htonl(tstamp);		/* TSVAL */
		*ptr++ = htonl(ts_recent);	/* TSECR */
	} else if (sack)
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_SACK_PERM << 8) |
			       TCPOLEN_SACK_PERM);
	if (offer_wscale)
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_WINDOW << 16) |
			       (TCPOLEN_WINDOW << 8) |
			       (wscale));
#ifdef CONFIG_TCP_MD5SIG
	/*
	 * If MD5 is enabled, then we set the option, and include the size
	 * (always 18). The actual MD5 hash is added just before the
	 * packet is sent.
	 */
	if (md5_hash) {
		*ptr++ = htonl((TCPOPT_NOP << 24) |
			       (TCPOPT_NOP << 16) |
			       (TCPOPT_MD5SIG << 8) |
			       TCPOLEN_MD5SIG);
		*md5_hash = (__u8 *) ptr;
	}
#endif
}

/* This routine actually transmits TCP packets queued in by
 * tcp_do_sendmsg().  This is used by both the initial
 * transmission and possible later retransmissions.
 * All SKB's seen here are completely headerless.  It is our
 * job to build the TCP header, and pass the packet down to
 * IP so it can do the same plus pass the packet off to the
 * device.
 *

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -