tcp_input.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_input.c,v 1.164.2.8 1999/09/23 19:21:23 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */

/*
 * Changes:
 *		Pedro Roque	:	Fast Retransmit/Recovery.
 *					Two receive queues.
 *					Retransmit queue handled by TCP.
 *					Better retransmit timer handling.
 *					New congestion avoidance.
 *					Header prediction.
 *					Variable renaming.
 *
 *		Eric		:	Fast Retransmit.
 *		Randy Scott	:	MSS option defines.
 *		Eric Schenk	:	Fixes to slow start algorithm.
 *		Eric Schenk	:	Yet another double ACK bug.
 *		Eric Schenk	:	Delayed ACK bug fixes.
 *		Eric Schenk	:	Floyd style fast retrans war avoidance.
 *		David S. Miller	:	Don't allow zero congestion window.
 *		Eric Schenk	:	Fix retransmitter so that it sends
 *					next packet on ack of previous packet.
 *		Andi Kleen	:	Moved open_request checking here
 *					and process RSTs for open_requests.
 *		Andi Kleen	:	Better prune_queue, and other fixes.
 *		Andrey Savochkin:	Fix RTT measurements in the presence of
 *					timestamps.
 *		Andrey Savochkin:	Check sequence numbers correctly when
 *					removing SACKs due to in sequence incoming
 *					data segments.
 *		Andi Kleen:		Make sure we never ack data there is not
 *					enough room for. Also make this condition
 *					a fatal error if it might still happen.
 *		Andi Kleen:		Add tcp_measure_rcv_mss to make
 *					connections with MSS<min(MTU,ann. MSS)
 *					work without delayed acks.
 *		Andi Kleen:		Process packets with PSH set in the
 *					fast path.
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/sysctl.h>
#include <net/tcp.h>
#include <linux/ipsec.h>

#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
#define SYNC_INIT 1
#endif

extern int sysctl_tcp_fin_timeout;

/* These are on by default so the code paths get tested.
 * For the final 2.2 this may be undone at our discretion. -DaveM
 */
int sysctl_tcp_timestamps = 1;
int sysctl_tcp_window_scaling = 1;
int sysctl_tcp_sack = 1;
int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_stdurg;
int sysctl_tcp_rfc1337;

static int prune_queue(struct sock *sk);

/* There is something which you must keep in mind when you analyze the
 * behavior of the tp->ato delayed ack timeout interval.  When a
 * connection starts up, we want to ack as quickly as possible.  The
 * problem is that "good" TCP's do slow start at the beginning of data
 * transmission.  This means that until we send the first few ACK's the
 * sender will sit on his end and only queue most of his data, because
 * he can only send snd_cwnd unacked packets at any given time.  For
 * each ACK we send, he increments snd_cwnd and transmits more of his
 * queue.  -DaveM
 */
static void tcp_delack_estimator(struct tcp_opt *tp)
{
	if (tp->ato == 0) {
		tp->lrcvtime = tcp_time_stamp;

		/* Help sender leave slow start quickly,
		 * and also makes sure we do not take this
		 * branch ever again for this connection.
		 */
		tp->ato = 1;
		tcp_enter_quickack_mode(tp);
	} else {
		int m = tcp_time_stamp - tp->lrcvtime;

		tp->lrcvtime = tcp_time_stamp;
		if (m <= 0)
			m = 1;
		if (m > tp->rto)
			tp->ato = tp->rto;
		else {
			/* This funny shift makes sure we
			 * clear the "quick ack mode" bit.
			 */
			tp->ato = ((tp->ato << 1) >> 2) + m;
		}
	}
}

/*
 * Remember to send an ACK later.
 */
static __inline__ void tcp_remember_ack(struct tcp_opt *tp, struct tcphdr *th,
					struct sk_buff *skb)
{
	tp->delayed_acks++;

	/* Tiny-grams with PSH set artificially deflate our
	 * ato measurement, but with a lower bound.
	 */
	if (th->psh && (skb->len < (tp->mss_cache >> 1))) {
		/* Preserve the quickack state. */
		if ((tp->ato & 0x7fffffff) > HZ/50)
			tp->ato = ((tp->ato & 0x80000000) | (HZ/50));
	}
}

/* Called to compute a smoothed rtt estimate. The data fed to this
 * routine either comes from timestamps, or from segments that were
 * known _not_ to have been retransmitted [see Karn/Partridge
 * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88
 * piece by Van Jacobson.
 * NOTE: the next three routines used to be one big routine.
 * To save cycles in the RFC 1323 implementation it was better to break
 * it up into three procedures. -- erics
 */
static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
{
	long m = mrtt; /* RTT */

	/* The following amusing code comes from Jacobson's
	 * article in SIGCOMM '88.  Note that rtt and mdev
	 * are scaled versions of rtt and mean deviation.
	 * This is designed to be as fast as possible.
	 * m stands for "measurement".
	 *
	 * In a 1990 paper the rto value is changed to:
	 * RTO = rtt + 4 * mdev
	 */
	if (m == 0)
		m = 1;
	if (tp->srtt != 0) {
		m -= (tp->srtt >> 3);	/* m is now error in rtt est */
		tp->srtt += m;		/* rtt = 7/8 rtt + 1/8 new */
		if (m < 0)
			m = -m;		/* m is now abs(error) */
		m -= (tp->mdev >> 2);	/* similar update on mdev */
		tp->mdev += m;		/* mdev = 3/4 mdev + 1/4 new */
	} else {
		/* no previous measure. */
		tp->srtt = m << 3;	/* take the measured time to be rtt */
		tp->mdev = m << 2;	/* make sure rto = 3*rtt */
	}
}

/* Calculate rto without backoff.  This is the second half of Van Jacobson's
 * routine referred to above.
 */
static __inline__ void tcp_set_rto(struct tcp_opt *tp)
{
	tp->rto = (tp->srtt >> 3) + tp->mdev;
	tp->rto += (tp->rto >> 2) + (tp->rto >> (tp->snd_cwnd-1));
}

/* Keep the rto between HZ/5 and 120*HZ. 120*HZ is the upper bound
 * on packet lifetime in the internet.  We need the HZ/5 lower
 * bound to behave correctly against BSD stacks with a fixed
 * delayed ack.
 * FIXME: It's not entirely clear this lower bound is the best
 * way to avoid the problem. Is it possible to drop the lower
 * bound and still avoid trouble with BSD stacks? Perhaps
 * some modification to the RTO calculation that takes delayed
 * ack bias into account? This needs serious thought. -- erics
 */
static __inline__ void tcp_bound_rto(struct tcp_opt *tp)
{
	if (tp->rto > 120*HZ)
		tp->rto = 120*HZ;
	if (tp->rto < HZ/5)
		tp->rto = HZ/5;
}

/* WARNING: this must not be called if tp->saw_timestamp was false. */
extern __inline__ void tcp_replace_ts_recent(struct sock *sk, struct tcp_opt *tp,
					     __u32 start_seq, __u32 end_seq)
{
	/* It is start_seq <= last_ack_seq combined with an in-window check.
	 * If start_seq <= last_ack_seq <= rcv_nxt, then the segment is in
	 * window if end_seq >= rcv_nxt.
	 */
	if (!after(start_seq, tp->last_ack_sent) &&
	    !before(end_seq, tp->rcv_nxt)) {
		/* PAWS bug workaround wrt. ACK frames, the PAWS discard
		 * extra check below makes sure this can only happen
		 * for pure ACK frames.  -DaveM
		 *
		 * Plus: expired timestamps.
		 *
		 * Plus: resets failing PAWS.
		 */
		if ((s32)(tp->rcv_tsval - tp->ts_recent) >= 0) {
			tp->ts_recent = tp->rcv_tsval;
			tp->ts_recent_stamp = tcp_time_stamp;
		}
	}
}

#define PAWS_24DAYS	(HZ * 60 * 60 * 24 * 24)

extern __inline__ int tcp_paws_discard(struct tcp_opt *tp, struct tcphdr *th, unsigned len)
{
	return ((s32)(tp->rcv_tsval - tp->ts_recent) < 0 &&
		(s32)(tcp_time_stamp - tp->ts_recent_stamp) < PAWS_24DAYS &&
		/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM */
		len != (th->doff * 4));
}

static int __tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
{
	u32 end_window = tp->rcv_wup + tp->rcv_wnd;

	if (tp->rcv_wnd &&
	    after(end_seq, tp->rcv_nxt) &&
	    before(seq, end_window))
		return 1;
	if (seq != end_window)
		return 0;
	return (seq == end_seq);
}

/* This function checks to see if the tcp header is actually acceptable. */
extern __inline__ int tcp_sequence(struct tcp_opt *tp, u32 seq, u32 end_seq)
{
	if (seq == tp->rcv_nxt)
		return (tp->rcv_wnd || (end_seq == seq));

	return __tcp_sequence(tp, seq, end_seq);
}

/* When we get a reset we do this. */
static void tcp_reset(struct sock *sk)
{
	sk->zapped = 1;

	/* We want the right error as BSD sees it (and indeed as we do). */
	switch (sk->state) {
	case TCP_SYN_SENT:
		sk->err = ECONNREFUSED;
		break;
	case TCP_CLOSE_WAIT:
		sk->err = EPIPE;
		break;
	default:
		sk->err = ECONNRESET;
	};
	tcp_set_state(sk, TCP_CLOSE);
	sk->shutdown = SHUTDOWN_MASK;
	if (!sk->dead)
		sk->state_change(sk);
}

/* This tags the retransmission queue when SACKs arrive. */
static void tcp_sacktag_write_queue(struct sock *sk, struct tcp_sack_block *sp, int nsacks)
{
	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
	int i = nsacks;

	while (i--) {
		struct sk_buff *skb = skb_peek(&sk->write_queue);
		__u32 start_seq = ntohl(sp->start_seq);
		__u32 end_seq = ntohl(sp->end_seq);
		int fack_count = 0;

		while ((skb != NULL) &&
		       (skb != tp->send_head) &&
		       (skb != (struct sk_buff *)&sk->write_queue)) {
			/* The retransmission queue is always in order, so
			 * we can short-circuit the walk early.
			 */
			if (after(TCP_SKB_CB(skb)->seq, end_seq))
				break;

			/* We play conservative, we don't allow SACKS to partially
			 * tag a sequence space.
			 */
			fack_count++;
			if (!after(start_seq, TCP_SKB_CB(skb)->seq) &&
			    !before(end_seq, TCP_SKB_CB(skb)->end_seq)) {
				/* If this was a retransmitted frame, account for it. */
				if ((TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) &&
				    tp->retrans_out)
					tp->retrans_out--;
				TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;

				/* RULE: All new SACKs will either decrease retrans_out
				 *       or advance fackets_out.
				 */
				if (fack_count > tp->fackets_out)
					tp->fackets_out = fack_count;
			}
			skb = skb->next;
		}
		sp++; /* Move on to the next SACK block. */
	}
}

/* Look for tcp options. Normally only called on SYN and SYNACK packets.
 * But, this can also be called on packets in the established flow when
 * the fast version below fails.
 */
void tcp_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp, int no_fancy)
{
	unsigned char *ptr;
	int length = (th->doff*4) - sizeof(struct tcphdr);
	int saw_mss = 0;

	ptr = (unsigned char *)(th + 1);
	tp->saw_tstamp = 0;

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return;
		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2)	/* "silly options" */
				return;
			if (opsize > length)
				break;	/* don't parse partial options */
			switch (opcode) {
			case TCPOPT_MSS:
				if (opsize == TCPOLEN_MSS && th->syn) {
					u16 in_mss = ntohs(*(__u16 *)ptr);
					if (in_mss == 0)
						in_mss = 536;
					if (tp->mss_clamp > in_mss)
						tp->mss_clamp = in_mss;
					saw_mss = 1;
				}
				break;
			case TCPOPT_WINDOW:
				if (opsize == TCPOLEN_WINDOW && th->syn)
					if (!no_fancy && sysctl_tcp_window_scaling) {
						tp->wscale_ok = 1;
						tp->snd_wscale = *(__u8 *)ptr;
						if (tp->snd_wscale > 14) {
							if (net_ratelimit())
								printk("tcp_parse_options: Illegal window "
								       "scaling value %d >14 received.",
								       tp->snd_wscale);
							tp->snd_wscale = 14;
						}
					}
				break;
			case TCPOPT_TIMESTAMP:
				if (opsize == TCPOLEN_TIMESTAMP) {
					if (sysctl_tcp_timestamps && !no_fancy) {
						tp->tstamp_ok = 1;
						tp->saw_tstamp = 1;
						tp->rcv_tsval = ntohl(*(__u32 *)ptr);
						tp->rcv_tsecr = ntohl(*(__u32 *)(ptr+4));
					}
				}
				break;
			case TCPOPT_SACK_PERM:
				if (opsize == TCPOLEN_SACK_PERM && th->syn) {
					if (sysctl_tcp_sack && !no_fancy) {
						tp->sack_ok = 1;
						tp->num_sacks = 0;
					}
				}
				break;
			case TCPOPT_SACK:
				if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) &&
				    sysctl_tcp_sack && (sk != NULL) && !th->syn) {
					int sack_bytes = opsize - TCPOLEN_SACK_BASE;

					if (!(sack_bytes % TCPOLEN_SACK_PERBLOCK)) {
						int num_sacks = sack_bytes >> 3;
						struct tcp_sack_block *sackp;

						sackp = (struct tcp_sack_block *)ptr;
						tcp_sacktag_write_queue(sk, sackp, num_sacks);
					}
				}
			};
			ptr += opsize-2;
			length -= opsize;
		};
	}
	if (th->syn && saw_mss == 0)
		tp->mss_clamp = 536;
}

/* Fast parse options. This hopes to only see timestamps.
 * If it is wrong it falls back on tcp_parse_options().
 */
static __inline__ int tcp_fast_parse_options(struct sock *sk, struct tcphdr *th, struct tcp_opt *tp)
{
	/* If we didn't send out any options ignore them all. */
	if (tp->tcp_header_len == sizeof(struct tcphdr))
		return 0;
	if (th->doff == sizeof(struct tcphdr)>>2) {
		tp->saw_tstamp = 0;
		return 0;
	} else if (th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
		__u32 *ptr = (__u32 *)(th + 1);
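
/* Illustrative, standalone user-space sketch (not part of this file or of
 * the kernel build): it mirrors the fixed-point arithmetic used by
 * tcp_rtt_estimator() and tcp_set_rto() above, where srtt is stored scaled
 * by 8 and mdev scaled by 4, so that (srtt >> 3) + mdev approximates
 * RTT + 4 * mean deviation.  It deliberately omits the extra snd_cwnd term
 * and the HZ/5..120*HZ clamp; the tick values in main() are made up.
 */
#if 0
#include <stdio.h>

struct rtt_state { long srtt; long mdev; long rto; };

static void demo_rtt_sample(struct rtt_state *s, long m)
{
	if (m == 0)
		m = 1;
	if (s->srtt != 0) {
		m -= (s->srtt >> 3);	/* error vs. current estimate */
		s->srtt += m;		/* srtt = 7/8 srtt + 1/8 new   */
		if (m < 0)
			m = -m;		/* absolute error              */
		m -= (s->mdev >> 2);
		s->mdev += m;		/* mdev = 3/4 mdev + 1/4 |err| */
	} else {
		s->srtt = m << 3;	/* first sample seeds the estimate */
		s->mdev = m << 2;	/* so that initial rto = 3*rtt     */
	}
	s->rto = (s->srtt >> 3) + s->mdev;	/* ~ RTT + 4*mdev */
}

int main(void)
{
	struct rtt_state s = { 0, 0, 0 };
	long samples[] = { 100, 120, 80, 300 };	/* hypothetical ticks */
	int i;

	for (i = 0; i < 4; i++) {
		demo_rtt_sample(&s, samples[i]);
		printf("m=%3ld  srtt=%3ld  mdev/4=%3ld  rto=%3ld\n",
		       samples[i], s.srtt >> 3, s.mdev >> 2, s.rto);
	}
	return 0;
}
#endif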
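
/* A second standalone sketch (again, not kernel code): the PAWS test in
 * tcp_paws_discard() and the before()/after() helpers used throughout this
 * file compare 32-bit values via a signed difference, (s32)(a - b) < 0, so
 * ordering stays correct across sequence-number or timestamp wraparound as
 * long as the two values are within 2^31 of each other.  The numbers below
 * are arbitrary demonstration values.
 */
#if 0
#include <stdio.h>
#include <stdint.h>

static int demo_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;	/* 1 if a precedes b, modulo 2^32 */
}

int main(void)
{
	uint32_t near_wrap = 0xfffffff0u;
	uint32_t wrapped   = 0x00000010u;	/* 0x20 later, past the wrap */

	/* Plain "<" would get this backwards; serial arithmetic does not. */
	printf("before(near_wrap, wrapped) = %d\n",
	       demo_before(near_wrap, wrapped));	/* prints 1 */
	printf("before(wrapped, near_wrap) = %d\n",
	       demo_before(wrapped, near_wrap));	/* prints 0 */
	return 0;
}
#endif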
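
/* A final standalone sketch (not kernel code): the kind/length walk that
 * tcp_parse_options() performs over the bytes between the fixed header and
 * doff*4.  EOL ends parsing, NOP is a one-byte pad, and every other option
 * carries a length byte that covers the kind and length bytes themselves.
 * The option names and the hand-built option block are demonstration
 * values only.
 */
#if 0
#include <stdio.h>

#define DEMO_TCPOPT_EOL	0
#define DEMO_TCPOPT_NOP	1
#define DEMO_TCPOPT_MSS	2

static void demo_walk_options(const unsigned char *ptr, int length)
{
	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case DEMO_TCPOPT_EOL:
			return;
		case DEMO_TCPOPT_NOP:
			length--;
			continue;
		default:
			opsize = *ptr++;
			if (opsize < 2)		/* length must cover kind+len */
				return;
			if (opsize > length)	/* truncated option */
				return;
			if (opcode == DEMO_TCPOPT_MSS && opsize == 4)
				printf("MSS = %u\n",
				       (unsigned)((ptr[0] << 8) | ptr[1]));
			ptr += opsize - 2;
			length -= opsize;
		}
	}
}

int main(void)
{
	/* NOP, NOP, MSS(1460), EOL */
	unsigned char opts[] = { 1, 1, 2, 4, 0x05, 0xb4, 0 };

	demo_walk_options(opts, sizeof(opts));
	return 0;
}
#endif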