📄 tcp_ipv4.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.175.2.13 1999/11/16 06:33:53 davem Exp $
 *
 *		IPv4 specific functions
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *	David S. Miller		:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *	David S. Miller		:	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *	Andi Kleen		:	Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *	Andi Kleen		:	Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *	Mike McLagan		:	Routing by source
 *	Juan Jose Ciarlante	:	ip_dynaddr bits
 *	Andi Kleen		:	various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/ipsec.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>

#include <asm/segment.h>

#include <linux/inet.h>
#include <linux/stddef.h>

extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;
extern int sysctl_tcp_syncookies;
extern int sysctl_ip_dynaddr;
extern __u32 sysctl_wmem_max;
extern __u32 sysctl_rmem_max;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
struct inode tcp_inode;
struct socket *tcp_socket = &tcp_inode.u.socket_i;

static void tcp_v4_send_reset(struct sk_buff *skb);

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

/* This is for sockets with full identity only.  Sockets here will always
 * be without wildcards and will have the following invariant:
 *
 *	TCP_ESTABLISHED <= sk->state < TCP_CLOSE
 *
 * First half of the table is for sockets not in TIME_WAIT, second half
 * is for TIME_WAIT sockets only.
 */
unsigned int tcp_ehash_size;
struct sock **tcp_ehash;

/* Ok, let's try this, I give up, we do need a local binding
 * TCP hash as well as the others for fast bind/connect.
 */
unsigned int tcp_bhash_size;
struct tcp_bind_bucket **tcp_bhash;

/* All sockets in TCP_LISTEN state will be in here.  This is the only table
 * where wildcard'd TCP sockets can exist.  Hash function here is just local
 * port number.
 */
struct sock *tcp_listening_hash[TCP_LHTABLE_SIZE];

/* Register cache. */
struct sock *tcp_regs[TCP_NUM_REGS];
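/*
 * Editor's sketch, not part of the original file: the comment above says
 * live sockets hash into the first half of tcp_ehash and TIME_WAIT
 * sockets into the second half.  The standalone userspace model below
 * (all demo_* names are hypothetical) shows just that indexing rule,
 * which __tcp_v4_lookup() relies on further down when it probes
 * tcp_ehash[hash + (tcp_ehash_size/2)] for TIME_WAIT matches.
 */
#include <stdio.h>

#define DEMO_EHASH_SIZE 16	/* a power of two, like tcp_ehash_size */

/* Live (non-TIME_WAIT) sockets land in slots [0, size/2). */
static unsigned int demo_live_slot(unsigned int hash)
{
	return hash & ((DEMO_EHASH_SIZE / 2) - 1);
}

/* The TIME_WAIT chain for the same hash sits size/2 slots later. */
static unsigned int demo_timewait_slot(unsigned int hash)
{
	return demo_live_slot(hash) + (DEMO_EHASH_SIZE / 2);
}

int main(void)
{
	unsigned int h = 0xdeadbeefu;

	printf("live slot %u, TIME_WAIT slot %u\n",
	       demo_live_slot(h), demo_timewait_slot(h));
	return 0;
}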
/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = (1024 - 1);

static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	return ((laddr ^ lport) ^ (faddr ^ fport)) & ((tcp_ehash_size/2) - 1);
}

static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	__u32 laddr = sk->rcv_saddr;
	__u16 lport = sk->num;
	__u32 faddr = sk->daddr;
	__u16 fport = sk->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}

/* Allocate and initialize a new TCP local port bind bucket.
 * Always runs inside the socket hashing lock.
 */
struct tcp_bind_bucket *tcp_bucket_create(unsigned short snum)
{
	struct tcp_bind_bucket *tb;

	tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
	if (tb != NULL) {
		struct tcp_bind_bucket **head = &tcp_bhash[tcp_bhashfn(snum)];

		tb->port = snum;
		tb->fastreuse = 0;
		tb->owners = NULL;
		if ((tb->next = *head) != NULL)
			tb->next->pprev = &tb->next;
		*head = tb;
		tb->pprev = head;
	}
	return tb;
}

#ifdef CONFIG_IP_TRANSPARENT_PROXY
/* Ensure that the bound bucket for the port exists.
 * Return 0 and bump bucket reference count on success.
 *
 * Must run in a BH atomic section.
 */
static __inline__ int __tcp_bucket_check(unsigned short snum)
{
	struct tcp_bind_bucket *tb;

	tb = tcp_bhash[tcp_bhashfn(snum)];
	for ( ; tb && (tb->port != snum); tb = tb->next)
		;
	if (tb == NULL) {
		if ((tb = tcp_bucket_create(snum)) == NULL)
			return 1;
	}
	return 0;
}
#endif

static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_bucket *tb = (struct tcp_bind_bucket *)sk->prev;

#ifdef CONFIG_IP_TRANSPARENT_PROXY
	if (child->num != sk->num) {
		unsigned short snum = child->num;

		for (tb = tcp_bhash[tcp_bhashfn(snum)];
		     tb && tb->port != snum;
		     tb = tb->next)
			;
		if (tb == NULL)
			tb = (struct tcp_bind_bucket *)sk->prev;
	}
#endif
	if ((child->bind_next = tb->owners) != NULL)
		tb->owners->bind_pprev = &child->bind_next;
	tb->owners = child;
	child->bind_pprev = &tb->owners;
	child->prev = (struct sock *)tb;
}

__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	SOCKHASH_LOCK();
	__tcp_inherit_port(sk, child);
	SOCKHASH_UNLOCK();
}

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_bucket *tb;

	SOCKHASH_LOCK();
	if (snum == 0) {
		int rover = tcp_port_rover;
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;

		do {
			rover++;
			if ((rover < low) || (rover > high))
				rover = low;
			tb = tcp_bhash[tcp_bhashfn(rover)];
			for ( ; tb; tb = tb->next)
				if (tb->port == rover)
					goto next;
			break;
		next:
			;	/* Do nothing. */
		} while (--remaining > 0);
		tcp_port_rover = rover;

		/* Exhausted local port range during search? */
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use. */
		snum = rover;
		tb = NULL;
	} else {
		for (tb = tcp_bhash[tcp_bhashfn(snum)];
		     tb != NULL; tb = tb->next)
			if (tb->port == snum)
				break;
	}
	if (tb != NULL && tb->owners != NULL) {
		if (tb->fastreuse != 0 && sk->reuse != 0) {
			goto success;
		} else {
			struct sock *sk2 = tb->owners;
			int sk_reuse = sk->reuse;

			for ( ; sk2 != NULL; sk2 = sk2->bind_next) {
				if (sk->bound_dev_if == sk2->bound_dev_if) {
					if (!sk_reuse || !sk2->reuse ||
					    sk2->state == TCP_LISTEN) {
						if (!sk2->rcv_saddr ||
						    !sk->rcv_saddr ||
						    (sk2->rcv_saddr ==
						     sk->rcv_saddr))
							break;
					}
				}
			}
			/* If we found a conflict, fail. */
			if (sk2 != NULL)
				goto fail;
		}
	}
	if (tb == NULL &&
	    (tb = tcp_bucket_create(snum)) == NULL)
		goto fail;
	if (tb->owners == NULL) {
		if (sk->reuse && sk->state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
		tb->fastreuse = 0;

success:
	sk->num = snum;
	if ((sk->bind_next = tb->owners) != NULL)
		tb->owners->bind_pprev = &sk->bind_next;
	tb->owners = sk;
	sk->bind_pprev = &tb->owners;
	sk->prev = (struct sock *)tb;
	SOCKHASH_UNLOCK();
	return 0;

fail:
	SOCKHASH_UNLOCK();
	return 1;
}
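/*
 * Editor's sketch, not part of the original file: a userspace model of
 * the rover scan tcp_v4_get_port() uses above when snum == 0.  The
 * demo_* names are hypothetical and demo_port_in_use() stands in for
 * the bind-bucket walk; the wrap-around and range-exhaustion logic
 * mirror the kernel loop.
 */
#include <stdio.h>

static int demo_rover = 1024 - 1;	/* mirrors tcp_port_rover */

/* Pretend the first few ports of the range are already bound. */
static int demo_port_in_use(int port)
{
	return port >= 1024 && port <= 1029;
}

/* Return a free port in [low, high], or -1 if the range is exhausted. */
static int demo_pick_port(int low, int high)
{
	int remaining = (high - low) + 1;
	int rover = demo_rover;

	do {
		rover++;
		if (rover < low || rover > high)
			rover = low;	/* wrap around, as the kernel does */
		if (!demo_port_in_use(rover))
			break;		/* free port: stop without decrementing */
	} while (--remaining > 0);
	demo_rover = rover;		/* the next search resumes here */

	return (remaining <= 0) ? -1 : rover;
}

int main(void)
{
	printf("picked port %d\n", demo_pick_port(1024, 4999));
	return 0;
}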
/* Get rid of any references to a local port held by the
 * given sock.
 */
__inline__ void __tcp_put_port(struct sock *sk)
{
	struct tcp_bind_bucket *tb;

	tb = (struct tcp_bind_bucket *)sk->prev;
	if (sk->bind_next)
		sk->bind_next->bind_pprev = sk->bind_pprev;
	*(sk->bind_pprev) = sk->bind_next;
	sk->prev = NULL;
	if (tb->owners == NULL) {
		if (tb->next)
			tb->next->pprev = tb->pprev;
		*(tb->pprev) = tb->next;
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
}

void tcp_put_port(struct sock *sk)
{
	SOCKHASH_LOCK();
	__tcp_put_port(sk);
	SOCKHASH_UNLOCK();
}

static __inline__ void __tcp_v4_hash(struct sock *sk)
{
	struct sock **skp;

	if (sk->state == TCP_LISTEN)
		skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
	else
		skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))];

	if ((sk->next = *skp) != NULL)
		(*skp)->pprev = &sk->next;
	*skp = sk;
	sk->pprev = skp;
}

static void tcp_v4_hash(struct sock *sk)
{
	if (sk->state != TCP_CLOSE) {
		SOCKHASH_LOCK();
		__tcp_v4_hash(sk);
		SOCKHASH_UNLOCK();
	}
}

static void tcp_v4_unhash(struct sock *sk)
{
	SOCKHASH_LOCK();
	if (sk->pprev) {
		if (sk->next)
			sk->next->pprev = sk->pprev;
		*sk->pprev = sk->next;
		sk->pprev = NULL;
		tcp_reg_zap(sk);
		__tcp_put_port(sk);
	}
	SOCKHASH_UNLOCK();
}

/* Don't inline this cruft.  Here are some nice properties to
 * exploit here.  The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection.  So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
					   int dif)
{
	struct sock *sk;
	struct sock *result = NULL;
	int score, hiscore;

	hiscore = 0;
	for (sk = tcp_listening_hash[tcp_lhashfn(hnum)]; sk; sk = sk->next) {
		if (sk->num == hnum) {
			__u32 rcv_saddr = sk->rcv_saddr;

			score = 1;
			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score++;
			}
			if (sk->bound_dev_if) {
				if (sk->bound_dev_if != dif)
					continue;
				score++;
			}
			if (score == 3)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result = sk;
			}
		}
	}
	return result;
}
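/*
 * Editor's sketch, not part of the original file: the scoring rule in
 * tcp_v4_lookup_listener() above, extracted into a standalone userspace
 * function.  A listener starts at score 1 for the port match, earns a
 * point for a matching bound address and another for a matching bound
 * device, and a perfect score of 3 ends the search immediately.  The
 * demo_listener struct is hypothetical.
 */
#include <stdio.h>

struct demo_listener {
	unsigned int rcv_saddr;		/* 0 means wildcard address */
	int bound_dev_if;		/* 0 means any device */
};

/* Return the listener's score against (daddr, dif), or 0 on mismatch. */
static int demo_score(const struct demo_listener *l,
		      unsigned int daddr, int dif)
{
	int score = 1;			/* local port already matched */

	if (l->rcv_saddr) {
		if (l->rcv_saddr != daddr)
			return 0;	/* bound to another address */
		score++;
	}
	if (l->bound_dev_if) {
		if (l->bound_dev_if != dif)
			return 0;	/* bound to another device */
		score++;
	}
	return score;			/* 3 == exact match, stop looking */
}

int main(void)
{
	struct demo_listener any = { 0, 0 };
	struct demo_listener exact = { 0x7f000001, 2 };

	printf("wildcard scores %d, exact scores %d\n",
	       demo_score(&any, 0x7f000001, 2),
	       demo_score(&exact, 0x7f000001, 2));
	return 0;
}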
/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 * It is assumed that this code only gets called from within NET_BH.
 */
static inline struct sock *__tcp_v4_lookup(struct tcphdr *th,
					   u32 saddr, u16 sport,
					   u32 daddr, u16 dport, int dif)
{
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u16 hnum = ntohs(dport);
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	int hash;

	/* Check TCP register quick cache first. */
	sk = TCP_RHASH(sport);
	if (sk && TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
		goto hit;

	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	hash = tcp_hashfn(daddr, hnum, saddr, sport);
	for (sk = tcp_ehash[hash]; sk; sk = sk->next) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) {
			if (sk->state == TCP_ESTABLISHED)
				TCP_RHASH(sport) = sk;
			goto hit; /* You sunk my battleship! */
		}
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	for (sk = tcp_ehash[hash + (tcp_ehash_size/2)]; sk; sk = sk->next)
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;

	sk = tcp_v4_lookup_listener(daddr, hnum, dif);
hit:
	return sk;
}

__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport,
				      u32 daddr, u16 dport, int dif)
{
	return __tcp_v4_lookup(0, saddr, sport, daddr, dport, dif);
}

#ifdef CONFIG_IP_TRANSPARENT_PROXY
/* Cleaned up a little and adapted to new bind bucket scheme.
 * Oddly, this should increase performance here for
 * transparent proxy, as tests within the inner loop have
 * been eliminated. -DaveM
 */
static struct sock *tcp_v4_proxy_lookup(unsigned short num,
					unsigned long raddr,
					unsigned short rnum,
					unsigned long laddr,
					struct device *dev,
					unsigned short pnum,
					int dif)
{
	struct sock *s, *result = NULL;
	int badness = -1;
	u32 paddr = 0;
	unsigned short hnum = ntohs(num);
	unsigned short hpnum = ntohs(pnum);
	int firstpass = 1;

	if (dev && dev->ip_ptr) {
		struct in_device *idev = dev->ip_ptr;

		if (idev->ifa_list)
			paddr = idev->ifa_list->ifa_local;
	}

	/* This code must run only from NET_BH. */
	{
		struct tcp_bind_bucket *tb = tcp_bhash[tcp_bhashfn(hnum)];

		for ( ; tb && tb->port != hnum; tb = tb->next)
			;
		if (tb == NULL)
			goto next;
		s = tb->owners;
	}
pass2:
	for (; s; s = s->bind_next) {
		int score = 0;

		if (s->rcv_saddr) {
			if ((s->num != hpnum || s->rcv_saddr != paddr) &&
			    (s->num != hnum || s->rcv_saddr != laddr))
				continue;
			score++;
		}
		if (s->daddr) {
			if (s->daddr != raddr)
				continue;
			score++;
		}
		if (s->dport) {
			if (s->dport != rnum)
				continue;
			score++;
		}
		if (s->bound_dev_if) {
			if (s->bound_dev_if != dif)
				continue;
			score++;
		}
		if (score == 4 && s->num == hnum) {
			result = s;
			goto gotit;
		} else if (score > badness &&
			   (s->num == hpnum || s->rcv_saddr)) {
			result = s;
			badness = score;
		}
	}