tcp_ipv4.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.222 2000/12/08 17:15:53 davem Exp $
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *		Juan Jose Ciarlante:	ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year coma.
 *		Andi Kleen :		Fix new listen.
 *		Andi Kleen :		Fix accept error reporting.
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/init.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>

#include <linux/inet.h>
#include <linux/stddef.h>
#include <linux/ipsec.h>

extern int sysctl_ip_dynaddr;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct inode tcp_inode;
static struct socket *tcp_socket = &tcp_inode.u.socket_i;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

/*
 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
 */
struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	__tcp_ehash:          NULL,
	__tcp_bhash:          NULL,
	__tcp_bhash_size:     0,
	__tcp_ehash_size:     0,
	__tcp_listening_hash: { NULL, },
	__tcp_lhash_lock:     RW_LOCK_UNLOCKED,
	__tcp_lhash_users:    ATOMIC_INIT(0),
	__tcp_lhash_wait:
	  __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	__tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
};

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = (1024 - 1);

static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	int h = ((laddr ^ lport) ^ (faddr ^ fport));
	h ^= h >> 16;
	h ^= h >> 8;
	return h & (tcp_ehash_size - 1);
}

static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	__u32 laddr = sk->rcv_saddr;
	__u16 lport = sk->num;
	__u32 faddr = sk->daddr;
	__u16 fport = sk->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}
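/* Illustrative aside (not part of the kernel source): a minimal, stand-alone
 * user-space sketch of the XOR-folding scheme used by tcp_hashfn() above.
 * EHASH_SIZE is a hypothetical power-of-two stand-in; in the kernel the real
 * table size, tcp_ehash_size, is chosen at boot from available memory.
 */
#include <stdio.h>
#include <stdint.h>

#define EHASH_SIZE 512	/* assumed table size, must be a power of two */

static int demo_hashfn(uint32_t laddr, uint16_t lport,
		       uint32_t faddr, uint16_t fport)
{
	int h = ((laddr ^ lport) ^ (faddr ^ fport));
	h ^= h >> 16;	/* fold the high halfword into the low one */
	h ^= h >> 8;	/* fold again so every byte influences the index */
	return h & (EHASH_SIZE - 1);	/* mask, not modulo: size is 2^n */
}

int main(void)
{
	/* 10.0.0.1:33000 <-> 10.0.0.2:80, addresses in host byte order */
	uint32_t laddr = 0x0a000001, faddr = 0x0a000002;
	printf("bucket = %d\n", demo_hashfn(laddr, 33000, faddr, 80));
	return 0;
}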
/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *tb;

	tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
	if(tb != NULL) {
		tb->port = snum;
		tb->fastreuse = 0;
		tb->owners = NULL;
		if((tb->next = head->chain) != NULL)
			tb->next->pprev = &tb->next;
		head->chain = tb;
		tb->pprev = &head->chain;
	}
	return tb;
}

/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = (struct tcp_bind_bucket *)sk->prev;
	if ((child->bind_next = tb->owners) != NULL)
		tb->owners->bind_pprev = &child->bind_next;
	tb->owners = child;
	child->bind_pprev = &tb->owners;
	child->prev = (struct sock *) tb;
	spin_unlock(&head->lock);
}

__inline__ void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (snum == 0) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do {
			rover++;
			if ((rover < low) || (rover > high))
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			for (tb = head->chain; tb; tb = tb->next)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
		tb = NULL;
	} else {
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		for (tb = head->chain; tb != NULL; tb = tb->next)
			if (tb->port == snum)
				break;
	}
	if (tb != NULL && tb->owners != NULL) {
		if (tb->fastreuse != 0 &&
		    sk->reuse != 0 && sk->state != TCP_LISTEN) {
			goto success;
		} else {
			struct sock *sk2 = tb->owners;
			int sk_reuse = sk->reuse;

			for( ; sk2 != NULL; sk2 = sk2->bind_next) {
				if (sk != sk2 &&
				    sk->bound_dev_if == sk2->bound_dev_if) {
					if (!sk_reuse	||
					    !sk2->reuse	||
					    sk2->state == TCP_LISTEN) {
						if (!sk2->rcv_saddr	||
						    !sk->rcv_saddr	||
						    (sk2->rcv_saddr == sk->rcv_saddr))
							break;
					}
				}
			}
			/* If we found a conflict, fail. */
			ret = 1;
			if (sk2 != NULL)
				goto fail_unlock;
		}
	}
	ret = 1;
	if (tb == NULL &&
	    (tb = tcp_bucket_create(head, snum)) == NULL)
		goto fail_unlock;
	if (tb->owners == NULL) {
		if (sk->reuse && sk->state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
		tb->fastreuse = 0;

success:
	sk->num = snum;
	if (sk->prev == NULL) {
		if ((sk->bind_next = tb->owners) != NULL)
			tb->owners->bind_pprev = &sk->bind_next;
		tb->owners = sk;
		sk->bind_pprev = &tb->owners;
		sk->prev = (struct sock *) tb;
	} else {
		BUG_TRAP(sk->prev == (struct sock *) tb);
	}
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}
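/* Illustrative aside (not part of the kernel source): a toy user-space model
 * of the ephemeral-port rover in tcp_v4_get_port() above.  It advances a
 * global rover, wraps it back to `low` once it passes `high`, and stops at
 * the first port not already bound.  The port_in_use[] array is a made-up
 * stand-in for walking the tcp_bhash chain, and no locking is shown.
 */
#include <stdio.h>
#include <stdbool.h>

#define LOW   1024
#define HIGH  4999			/* mirrors sysctl_local_port_range */

static bool port_in_use[HIGH + 1];	/* hypothetical bind table */
static int port_rover = LOW - 1;	/* mirrors tcp_port_rover */

static int pick_local_port(void)
{
	int remaining = (HIGH - LOW) + 1;
	int rover = port_rover;

	do {
		rover++;
		if (rover < LOW || rover > HIGH)
			rover = LOW;	/* wrap around the configured range */
		if (!port_in_use[rover])
			break;		/* free port found */
	} while (--remaining > 0);

	port_rover = rover;		/* next search resumes here */
	if (remaining <= 0)
		return -1;		/* whole range exhausted */
	port_in_use[rover] = true;
	return rover;
}

int main(void)
{
	port_in_use[1025] = true;		/* pretend 1025 is already bound */
	printf("%d\n", pick_local_port());	/* 1024 */
	printf("%d\n", pick_local_port());	/* 1026: 1025 is skipped */
	return 0;
}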
/* Get rid of any references to a local port held by the
 * given sock.
 */
__inline__ void __tcp_put_port(struct sock *sk)
{
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = (struct tcp_bind_bucket *) sk->prev;
	if (sk->bind_next)
		sk->bind_next->bind_pprev = sk->bind_pprev;
	*(sk->bind_pprev) = sk->bind_next;
	sk->prev = NULL;
	sk->num = 0;
	if (tb->owners == NULL) {
		if (tb->next)
			tb->next->pprev = tb->pprev;
		*(tb->pprev) = tb->next;
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
	spin_unlock(&head->lock);
}

void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
 * Look, when several writers sleep and reader wakes them up, all but one
 * immediately hit write lock and grab all the cpus. Exclusive sleep solves
 * this, _but_ remember, it adds useless work on UP machines (wake up each
 * exclusive lock release). It should be ifdefed really.
 */

void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DECLARE_WAITQUEUE(wait, current);

		add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (atomic_read(&tcp_lhash_users) == 0)
				break;
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&tcp_lhash_wait, &wait);
	}
}

static __inline__ void __tcp_v4_hash(struct sock *sk)
{
	struct sock **skp;
	rwlock_t *lock;

	BUG_TRAP(sk->pprev == NULL);
	if(sk->state == TCP_LISTEN) {
		skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->hashent].lock;
		write_lock(lock);
	}
	if((sk->next = *skp) != NULL)
		(*skp)->pprev = &sk->next;
	*skp = sk;
	sk->pprev = skp;
	sock_prot_inc_use(sk->prot);
	write_unlock(lock);
	if (sk->state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

static void tcp_v4_hash(struct sock *sk)
{
	if (sk->state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk);
		local_bh_enable();
	}
}

void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (sk->state == TCP_LISTEN) {
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if(sk->pprev) {
		if(sk->next)
			sk->next->pprev = sk->pprev;
		*sk->pprev = sk->next;
		sk->pprev = NULL;
		sock_prot_dec_use(sk->prot);
	}
	write_unlock_bh(lock);
	if (sk->state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

/* Don't inline this cruft.  Here are some nice properties to
 * exploit here.  The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection.  So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr,
					     unsigned short hnum, int dif)
{
	struct sock *result = NULL;
	int score, hiscore;

	hiscore = 0;
	for(; sk; sk = sk->next) {
		if(sk->num == hnum) {
			__u32 rcv_saddr = sk->rcv_saddr;

			score = 1;
			if(rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score++;
			}
			if (sk->bound_dev_if) {
				if (sk->bound_dev_if != dif)
					continue;
				score++;
			}
			if (score == 3)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result = sk;
			}
		}
	}
	return result;
}
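/* Illustrative aside (not part of the kernel source): a simplified restatement
 * of the scoring in __tcp_v4_lookup_listener() above, over a plain array
 * instead of a hash chain.  A listener on the right port starts at score 1,
 * gains a point for a bound local address equal to daddr and another for a
 * matching bound device; a score of 3 cannot be beaten, so the search stops
 * there.  The listener struct and the sample table are invented for the demo.
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct listener {
	uint16_t port;
	uint32_t rcv_saddr;	/* 0 means "any local address" */
	int	 bound_dev_if;	/* 0 means "any device" */
	const char *name;
};

static const struct listener *lookup_listener(const struct listener *tbl,
					      size_t n, uint32_t daddr,
					      uint16_t port, int dif)
{
	const struct listener *best = NULL;
	int hiscore = 0;

	for (size_t i = 0; i < n; i++) {
		const struct listener *l = &tbl[i];
		int score = 1;

		if (l->port != port)
			continue;
		if (l->rcv_saddr) {
			if (l->rcv_saddr != daddr)
				continue;	/* bound to another address */
			score++;
		}
		if (l->bound_dev_if) {
			if (l->bound_dev_if != dif)
				continue;	/* bound to another device */
			score++;
		}
		if (score == 3)
			return l;		/* fully specific: done */
		if (score > hiscore) {
			hiscore = score;
			best = l;
		}
	}
	return best;
}

int main(void)
{
	const struct listener tbl[] = {
		{ 80, 0,          0, "wildcard" },
		{ 80, 0x0a000001, 0, "10.0.0.1 only" },
	};
	const struct listener *l = lookup_listener(tbl, 2, 0x0a000001, 80, 0);
	printf("matched: %s\n", l ? l->name : "none");	/* "10.0.0.1 only" */
	return 0;
}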
/* Optimize the common listener case.
 */
__inline__ struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
					       int dif)
{
	struct sock *sk;

	read_lock(&tcp_lhash_lock);
	sk = tcp_listening_hash[tcp_lhashfn(hnum)];
	if (sk) {
		if (sk->num == hnum &&
		    sk->next == NULL &&
		    (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
		    !sk->bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}

/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
						       u32 daddr, u16 hnum,
						       int dif)
{
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	int hash;

	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	hash = tcp_hashfn(daddr, hnum, saddr, sport);
	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	for(sk = head->chain; sk; sk = sk->next) {
		if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
		if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	read_unlock(&head->lock);
	return NULL;

hit:
	sock_hold(sk);
	read_unlock(&head->lock);
	return sk;
}

static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk;

	sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);

	if (sk)
		return sk;

	return tcp_v4_lookup_listener(daddr, hnum, dif);
}

__inline__ struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				      u16 dport, int dif)
{
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}

static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
{
	return secure_tcp_sequence_number(skb->nh.iph->daddr,
					  skb->nh.iph->saddr,
					  skb->h.th->dest,
					  skb->h.th->source);
}

static int tcp_v4_check_established(struct sock *sk)
{
	u32 daddr = sk->rcv_saddr;
	u32 saddr = sk->daddr;
	int dif = sk->bound_dev_if;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sk->dport, sk->num);
	int hash = tcp_hashfn(daddr, sk->num, saddr, sk->dport);
	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
	struct sock *sk2, **skp;
	struct tcp_tw_bucket *tw;

	write_lock_bh(&head->lock);

	/* Check TIME-WAIT sockets first. */
	for(skp = &(head + tcp_ehash_size)->chain; (sk2 = *skp) != NULL;
	    skp = &sk2->next) {
		tw = (struct tcp_tw_bucket *)sk2;

		if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);

			/* With PAWS, it is safe from the viewpoint
			   of data integrity. Even without PAWS it
			   is safe provided sequence spaces do not
			   overlap i.e. at data rates <= 80Mbit/sec.

			   Actually, the idea is close to VJ's one,
			   only timestamp cache is held not per host,
			   but per port pair and TW bucket is used