tcp_ipv4.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:	$Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
 *
 *		IPv4 specific functions
 *
 *
 *		code split from:
 *		linux/ipv4/tcp.c
 *		linux/ipv4/tcp_input.c
 *		linux/ipv4/tcp_output.c
 *
 *		See tcp.c for author information
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 */

/*
 * Changes:
 *		David S. Miller	:	New socket lookup architecture.
 *					This code is dedicated to John Dyson.
 *		David S. Miller :	Change semantics of established hash,
 *					half is devoted to TIME_WAIT sockets
 *					and the rest go in the other half.
 *		Andi Kleen :		Add support for syncookies and fixed
 *					some bugs: ip options weren't passed to
 *					the TCP layer, missed a check for an
 *					ACK bit.
 *		Andi Kleen :		Implemented fast path mtu discovery.
 *					Fixed many serious bugs in the
 *					open_request handling and moved
 *					most of it into the af independent code.
 *					Added tail drop and some other bugfixes.
 *					Added new listen semantics.
 *		Mike McLagan	:	Routing by source
 *	Juan Jose Ciarlante:		ip_dynaddr bits
 *		Andi Kleen:		various fixes.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year
 *					coma.
 *	Andi Kleen		:	Fix new listen.
 *	Andi Kleen		:	Fix accept error reporting.
 *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
 *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
 *					a single port at the same time.
 */

#include <linux/config.h>
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/cache.h>
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>

#include <net/icmp.h>
#include <net/tcp.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#include <linux/inet.h>
#include <linux/ipv6.h>
#include <linux/stddef.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

extern int sysctl_ip_dynaddr;
int sysctl_tcp_tw_reuse;
int sysctl_tcp_low_latency;

/* Check TCP sequence numbers in ICMP packets. */
#define ICMP_MIN_LENGTH 8

/* Socket used for sending RSTs */
static struct socket *tcp_socket;

void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
		       struct sk_buff *skb);

struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
	.__tcp_lhash_lock	= RW_LOCK_UNLOCKED,
	.__tcp_lhash_users	= ATOMIC_INIT(0),
	.__tcp_lhash_wait
		= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
	.__tcp_portalloc_lock	= SPIN_LOCK_UNLOCKED
};

/*
 * This array holds the first and last local port number.
 * For high-usage systems, use sysctl to change this to
 * 32768-61000
 */
int sysctl_local_port_range[2] = { 1024, 4999 };
int tcp_port_rover = 1024 - 1;

static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
				 __u32 faddr, __u16 fport)
{
	int h = (laddr ^ lport) ^ (faddr ^ fport);
	h ^= h >> 16;
	h ^= h >> 8;
	return h & (tcp_ehash_size - 1);
}

static __inline__ int tcp_sk_hashfn(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	__u32 laddr = inet->rcv_saddr;
	__u16 lport = inet->num;
	__u32 faddr = inet->daddr;
	__u16 fport = inet->dport;

	return tcp_hashfn(laddr, lport, faddr, fport);
}
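
/*
 * Note: tcp_hashfn() folds the connection 4-tuple (local/remote address and
 * port) into an index into the established-connections hash table (tcp_ehash);
 * tcp_sk_hashfn() is the per-socket wrapper that pulls that tuple out of the
 * socket's inet options.
 */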

/* Allocate and initialize a new TCP local port bind bucket.
 * The bindhash mutex for snum's hash chain must be held here.
 */
struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
					  unsigned short snum)
{
	struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep,
						      SLAB_ATOMIC);
	if (tb) {
		tb->port = snum;
		tb->fastreuse = 0;
		INIT_HLIST_HEAD(&tb->owners);
		hlist_add_head(&tb->node, &head->chain);
	}
	return tb;
}

/* Caller must hold hashbucket lock for this tb with local BH disabled */
void tcp_bucket_destroy(struct tcp_bind_bucket *tb)
{
	if (hlist_empty(&tb->owners)) {
		__hlist_del(&tb->node);
		kmem_cache_free(tcp_bucket_cachep, tb);
	}
}

/* Caller must disable local BH processing. */
static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
{
	struct tcp_bind_hashbucket *head =
				&tcp_bhash[tcp_bhashfn(inet_sk(child)->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	sk_add_bind_node(child, &tb->owners);
	tcp_sk(child)->bind_hash = tb;
	spin_unlock(&head->lock);
}

inline void tcp_inherit_port(struct sock *sk, struct sock *child)
{
	local_bh_disable();
	__tcp_inherit_port(sk, child);
	local_bh_enable();
}

void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb,
		   unsigned short snum)
{
	inet_sk(sk)->num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tcp_sk(sk)->bind_hash = tb;
}

static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
{
	const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
	struct sock *sk2;
	struct hlist_node *node;
	int reuse = sk->sk_reuse;

	sk_for_each_bound(sk2, node, &tb->owners) {
		if (sk != sk2 &&
		    !tcp_v6_ipv6only(sk2) &&
		    (!sk->sk_bound_dev_if ||
		     !sk2->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
			if (!reuse || !sk2->sk_reuse ||
			    sk2->sk_state == TCP_LISTEN) {
				const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
				if (!sk2_rcv_saddr || !sk_rcv_saddr ||
				    sk2_rcv_saddr == sk_rcv_saddr)
					break;
			}
		}
	}
	return node != NULL;
}
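
/*
 * Note: bound local ports are tracked in tcp_bhash.  Each port in use owns a
 * tcp_bind_bucket whose ->owners list links every socket bound to that port;
 * tcp_bind_conflict() above walks that list to decide whether a new bind
 * (with or without SO_REUSEADDR, i.e. sk->sk_reuse) may share the port.
 */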

/* Obtain a reference to a local port for the given sock,
 * if snum is zero it means select any available local port.
 */
static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
{
	struct tcp_bind_hashbucket *head;
	struct hlist_node *node;
	struct tcp_bind_bucket *tb;
	int ret;

	local_bh_disable();
	if (!snum) {
		int low = sysctl_local_port_range[0];
		int high = sysctl_local_port_range[1];
		int remaining = (high - low) + 1;
		int rover;

		spin_lock(&tcp_portalloc_lock);
		rover = tcp_port_rover;
		do {
			rover++;
			if (rover < low || rover > high)
				rover = low;
			head = &tcp_bhash[tcp_bhashfn(rover)];
			spin_lock(&head->lock);
			tb_for_each(tb, node, &head->chain)
				if (tb->port == rover)
					goto next;
			break;
		next:
			spin_unlock(&head->lock);
		} while (--remaining > 0);
		tcp_port_rover = rover;
		spin_unlock(&tcp_portalloc_lock);

		/* Exhausted local port range during search? */
		ret = 1;
		if (remaining <= 0)
			goto fail;

		/* OK, here is the one we will use.  HEAD is
		 * non-NULL and we hold its mutex.
		 */
		snum = rover;
	} else {
		head = &tcp_bhash[tcp_bhashfn(snum)];
		spin_lock(&head->lock);
		tb_for_each(tb, node, &head->chain)
			if (tb->port == snum)
				goto tb_found;
	}
	tb = NULL;
	goto tb_not_found;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse > 1)
			goto success;
		if (tb->fastreuse > 0 &&
		    sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
			goto success;
		} else {
			ret = 1;
			if (tcp_bind_conflict(sk, tb))
				goto fail_unlock;
		}
	}
tb_not_found:
	ret = 1;
	if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL)
		goto fail_unlock;
	if (hlist_empty(&tb->owners)) {
		if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
			tb->fastreuse = 1;
		else
			tb->fastreuse = 0;
	} else if (tb->fastreuse &&
		   (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
		tb->fastreuse = 0;
success:
	if (!tcp_sk(sk)->bind_hash)
		tcp_bind_hash(sk, tb, snum);
	BUG_TRAP(tcp_sk(sk)->bind_hash == tb);
	ret = 0;

fail_unlock:
	spin_unlock(&head->lock);
fail:
	local_bh_enable();
	return ret;
}

/* Get rid of any references to a local port held by the
 * given sock.
 */
static void __tcp_put_port(struct sock *sk)
{
	struct inet_opt *inet = inet_sk(sk);
	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)];
	struct tcp_bind_bucket *tb;

	spin_lock(&head->lock);
	tb = tcp_sk(sk)->bind_hash;
	__sk_del_bind_node(sk);
	tcp_sk(sk)->bind_hash = NULL;
	inet->num = 0;
	tcp_bucket_destroy(tb);
	spin_unlock(&head->lock);
}

void tcp_put_port(struct sock *sk)
{
	local_bh_disable();
	__tcp_put_port(sk);
	local_bh_enable();
}

/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad
 * on SMP.  Look, when several writers sleep and reader wakes them up, all
 * but one immediately hit write lock and grab all the cpus.  Exclusive sleep
 * solves this, _but_ remember, it adds useless work on UP machines (wake up
 * each exclusive lock release).  It should be ifdefed really.
 */
void tcp_listen_wlock(void)
{
	write_lock(&tcp_lhash_lock);

	if (atomic_read(&tcp_lhash_users)) {
		DEFINE_WAIT(wait);

		for (;;) {
			prepare_to_wait_exclusive(&tcp_lhash_wait, &wait,
						  TASK_UNINTERRUPTIBLE);
			if (!atomic_read(&tcp_lhash_users))
				break;
			write_unlock_bh(&tcp_lhash_lock);
			schedule();
			write_lock_bh(&tcp_lhash_lock);
		}

		finish_wait(&tcp_lhash_wait, &wait);
	}
}

static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
{
	struct hlist_head *list;
	rwlock_t *lock;

	BUG_TRAP(sk_unhashed(sk));
	if (listen_possible && sk->sk_state == TCP_LISTEN) {
		list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
		lock = &tcp_lhash_lock;
		tcp_listen_wlock();
	} else {
		list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain;
		lock = &tcp_ehash[sk->sk_hashent].lock;
		write_lock(lock);
	}
	__sk_add_node(sk, list);
	sock_prot_inc_use(sk->sk_prot);
	write_unlock(lock);
	if (listen_possible && sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}

static void tcp_v4_hash(struct sock *sk)
{
	if (sk->sk_state != TCP_CLOSE) {
		local_bh_disable();
		__tcp_v4_hash(sk, 1);
		local_bh_enable();
	}
}

void tcp_unhash(struct sock *sk)
{
	rwlock_t *lock;

	if (sk_unhashed(sk))
		goto ende;

	if (sk->sk_state == TCP_LISTEN) {
		local_bh_disable();
		tcp_listen_wlock();
		lock = &tcp_lhash_lock;
	} else {
		struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent];
		lock = &head->lock;
		write_lock_bh(&head->lock);
	}

	if (__sk_del_node_init(sk))
		sock_prot_dec_use(sk->sk_prot);
	write_unlock_bh(lock);

ende:
	if (sk->sk_state == TCP_LISTEN)
		wake_up(&tcp_lhash_wait);
}
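
/*
 * Note: established and TIME_WAIT sockets live in tcp_ehash (indexed by
 * tcp_sk_hashfn()), listening sockets in tcp_listening_hash.  A writer takes
 * tcp_listen_wlock(), which holds the listening-hash write lock and sleeps
 * until any readers still counted in tcp_lhash_users have finished.
 */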

/* Don't inline this cruft.  Here are some nice properties to
 * exploit here.  The BSD API does not allow a listening TCP
 * to specify the remote port nor the remote address for the
 * connection.  So always assume those are both wildcarded
 * during the search since they can never be otherwise.
 */
static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head,
					     u32 daddr, unsigned short hnum,
					     int dif)
{
	struct sock *result = NULL, *sk;
	struct hlist_node *node;
	int score, hiscore;

	hiscore = -1;
	sk_for_each(sk, node, head) {
		struct inet_opt *inet = inet_sk(sk);

		if (inet->num == hnum && !ipv6_only_sock(sk)) {
			__u32 rcv_saddr = inet->rcv_saddr;

			score = (sk->sk_family == PF_INET ? 1 : 0);
			if (rcv_saddr) {
				if (rcv_saddr != daddr)
					continue;
				score += 2;
			}
			if (sk->sk_bound_dev_if) {
				if (sk->sk_bound_dev_if != dif)
					continue;
				score += 2;
			}
			if (score == 5)
				return sk;
			if (score > hiscore) {
				hiscore = score;
				result  = sk;
			}
		}
	}
	return result;
}

/* Optimize the common listener case. */
inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum,
					   int dif)
{
	struct sock *sk = NULL;
	struct hlist_head *head;

	read_lock(&tcp_lhash_lock);
	head = &tcp_listening_hash[tcp_lhashfn(hnum)];
	if (!hlist_empty(head)) {
		struct inet_opt *inet = inet_sk((sk = __sk_head(head)));

		if (inet->num == hnum && !sk->sk_node.next &&
		    (!inet->rcv_saddr || inet->rcv_saddr == daddr) &&
		    (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) &&
		    !sk->sk_bound_dev_if)
			goto sherry_cache;
		sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif);
	}
	if (sk) {
sherry_cache:
		sock_hold(sk);
	}
	read_unlock(&tcp_lhash_lock);
	return sk;
}

/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
 *
 * Local BH must be disabled here.
 */
static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
						       u32 daddr, u16 hnum,
						       int dif)
{
	struct tcp_ehash_bucket *head;
	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
	struct sock *sk;
	struct hlist_node *node;
	/* Optimize here for direct hit, only listening connections can
	 * have wildcards anyways.
	 */
	int hash = tcp_hashfn(daddr, hnum, saddr, sport);

	head = &tcp_ehash[hash];
	read_lock(&head->lock);
	sk_for_each(sk, node, &head->chain) {
		if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit; /* You sunk my battleship! */
	}

	/* Must check for a TIME_WAIT'er before going to listener hash. */
	sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) {
		if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
			goto hit;
	}
	sk = NULL;
out:
	read_unlock(&head->lock);
	return sk;
hit:
	sock_hold(sk);
	goto out;
}

static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
					   u32 daddr, u16 hnum, int dif)
{
	struct sock *sk = __tcp_v4_lookup_established(saddr, sport, daddr,
						      hnum, dif);

	return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif);
}

inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
				  u16 dport, int dif)
{
	/* dport arrives in network byte order; the internal lookup helpers
	 * take the local port in host order, hence the ntohs() below.
	 */
	struct sock *sk;

	local_bh_disable();
	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
	local_bh_enable();

	return sk;
}
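
/*
 * Note: the softirq receive path (tcp_v4_rcv(), later in this file) resolves
 * each incoming segment to a socket with __tcp_v4_lookup(); tcp_v4_lookup()
 * above is the wrapper for callers that run with BHs enabled, since the
 * lookup itself requires local BHs to be disabled.
 */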