📄 tcpcore.h
字号:
#if 1
/* dummy types */
#endif
#define TCP_DEBUG 1
#define FASTRETRANS_DEBUG 1
/* Cancel timers, when they are not required. */
#undef TCP_CLEAR_TIMERS
#if 0
#include <linux/config.h>
#include <linux/tcp.h>
#include <linux/slab.h>
#include <linux/cache.h>
#include <net/checksum.h>
#include <net/sock.h>
#else
#include "linux.h"
#endif
/* This is for all connections with a full identity, no wildcards.
* New scheme, half the table is for TIME_WAIT, the other half is
* for the rest. I'll experiment with dynamic table growth later.
*/
struct tcp_ehash_bucket {
rwlock_t lock;
struct sock *chain;
} __attribute__((__aligned__(8)));
/* This is for listening sockets, thus all sockets which possess wildcards. */
#define TCP_LHTABLE_SIZE 32 /* Yes, really, this is all you need. */
/* There are a few simple rules, which allow for local port reuse by
* an application. In essence:
*
* 1) Sockets bound to different interfaces may share a local port.
* Failing that, goto test 2.
* 2) If all sockets have sk->reuse set, and none of them are in
* TCP_LISTEN state, the port may be shared.
* Failing that, goto test 3.
* 3) If all sockets are bound to a specific sk->rcv_saddr local
* address, and none of them are the same, the port may be
* shared.
* Failing this, the port cannot be shared.
*
* The interesting point, is test #2. This is what an FTP server does
* all day. To optimize this case we use a specific flag bit defined
* below. As we add sockets to a bind bucket list, we perform a
* check of: (newsk->reuse && (newsk->state != TCP_LISTEN))
* As long as all sockets added to a bind bucket pass this test,
* the flag bit will be set.
* The resulting situation is that tcp_v[46]_verify_bind() can just check
* for this flag bit, if it is set and the socket trying to bind has
* sk->reuse set, we don't even have to walk the owners list at all,
* we return that it is ok to bind this socket to the requested local port.
*
* Sounds like a lot of work, but it is worth it. In a more naive
* implementation (ie. current FreeBSD etc.) the entire list of ports
* must be walked for each data port opened by an ftp server. Needless
* to say, this does not scale at all. With a couple thousand FTP
* users logged onto your box, isn't it nice to know that new data
* ports are created in O(1) time? I thought so. ;-) -DaveM
*/
struct tcp_bind_bucket {
unsigned short port;
signed short fastreuse;
struct tcp_bind_bucket *next;
struct sock *owners;
struct tcp_bind_bucket **pprev;
};
struct tcp_bind_hashbucket {
spinlock_t lock;
struct tcp_bind_bucket *chain;
};
extern struct tcp_hashinfo {
/* This is for sockets with full identity only. Sockets here will
* always be without wildcards and will have the following invariant:
*
* TCP_ESTABLISHED <= sk->state < TCP_CLOSE
*
* First half of the table is for sockets not in TIME_WAIT, second half
* is for TIME_WAIT sockets only.
*/
struct tcp_ehash_bucket *__tcp_ehash;
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
*/
struct tcp_bind_hashbucket *__tcp_bhash;
int __tcp_bhash_size;
int __tcp_ehash_size;
/* All sockets in TCP_LISTEN state will be in here. This is the only
* table where wildcard'd TCP sockets can exist. Hash function here
* is just local port number.
*/
struct sock *__tcp_listening_hash[TCP_LHTABLE_SIZE];
/* All the above members are written once at bootup and
* never written again _or_ are predominantly read-access.
*
* Now align to a new cache line as all the following members
* are often dirty.
*/
rwlock_t __tcp_lhash_lock ____cacheline_aligned;
atomic_t __tcp_lhash_users;
wait_queue_head_t __tcp_lhash_wait;
spinlock_t __tcp_portalloc_lock;
} tcp_hashinfo;
#define tcp_ehash (tcp_hashinfo.__tcp_ehash)
#define tcp_bhash (tcp_hashinfo.__tcp_bhash)
#define tcp_ehash_size (tcp_hashinfo.__tcp_ehash_size)
#define tcp_bhash_size (tcp_hashinfo.__tcp_bhash_size)
#define tcp_listening_hash (tcp_hashinfo.__tcp_listening_hash)
#define tcp_lhash_lock (tcp_hashinfo.__tcp_lhash_lock)
#define tcp_lhash_users (tcp_hashinfo.__tcp_lhash_users)
#define tcp_lhash_wait (tcp_hashinfo.__tcp_lhash_wait)
#define tcp_portalloc_lock (tcp_hashinfo.__tcp_portalloc_lock)
extern kmem_cache_t *tcp_bucket_cachep;
extern struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
unsigned short snum);
extern void tcp_bucket_unlock(struct sock *sk);
extern int tcp_port_rover;
extern struct sock *tcp_v4_lookup_listener(u32 addr, unsigned short hnum, int dif);
/* These are AF independent. */
static __inline int tcp_bhashfn(__u16 lport)
{
return (lport & (tcp_bhash_size - 1));
}
/* This is a TIME_WAIT bucket. It works around the memory consumption
* problems of sockets in such a state on heavily loaded servers, but
* without violating the protocol specification.
*/
struct tcp_tw_bucket {
/* These _must_ match the beginning of struct sock precisely.
* XXX Yes I know this is gross, but I'd have to edit every single
* XXX networking file if I created a "struct sock_header". -DaveM
*/
__u32 daddr;
__u32 rcv_saddr;
__u16 dport;
unsigned short num;
int bound_dev_if;
struct sock *next;
struct sock **pprev;
struct sock *bind_next;
struct sock **bind_pprev;
unsigned char state,
substate; /* "zapped" is replaced with "substate" */
__u16 sport;
unsigned short family;
unsigned char reuse,
rcv_wscale; /* It is also TW bucket specific */
atomic_t refcnt;
/* And these are ours. */
int hashent;
int timeout;
__u32 rcv_nxt;
__u32 snd_nxt;
__u32 rcv_wnd;
__u32 ts_recent;
long ts_recent_stamp;
unsigned long ttd;
struct tcp_bind_bucket *tb;
struct tcp_tw_bucket *next_death;
struct tcp_tw_bucket **pprev_death;
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
struct in6_addr v6_daddr;
struct in6_addr v6_rcv_saddr;
#endif
};
extern kmem_cache_t *tcp_timewait_cachep;
static __inline void tcp_tw_put(struct tcp_tw_bucket *tw)
{
if (atomic_dec_and_test(&tw->refcnt)) {
#ifdef INET_REFCNT_DEBUG
printk(KERN_DEBUG "tw_bucket %p released\n", tw);
#endif
kmem_cache_free(tcp_timewait_cachep, tw);
}
}
extern atomic_t tcp_orphan_count;
extern int tcp_tw_count;
extern void tcp_time_wait(struct sock *sk, int state, int timeo);
extern void tcp_timewait_kill(struct tcp_tw_bucket *tw);
extern void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo);
extern void tcp_tw_deschedule(struct tcp_tw_bucket *tw);
/* Socket demux engine toys. */
#ifdef __BIG_ENDIAN
#define TCP_COMBINED_PORTS(__sport, __dport) \
(((__u32)(__sport)<<16) | (__u32)(__dport))
#else /* __LITTLE_ENDIAN */
#define TCP_COMBINED_PORTS(__sport, __dport) \
(((__u32)(__dport)<<16) | (__u32)(__sport))
#endif
#if (BITS_PER_LONG == 64)
#ifdef __BIG_ENDIAN
#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
__u64 __name = (((__u64)(__saddr))<<32)|((__u64)(__daddr));
#else /* __LITTLE_ENDIAN */
#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr) \
__u64 __name = (((__u64)(__daddr))<<32)|((__u64)(__saddr));
#endif /* __BIG_ENDIAN */
#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
(((*((__u64 *)&((__sk)->daddr)))== (__cookie)) && \
((*((__u32 *)&((__sk)->dport)))== (__ports)) && \
(!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif))))
#else /* 32-bit arch */
#define TCP_V4_ADDR_COOKIE(__name, __saddr, __daddr)
#define TCP_IPV4_MATCH(__sk, __cookie, __saddr, __daddr, __ports, __dif)\
(((__sk)->daddr == (__saddr)) && \
((__sk)->rcv_saddr == (__daddr)) && \
((*((__u32 *)&((__sk)->dport)))== (__ports)) && \
(!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif))))
#endif /* 64-bit arch */
#define TCP_IPV6_MATCH(__sk, __saddr, __daddr, __ports, __dif) \
(((*((__u32 *)&((__sk)->dport)))== (__ports)) && \
((__sk)->family == AF_INET6) && \
!ipv6_addr_cmp(&(__sk)->net_pinfo.af_inet6.daddr, (__saddr)) && \
!ipv6_addr_cmp(&(__sk)->net_pinfo.af_inet6.rcv_saddr, (__daddr)) && \
(!((__sk)->bound_dev_if) || ((__sk)->bound_dev_if == (__dif))))
/* These can have wildcards, don't try too hard. */
static __inline int tcp_lhashfn(unsigned short num)
{
#if 0
return num & (TCP_LHTABLE_SIZE - 1);
#else
return 0;
#endif
}
static __inline int tcp_sk_listen_hashfn(struct sock *sk)
{
#if 0
return tcp_lhashfn(sk->num);
#else
return 0;
#endif
}
#define MAX_TCP_HEADER (128 + MAX_HEADER)
/*
* Never offer a window over 32767 without using window scaling. Some
* poor stacks do signed 16bit maths!
*/
#define MAX_TCP_WINDOW 32767U
/* Minimal accepted MSS. It is (60+60+8) - (20+20). */
#define TCP_MIN_MSS 88U
/* Minimal RCV_MSS. */
#define TCP_MIN_RCVMSS 536U
/* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3
/* Maximal reordering. */
#define TCP_MAX_REORDERING 127
/* Maximal number of ACKs sent quickly to accelerate slow-start. */
#define TCP_MAX_QUICKACKS 16U
/* urg_data states */
#define TCP_URG_VALID 0x0100
#define TCP_URG_NOTYET 0x0200
#define TCP_URG_READ 0x0400
#define TCP_RETR1 3 /*
* This is how many retries it does before it
* tries to figure out if the gateway is
* down. Minimal RFC value is 3; it corresponds
* to ~3sec-8min depending on RTO.
*/
#define TCP_RETR2 15 /*
* This should take at least
* 90 minutes to time out.
* RFC1122 says that the limit is 100 sec.
* 15 is ~13-30min depending on RTO.
*/
#define TCP_SYN_RETRIES 5 /* number of times to retry active opening a
* connection: ~180sec is RFC minumum */
#define TCP_SYNACK_RETRIES 5 /* number of times to retry passive opening a
* connection: ~180sec is RFC minumum */
#define TCP_ORPHAN_RETRIES 7 /* number of times to retry on an orphaned
* socket. 7 is ~50sec-16min.
*/
#define TCP_TIMEWAIT_LEN (60*1000)
//#define TCP_TIMEWAIT_LEN (60*HZ)
/* how long to wait to destroy TIME-WAIT
* state, about 60 seconds */
#define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN
/* BSD style FIN_WAIT2 deadlock breaker.
* It used to be 3min, new value is 60sec,
* to combine FIN-WAIT-2 timeout with
* TIME-WAIT timer.
*/
#define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */
#if HZ >= 100
#define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */
#define TCP_ATO_MIN ((unsigned)(HZ/25))
#else
#define TCP_DELACK_MIN 4U
#define TCP_ATO_MIN 4U
#endif
#define TCP_RTO_MAX ((unsigned)(120*HZ))
#define TCP_RTO_MIN ((unsigned)(HZ/5))
#define TCP_TIMEOUT_INIT ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value */
#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
* for local resources.
*/
#define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
#define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */
#define TCP_KEEPALIVE_INTVL (75*HZ)
#define MAX_TCP_KEEPIDLE 32767
#define MAX_TCP_KEEPINTVL 32767
#define MAX_TCP_KEEPCNT 127
#define MAX_TCP_SYNCNT 127
/* TIME_WAIT reaping mechanism. */
#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */
#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)
#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
#define TCP_SYNQ_HSIZE 512 /* Size of SYNACK hash table */
#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
* after this time. It should be equal
* (or greater than) TCP_TIMEWAIT_LEN
* to provide reliability equal to one
* provided by timewait state.
*/
#define TCP_PAWS_WINDOW 1 /* Replay window for per-host
* timestamps. It must be less than
* minimal timewait lifetime.
*/
#define TCP_TW_RECYCLE_SLOTS_LOG 5
#define TCP_TW_RECYCLE_SLOTS (1<<TCP_TW_RECYCLE_SLOTS_LOG)
/* If time > 4sec, it is "slow" path, no recycling is required,
so that we select tick to get range about 4 seconds.
*/
#if 0
#if HZ <= 16 || HZ > 4096
# error Unsupported: HZ <= 16 or HZ > 4096
#elif HZ <= 32
# define TCP_TW_RECYCLE_TICK (5+2-TCP_TW_RECYCLE_SLOTS_LOG)
#elif HZ <= 64
# define TCP_TW_RECYCLE_TICK (6+2-TCP_TW_RECYCLE_SLOTS_LOG)
#elif HZ <= 128
# define TCP_TW_RECYCLE_TICK (7+2-TCP_TW_RECYCLE_SLOTS_LOG)
#elif HZ <= 256
# define TCP_TW_RECYCLE_TICK (8+2-TCP_TW_RECYCLE_SLOTS_LOG)
#elif HZ <= 512
# define TCP_TW_RECYCLE_TICK (9+2-TCP_TW_RECYCLE_SLOTS_LOG)
#elif HZ <= 1024
# define TCP_TW_RECYCLE_TICK (10+2-TCP_TW_RECYCLE_SLOTS_LOG)
#elif HZ <= 2048
# define TCP_TW_RECYCLE_TICK (11+2-TCP_TW_RECYCLE_SLOTS_LOG)
#else
# define TCP_TW_RECYCLE_TICK (12+2-TCP_TW_RECYCLE_SLOTS_LOG)
#endif
#else
#define TCP_TW_RECYCLE_TICK (0)
#endif
/*
* TCP option
*/
#define TCPOPT_NOP 1 /* Padding */
#define TCPOPT_EOL 0 /* End of options */
#define TCPOPT_MSS 2 /* Segment size negotiating */
#define TCPOPT_WINDOW 3 /* Window scaling */
#define TCPOPT_SACK_PERM 4 /* SACK Permitted */
#define TCPOPT_SACK 5 /* SACK Block */
#define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
/*
* TCP option lengths
*/
#define TCPOLEN_MSS 4
#define TCPOLEN_WINDOW 3
#define TCPOLEN_SACK_PERM 2
#define TCPOLEN_TIMESTAMP 10
/* But this is what stacks really send out. */
#define TCPOLEN_TSTAMP_ALIGNED 12
#define TCPOLEN_WSCALE_ALIGNED 4
#define TCPOLEN_SACKPERM_ALIGNED 4
#define TCPOLEN_SACK_BASE 2
#define TCPOLEN_SACK_BASE_ALIGNED 4
#define TCPOLEN_SACK_PERBLOCK 8
#define TCP_TIME_RETRANS 1 /* Retransmit timer */
#define TCP_TIME_DACK 2 /* Delayed ack timer */
#define TCP_TIME_PROBE0 3 /* Zero window probe timer */
#define TCP_TIME_KEEPOPEN 4 /* Keepalive timer */
#if 0
/* sysctl variables for tcp */
extern int sysctl_max_syn_backlog;
extern int sysctl_tcp_timestamps;
extern int sysctl_tcp_window_scaling;
extern int sysctl_tcp_sack;
extern int sysctl_tcp_fin_timeout;
extern int sysctl_tcp_tw_recycle;
extern int sysctl_tcp_keepalive_time;
extern int sysctl_tcp_keepalive_probes;
extern int sysctl_tcp_keepalive_intvl;
extern int sysctl_tcp_syn_retries;
extern int sysctl_tcp_synack_retries;
extern int sysctl_tcp_retries1;
extern int sysctl_tcp_retries2;
extern int sysctl_tcp_orphan_retries;
extern int sysctl_tcp_syncookies;
extern int sysctl_tcp_retrans_collap
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -