route.c
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
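/*
 * Overview of what follows: this file implements the IPv4 routing cache
 * that sits in front of the FIB lookup (the FIB itself lives outside
 * this file; see the "Split to fib.c and route.c" note above).  The
 * cache is a hash table of struct rtable entries that readers traverse
 * under RCU while writers modify the chains under a table of hashed
 * spinlocks, plus the /proc interfaces that export its contents and
 * per-CPU statistics.
 */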
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)

static int ip_rt_min_delay		= 2 * HZ;
static int ip_rt_max_delay		= 10 * HZ;
static int ip_rt_max_size;
static int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
static int ip_rt_gc_interval		= 60 * HZ;
static int ip_rt_gc_min_interval	= HZ / 2;
static int ip_rt_redirect_number	= 9;
static int ip_rt_redirect_load		= HZ / 50;
static int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost		= HZ;
static int ip_rt_error_burst		= 5 * HZ;
static int ip_rt_gc_elasticity		= 8;
static int ip_rt_mtu_expires		= 10 * 60 * HZ;
static int ip_rt_min_pmtu		= 512 + 20 + 20;
static int ip_rt_min_advmss		= 256;
static int ip_rt_secret_interval	= 10 * 60 * HZ;
static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static void rt_check_expire(struct work_struct *work);
static DECLARE_DELAYED_WORK(expires_work, rt_check_expire);
static struct timer_list rt_secret_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(void);

static struct dst_ops ipv4_dst_ops = {
	.family			= AF_INET,
	.protocol		= __constant_htons(ETH_P_IP),
	.gc			= rt_garbage_collect,
	.check			= ipv4_dst_check,
	.destroy		= ipv4_dst_destroy,
	.ifdown			= ipv4_dst_ifdown,
	.negative_advice	= ipv4_negative_advice,
	.link_failure		= ipv4_link_failure,
	.update_pmtu		= ip_rt_update_pmtu,
	.entry_size		= sizeof(struct rtable),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
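/*
 * The ip_tos2prio[] table above maps the four RFC 1349 TOS bits (the
 * TOS byte masked and shifted down by one, giving 16 possible values)
 * to a packet-scheduler priority band.  The ECN_OR_COST() entries are
 * the odd indices, i.e. TOS values whose lowest TOS bit is set; that
 * bit historically meant "minimize monetary cost" and was later reused
 * by ECN, hence the macro name.  The lookup helper itself lives outside
 * this file (rt_tos2priority() in <net/route.h> in kernels of this
 * vintage).
 */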
/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
# define rt_hash_lock_init()	{ \
		int i; \
		rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
		if (!rt_hash_locks) \
			panic("IP: failed to allocate rt_hash_locks\n"); \
		for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
			spin_lock_init(&rt_hash_locks[i]); \
		}
#else
# define rt_hash_lock_addr(slot) NULL
# define rt_hash_lock_init()
#endif

static struct rt_hash_bucket	*rt_hash_table;
static unsigned			rt_hash_mask;
static unsigned int		rt_hash_log;
static unsigned int		rt_hash_rnd;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static int rt_intern_hash(unsigned hash, struct rtable *rth,
				struct rtable **res);

static unsigned int rt_hash_code(u32 daddr, u32 saddr)
{
	return (jhash_2words(daddr, saddr, rt_hash_rnd)
		& rt_hash_mask);
}

#define rt_hash(daddr, saddr, idx) \
	rt_hash_code((__force u32)(__be32)(daddr),\
		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	int bucket;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rtable *r = NULL;
	struct rt_cache_iter_state *st = seq->private;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
		if (r)
			break;
		rcu_read_unlock_bh();
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		if (--st->bucket < 0)
			break;
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}
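/*
 * Iteration discipline for the helpers above: rt_cache_get_first()
 * walks the buckets from the top of the table and returns with
 * rcu_read_lock_bh() still held once a non-empty chain is found;
 * rt_cache_get_next() drops and re-takes it whenever it has to move to
 * another bucket, and rt_cache_seq_stop() below releases the final
 * hold.  rcu_dereference() is what makes the chain pointers safe to
 * follow while writers concurrently replace them.
 */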
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r = NULL;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		char temp[256];

		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst);
		seq_printf(seq, "%-127s\n", temp);
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_private,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,
		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
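/*
 * Both seq_file interfaces above are registered at init time; in
 * mainline kernels of this era they appear as /proc/net/rt_cache and
 * /proc/net/stat/rt_cache respectively.  The rt_cpu_* callbacks print
 * one row per possible CPU from the per-CPU rt_cache_stat counters
 * (bumped throughout this file via RT_CACHE_STAT_INC()); note that the
 * leading "entries" column reads the global ipv4_dst_ops.entries count
 * and is therefore repeated identically on every row.
 */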
#endif /* CONFIG_PROC_FS */

static __inline__ void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;