/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Version:	$Id: route.c,v 1.102.2.1 2002/01/12 07:43:57 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Splitted to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/config.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT	(300*HZ)

int ip_rt_min_delay		= 2 * HZ;
int ip_rt_max_delay		= 10 * HZ;
int ip_rt_max_size;
int ip_rt_gc_timeout		= RT_GC_TIMEOUT;
int ip_rt_gc_interval		= 60 * HZ;
int ip_rt_gc_min_interval	= 5 * HZ;
int ip_rt_redirect_number	= 9;
int ip_rt_redirect_load		= HZ / 50;
int ip_rt_redirect_silence	= ((HZ / 50) << (9 + 1));
int ip_rt_error_cost		= HZ;
int ip_rt_error_burst		= 5 * HZ;
int ip_rt_gc_elasticity		= 8;
int ip_rt_mtu_expires		= 10 * 60 * HZ;
int ip_rt_min_pmtu		= 512 + 20 + 20;
int ip_rt_min_advmss		= 256;

static unsigned long rt_deadline;

#define RTprint(a...)	printk(KERN_DEBUG a)

static struct timer_list rt_flush_timer;
static struct timer_list rt_periodic_timer;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static struct dst_entry *ipv4_dst_reroute(struct dst_entry *dst,
					  struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static int rt_garbage_collect(void);

struct dst_ops ipv4_dst_ops = {
	family:			AF_INET,
	protocol:		__constant_htons(ETH_P_IP),
	gc:			rt_garbage_collect,
	check:			ipv4_dst_check,
	reroute:		ipv4_dst_reroute,
	destroy:		ipv4_dst_destroy,
	negative_advice:	ipv4_negative_advice,
	link_failure:		ipv4_link_failure,
	entry_size:		sizeof(struct rtable),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

__u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};

/*
 * Route cache.
 */

/* The locking scheme is rather straight forward:
 *
 * 1) A BH protected rwlocks protect buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
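/*
 * Editorial sketch, not part of the original file: the read-side pattern
 * implied by rules 1)-3) above.  A reader takes the per-bucket lock
 * (rt_hash_table and struct rt_hash_bucket are declared just below), walks
 * the collision chain, and bumps the reference count with dst_hold() before
 * releasing the lock, so a writer can never free an entry a reader still
 * holds.  The function name is hypothetical and the key comparison is
 * simplified; the kernel's real lookup paths also match the remaining
 * rt_key fields (iif, oif, fwmark).
 */
#if 0	/* illustration only, not compiled */
static struct rtable *rt_cache_lookup_sketch(unsigned hash,
					     u32 daddr, u32 saddr, u8 tos)
{
	struct rtable *rth;

	read_lock_bh(&rt_hash_table[hash].lock);
	for (rth = rt_hash_table[hash].chain; rth; rth = rth->u.rt_next) {
		if (rth->key.dst == daddr &&
		    rth->key.src == saddr &&
		    rth->key.tos == tos) {
			/* atomic increment while the bucket lock is held */
			dst_hold(&rth->u.dst);
			read_unlock_bh(&rt_hash_table[hash].lock);
			return rth;
		}
	}
	read_unlock_bh(&rt_hash_table[hash].lock);
	return NULL;
}
#endif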
struct rt_hash_bucket {
	struct rtable	*chain;
	rwlock_t	lock;
} __attribute__((__aligned__(8)));

static struct rt_hash_bucket	*rt_hash_table;
static unsigned			rt_hash_mask;
static int			rt_hash_log;

struct rt_cache_stat rt_cache_stat[NR_CPUS];

static int rt_intern_hash(unsigned hash, struct rtable *rth,
			  struct rtable **res);

static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
{
	unsigned hash = ((daddr & 0xF0F0F0F0) >> 4) |
			((daddr & 0x0F0F0F0F) << 4);
	hash ^= saddr ^ tos;
	hash ^= (hash >> 16);
	return (hash ^ (hash >> 8)) & rt_hash_mask;
}
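/*
 * Editorial worked example, not from the original source, of rt_hash_code()
 * above, assuming a hypothetical 256-bucket table (rt_hash_mask == 0xFF):
 *
 *   daddr = 0xC0A80001 (192.168.0.1), saddr = 0x0A000001 (10.0.0.1), tos = 0
 *
 *   nibble swap:  ((daddr & 0xF0F0F0F0) >> 4) = 0x0C0A0000
 *                 ((daddr & 0x0F0F0F0F) << 4) = 0x00800010
 *                 hash                        = 0x0C8A0010
 *   mix in key:   hash ^= saddr ^ tos      ->   0x068A0011
 *   fold high:    hash ^= hash >> 16       ->   0x068A069B
 *   fold + mask:  (hash ^ (hash >> 8)) & 0xFF = 0x9D   (bucket 157)
 *
 * Swapping the nibbles of daddr before the XOR keeps destination and source
 * addresses that share a common prefix (e.g. hosts on one subnet) from
 * cancelling each other out and piling into a few buckets.
 */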
static int rt_cache_get_info(char *buffer, char **start, off_t offset,
			     int length)
{
	int len = 0;
	off_t pos = 128;
	char temp[256];
	struct rtable *r;
	int i;

	if (offset < 128) {
		sprintf(buffer, "%-127s\n",
			"Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			"Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			"HHUptod\tSpecDst");
		len = 128;
	}

	for (i = rt_hash_mask; i >= 0; i--) {
		read_lock_bh(&rt_hash_table[i].lock);
		for (r = rt_hash_table[i].chain; r; r = r->u.rt_next) {
			/*
			 *	Spin through entries until we are ready
			 */
			pos += 128;

			if (pos <= offset) {
				len = 0;
				continue;
			}
			sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
				"%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
				r->u.dst.dev ? r->u.dst.dev->name : "*",
				(unsigned long)r->rt_dst,
				(unsigned long)r->rt_gateway,
				r->rt_flags,
				atomic_read(&r->u.dst.__refcnt),
				r->u.dst.__use,
				0,
				(unsigned long)r->rt_src,
				(int)r->u.dst.advmss + 40,
				r->u.dst.window,
				(int)((r->u.dst.rtt >> 3) + r->u.dst.rttvar),
				r->key.tos,
				r->u.dst.hh ?
					atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
				r->u.dst.hh ?
					(r->u.dst.hh->hh_output ==
					 dev_queue_xmit) : 0,
				r->rt_spec_dst);
			sprintf(buffer + len, "%-127s\n", temp);
			len += 128;
			if (pos >= offset + length) {
				read_unlock_bh(&rt_hash_table[i].lock);
				goto done;
			}
		}
		read_unlock_bh(&rt_hash_table[i].lock);
	}

done:
	*start = buffer + len - (pos - offset);
	len = pos - offset;
	if (len > length)
		len = length;
	return len;
}

static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset,
				  int length)
{
	unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);
	int i, lcpu;
	int len = 0;

	for (lcpu = 0; lcpu < smp_num_cpus; lcpu++) {
		i = cpu_logical_map(lcpu);

		len += sprintf(buffer + len,
			       "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
			       dst_entries,
			       rt_cache_stat[i].in_hit,
			       rt_cache_stat[i].in_slow_tot,
			       rt_cache_stat[i].in_slow_mc,
			       rt_cache_stat[i].in_no_route,
			       rt_cache_stat[i].in_brd,
			       rt_cache_stat[i].in_martian_dst,
			       rt_cache_stat[i].in_martian_src,
			       rt_cache_stat[i].out_hit,
			       rt_cache_stat[i].out_slow_tot,
			       rt_cache_stat[i].out_slow_mc
			);
	}
	len -= offset;

	if (len > length)
		len = length;
	if (len < 0)
		len = 0;

	*start = buffer + offset;

	return len;
}

static __inline__ void rt_free(struct rtable *rt)
{
	dst_free(&rt->u.dst);
}

static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	dst_free(&rt->u.dst);
}

static __inline__ int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->key.iif && rth->u.rt_next;
}

static __inline__ int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static __inline__ int rt_may_expire(struct rtable *rth, int tmo1, int tmo2)
{
	int age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    (long)(rth->u.dst.expires - jiffies) <= 0)
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:
	return ret;
}

/* This runs via a timer and thus is always in BH context. */
static void SMP_TIMER_NAME(rt_check_expire)(unsigned long dummy)
{
	static int rover;
	int i = rover, t;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;

	for (t = ip_rt_gc_interval << rt_hash_log; t >= 0;
	     t -= ip_rt_gc_timeout) {
		unsigned tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		write_lock(&rt_hash_table[i].lock);
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if ((long)(now - rth->u.dst.expires) <= 0) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.rt_next;
			rt_free(rth);
		}
		write_unlock(&rt_hash_table[i].lock);

		/* Fallback loop breaker. */
		if ((jiffies - now) > 0)
			break;
	}
	rover = i;
	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}

SMP_TIMER_DEFINE(rt_check_expire, rt_gc_task);

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void SMP_TIMER_NAME(rt_run_flush)(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	for (i = rt_hash_mask; i >= 0; i--) {
		write_lock_bh(&rt_hash_table[i].lock);
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		write_unlock_bh(&rt_hash_table[i].lock);

		for (; rth; rth = next) {
			next = rth->u.rt_next;
			rt_free(rth);
		}
	}
}

SMP_TIMER_DEFINE(rt_run_flush, rt_cache_flush_task);

static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If flush timer is already running
		   and flush request is not immediate (delay > 0):

		   if deadline is not achieved, prolongate timer to "delay",
		   otherwise fire it at deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay - ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		SMP_TIMER_NAME(rt_run_flush)(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now + delay);
	spin_unlock_bh(&rt_flush_lock);
}

/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

static int rt_garbage_collect(void)
{
	static unsigned expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */
	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;