/*
 * route.c — IPv4 routing cache (excerpt).
 *
 * Captured from a Linux kernel source viewer; the complete file is
 * 2,594 lines and this is page 1 of 5.
 */
	/* NOTE(review): tail of rt_may_expire() — the start of this function
	 * lies above this excerpt.  Decides whether a cache entry may be
	 * aged out: entries younger than tmo1 that are not "fast clean",
	 * or younger than tmo2 and valuable, are kept.
	 */
	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	/* Invert the age so that recently used entries score higher;
	 * keep the two top bits clear for the flags below.
	 */
	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	/* Output routes (no input interface) and unicast input routes
	 * count as "not quite useless".
	 */
	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

/* Return 1 when two flow keys address the same cache entry.  Compares
 * daddr, saddr, mark, oif and iif; the single 16-bit load covers the
 * tos byte plus the byte adjacent to it in struct flowi (presumably
 * the protocol/scope field — confirm against the flowi layout).
 */
static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

/* Delayed-work handler: scan a slice of the route hash table, freeing
 * entries whose hard expiry (u.dst.expires) has passed or which
 * rt_may_expire() deems aged out, then re-arm the work.  The static
 * "rover" remembers where the previous pass stopped, so successive
 * runs eventually cover the whole table.
 */
static void rt_check_expire(struct work_struct *work)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, **rthp;
	u64 mult;

	/* Number of buckets to scan this pass, scaled so the whole table
	 * is covered roughly once per ip_rt_gc_timeout.
	 */
	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		if (*rthp == NULL)
			continue;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
					/* Halve the soft timeout for each kept
					 * entry — pressure to shorten chains.
					 */
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.dst.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
	}
	rover = i;
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 *
 * Re-randomizes the hash secret, then detaches each bucket's chain
 * under its lock and frees the entries outside the lock.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(rt_hash_lock_addr(i));
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

static DEFINE_SPINLOCK(rt_flush_lock);

/* Request a flush of the whole routing cache.
 * delay < 0  — use the default ip_rt_min_delay;
 * delay == 0 — flush synchronously, right now;
 * delay > 0  — arm the flush timer, clamped so a flush never slips
 *              past the rt_deadline set by the first pending request.
 */
void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If flush timer is already running
		   and flush request is not immediate (delay > 0):

		   if deadline is not achieved, prolongate timer to "delay",
		   otherwise fire it at deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}

/* Timer handler: flush the cache (which re-randomizes rt_hash_rnd in
 * rt_run_flush) and re-arm, so the hash secret is rebuilt every
 * ip_rt_secret_interval.
 */
static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;
			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					/* Kept entry: halve the soft timeout so
					 * later entries in the chain expire
					 * more easily.
					 */
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved.

		   We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halfed.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		/* NOTE(review): "expire" is unsigned long but printed with
		 * %u — debug-only code, but %lu would be the correct
		 * specifier.  Also, "i" here is the (possibly -1) leftover
		 * loop counter.
		 */
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	/* Relax the expiration strength again after a successful pass. */
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	/* NOTE(review): same %u-vs-unsigned-long mismatch as above. */
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

/* Insert "rt" into the hash chain for "hash".  If an entry with
 * identical flow keys already exists it is moved to the front of the
 * chain and reused (rt is dropped); otherwise rt is linked in, after
 * optionally evicting the lowest-score unreferenced candidate and
 * binding the route to a neighbour entry.  On success *rp points at
 * the cached entry and 0 is returned; on neighbour-table exhaustion
 * -ENOBUFS (or another arp_bind_neighbour() error) is returned.
 */
static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (compare_keys(&rth->fl, &rt->fl)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
*/
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			/* The new route is redundant — hand back the cached one. */
			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		/* Track the lowest-score unreferenced entry as an eviction
		 * candidate in case the chain grows too long.
		 */
		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				/* Force one maximally aggressive GC pass,
				 * then retry the insert (process context only:
				 * attempts starts at !in_softirq()).
				 */
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(rt_hash_lock_addr(hash));
	*rp = rt;
	return 0;
}

/* Attach an inet_peer for rt->rt_dst to the route, racing safely with
 * other CPUs: only the first writer wins; a losing peer reference is
 * released again.
 */
void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

/* Fill in iph->id: prefer the per-destination inet_peer counter bound
 * to the route (binding it on first use), falling back to the global
 * salted counter in ip_select_fb_ident() when no peer is available.
 */
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
*/
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

/* Unlink one specific entry from its hash chain and free it.  The
 * caller's reference is dropped (ip_rt_put) under the bucket lock.
 */
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.dst.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.dst.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

/* Handle an ICMP redirect: switch routes for daddr/saddr on dev from
 * old_gw to new_gw, after sanity-checking the new gateway.
 * NOTE(review): this function is truncated at the end of the excerpt —
 * the remainder (and the reject_redirect label) is on the next page.
 */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;

	if (!in_dev)
		return;

	/* Reject redirects to a multicast, reserved-class or zero-net
	 * gateway, or when redirects are administratively disabled.
	 */
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))

/*
 * (End of excerpt — ip_rt_redirect() continues on the next page.
 * Code-viewer keyboard-shortcut help removed from the capture:
 * copy Ctrl+C, search Ctrl+F, fullscreen F11, larger font Ctrl+=,
 * smaller font Ctrl+-, show shortcuts "?".)
 */