📄 route.c
		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		spin_lock(&rt_hash_table[i].lock);
		while ((rth = *rthp) != NULL) {
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(now, rth->u.dst.expires)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
				tmo >>= 1;
				rthp = &rth->u.rt_next;
				continue;
			}

			/* Cleanup aged off entries. */
			*rthp = rth->u.rt_next;
			rt_free(rth);
		}
		spin_unlock(&rt_hash_table[i].lock);

		/* Fallback loop breaker. */
		if (time_after(jiffies, now))
			break;
	}
	rover = i;
	mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval);
}

/* This can run from both BH and non-BH contexts, the latter
 * in the case of a forced flush event.
 */
static void rt_run_flush(unsigned long dummy)
{
	int i;
	struct rtable *rth, *next;

	rt_deadline = 0;

	get_random_bytes(&rt_hash_rnd, 4);

	for (i = rt_hash_mask; i >= 0; i--) {
		spin_lock_bh(&rt_hash_table[i].lock);
		rth = rt_hash_table[i].chain;
		if (rth)
			rt_hash_table[i].chain = NULL;
		spin_unlock_bh(&rt_hash_table[i].lock);

		for (; rth; rth = next) {
			next = rth->u.rt_next;
			rt_free(rth);
		}
	}
}

static spinlock_t rt_flush_lock = SPIN_LOCK_UNLOCKED;

void rt_cache_flush(int delay)
{
	unsigned long now = jiffies;
	int user_mode = !in_softirq();

	if (delay < 0)
		delay = ip_rt_min_delay;

	spin_lock_bh(&rt_flush_lock);

	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
		long tmo = (long)(rt_deadline - now);

		/* If flush timer is already running
		   and flush request is not immediate (delay > 0):

		   if deadline is not achieved, prolongate timer to "delay",
		   otherwise fire it at deadline time.
		 */

		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
			tmo = 0;

		if (delay > tmo)
			delay = tmo;
	}

	if (delay <= 0) {
		spin_unlock_bh(&rt_flush_lock);
		rt_run_flush(0);
		return;
	}

	if (rt_deadline == 0)
		rt_deadline = now + ip_rt_max_delay;

	mod_timer(&rt_flush_timer, now+delay);
	spin_unlock_bh(&rt_flush_lock);
}

static void rt_secret_rebuild(unsigned long dummy)
{
	unsigned long now = jiffies;

	rt_cache_flush(0);
	mod_timer(&rt_secret_timer, now + ip_rt_secret_interval);
}

/*
   Short description of GC goals.

   We want to build algorithm, which will keep routing cache
   at some equilibrium point, when number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expires is large enough to keep enough of warm entries,
   and when load increases it reduces to limit cache size.
 */

static int rt_garbage_collect(void)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
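		/*
		 * Worked example with hypothetical numbers (not taken from
		 * this source): with 4096 cached entries, rt_hash_log = 8
		 * (so rt_hash_mask = 255) and ip_rt_gc_elasticity = 8, the
		 * calculation above gives goal = 4096 - (8 << 8) = 2048.
		 * This branch then picks goal = max(2048 / 2, 256) = 1024
		 * and equilibrium = 4096 - 1024 = 3072, i.e. roughly a
		 * quarter of the cache is scheduled for eviction in one pass.
		 */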
		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(&rt_hash_table[k].lock);
			while ((rth = *rthp) != NULL) {
				if (!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.rt_next;
					continue;
				}
				*rthp = rth->u.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(&rt_hash_table[k].lock);
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - if expire reduced to zero. Otherwise, expire is halfed.
		   - if table is not full.
		   - if we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 &&
	       fl1->oif == fl2->oif &&
	       fl1->iif == fl2->iif;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
{
	struct rtable *rth, **rthp;
	unsigned long now;
	struct rtable *cand, **candp;
	u32 min_score;
	int chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(&rt_hash_table[hash].lock);
	while ((rth = *rthp) != NULL) {
		if (compare_keys(&rth->fl, &rt->fl)) {
			/* Put it first */
			*rthp = rth->u.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			smp_wmb();
			rth->u.rt_next = rt_hash_table[hash].chain;
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			smp_wmb();
			rt_hash_table[hash].chain = rth;

			rth->u.dst.__use++;
			dst_hold(&rth->u.dst);
			rth->u.dst.lastuse = now;
			spin_unlock_bh(&rt_hash_table[hash].lock);

			rt_drop(rt);
			*rp = rth;
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.rt_next;
			rt_free(cand);
		}
	}
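	/*
	 * Illustration of the eviction above (hypothetical sysctl value,
	 * not from this source): with ip_rt_gc_elasticity = 8, a chain
	 * just walked with chain_length = 10 exceeds the limit, so the
	 * single unreferenced entry with the lowest rt_score() seen during
	 * the walk (remembered in cand/candp) is unlinked and freed.
	 * Entries still holding a reference (__refcnt != 0) are never
	 * evicted on this path.
	 */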
	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(&rt_hash_table[hash].lock);

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect();
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.rt_next = rt_hash_table[hash].chain;
#if RT_CACHE_DEBUG >= 2
	if (rt->u.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
		       NIPQUAD(rt->rt_dst));
		for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next)
			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
		printk("\n");
	}
#endif
	rt_hash_table[hash].chain = rt;
	spin_unlock_bh(&rt_hash_table[hash].lock);
	*rp = rt;
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static spinlock_t rt_peer_lock = SPIN_LOCK_UNLOCKED;
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static spinlock_t ip_fb_id_lock = SPIN_LOCK_UNLOCKED;
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
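/*
 * Note on the fallback above (illustrative, hypothetical value): only the
 * low 16 bits of the hashed salt become the datagram ID, e.g. a salt of
 * 0x9e3779b1 yields iph->id = htons(0x79b1).  __ip_select_ident() below
 * prefers the per-destination inet_peer counter via inet_getid() and uses
 * this fallback only when no peer could be bound; callers typically reach
 * it through the ip_select_ident() helper in include/net/ip.h (an
 * assumption about the surrounding headers of this kernel generation).
 */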
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp;

	spin_lock_bh(&rt_hash_table[hash].lock);
	ip_rt_put(rt);
	for (rthp = &rt_hash_table[hash].chain; *rthp;
	     rthp = &(*rthp)->u.rt_next)
		if (*rthp == rt) {
			*rthp = rt->u.rt_next;
			rt_free(rt);
			break;
		}
	spin_unlock_bh(&rt_hash_table[hash].lock);
}

void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
		    u32 saddr, u8 tos, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	u32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };

	tos &= IPTOS_RT_MASK;

	if (!in_dev)
		return;

	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) &&
		    ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash_code(daddr,
						     skeys[i] ^ (ikeys[k] << 5),
						     tos);

			rthp = &rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.fl4_tos != tos ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0) {
					rthp = &rth->u.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
				rt->u.dst.__use = 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child = NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);