📄 fib_trie.c
字号:
/* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet * & Swedish University of Agricultural Sciences. * * Jens Laas <jens.laas@data.slu.se> Swedish University of * Agricultural Sciences. * * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet * * This work is based on the LPC-trie which is originally descibed in: * * An experimental study of compression methods for dynamic tries * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ * * * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 * * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $ * * * Code from fib_hash has been reused which includes the following header: * * * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * IPv4 FIB: lookup engine and maintenance routines. * * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Substantial contributions to this work comes from: * * David S. Miller, <davem@davemloft.net> * Stephen Hemminger <shemminger@osdl.org> * Paul E. McKenney <paulmck@us.ibm.com> * Patrick McHardy <kaber@trash.net> */#define VERSION "0.408"#include <asm/uaccess.h>#include <asm/system.h>#include <linux/bitops.h>#include <linux/types.h>#include <linux/kernel.h>#include <linux/mm.h>#include <linux/string.h>#include <linux/socket.h>#include <linux/sockios.h>#include <linux/errno.h>#include <linux/in.h>#include <linux/inet.h>#include <linux/inetdevice.h>#include <linux/netdevice.h>#include <linux/if_arp.h>#include <linux/proc_fs.h>#include <linux/rcupdate.h>#include <linux/skbuff.h>#include <linux/netlink.h>#include <linux/init.h>#include <linux/list.h>#include <net/net_namespace.h>#include <net/ip.h>#include <net/protocol.h>#include <net/route.h>#include <net/tcp.h>#include <net/sock.h>#include <net/ip_fib.h>#include "fib_lookup.h"#undef CONFIG_IP_FIB_TRIE_STATS#define MAX_STAT_DEPTH 32#define KEYLENGTH (8*sizeof(t_key))typedef unsigned int t_key;#define T_TNODE 0#define T_LEAF 1#define NODE_TYPE_MASK 0x1UL#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)#define IS_TNODE(n) (!(n->parent & T_LEAF))#define IS_LEAF(n) (n->parent & T_LEAF)struct node { t_key key; unsigned long parent;};struct leaf { t_key key; unsigned long parent; struct hlist_head list; struct rcu_head rcu;};struct leaf_info { struct hlist_node hlist; struct rcu_head rcu; int plen; struct list_head falh;};struct tnode { t_key key; unsigned long parent; unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */ unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */ unsigned short full_children; /* KEYLENGTH bits needed */ unsigned short empty_children; /* KEYLENGTH bits needed */ struct rcu_head rcu; struct node *child[0];};#ifdef CONFIG_IP_FIB_TRIE_STATSstruct trie_use_stats { unsigned int gets; unsigned int backtrack; unsigned int semantic_match_passed; unsigned int semantic_match_miss; unsigned int null_node_hit; unsigned int resize_node_skipped;};#endifstruct trie_stat { unsigned int totdepth; unsigned int maxdepth; unsigned int tnodes; unsigned int leaves; unsigned int nullpointers; unsigned int nodesizes[MAX_STAT_DEPTH];};struct trie { struct node *trie;#ifdef CONFIG_IP_FIB_TRIE_STATS struct trie_use_stats stats;#endif int size; unsigned int revision;};static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);static struct node *resize(struct trie *t, struct tnode *tn);static struct tnode *inflate(struct trie *t, struct tnode *tn);static struct tnode *halve(struct trie *t, struct tnode *tn);static void tnode_free(struct tnode *tn);static struct kmem_cache *fn_alias_kmem __read_mostly;static struct trie *trie_local = NULL, *trie_main = NULL;static inline struct tnode *node_parent(struct node *node){ struct tnode *ret; ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK); return rcu_dereference(ret);}static inline void node_set_parent(struct node *node, struct tnode *ptr){ rcu_assign_pointer(node->parent, (unsigned long)ptr | NODE_TYPE(node));}/* rcu_read_lock needs to be hold by caller from readside */static inline struct node *tnode_get_child(struct tnode *tn, int i){ BUG_ON(i >= 1 << tn->bits); return rcu_dereference(tn->child[i]);}static inline int tnode_child_length(const struct tnode *tn){ return 1 << tn->bits;}static inline t_key mask_pfx(t_key k, unsigned short l){ return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);}static inline t_key tkey_extract_bits(t_key a, int offset, int bits){ if (offset < KEYLENGTH) return ((t_key)(a << offset)) >> (KEYLENGTH - bits); else return 0;}static inline int tkey_equals(t_key a, t_key b){ return a == b;}static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b){ if (bits == 0 || offset >= KEYLENGTH) return 1; bits = bits > KEYLENGTH ? KEYLENGTH : bits; return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;}static inline int tkey_mismatch(t_key a, int offset, t_key b){ t_key diff = a ^ b; int i = offset; if (!diff) return 0; while ((diff << i) >> (KEYLENGTH-1) == 0) i++; return i;}/* To understand this stuff, an understanding of keys and all their bits is necessary. Every node in the trie has a key associated with it, but not all of the bits in that key are significant. Consider a node 'n' and its parent 'tp'. If n is a leaf, every bit in its key is significant. Its presence is necessitated by path compression, since during a tree traversal (when searching for a leaf - unless we are doing an insertion) we will completely ignore all skipped bits we encounter. Thus we need to verify, at the end of a potentially successful search, that we have indeed been walking the correct key path. Note that we can never "miss" the correct key in the tree if present by following the wrong path. Path compression ensures that segments of the key that are the same for all keys with a given prefix are skipped, but the skipped part *is* identical for each node in the subtrie below the skipped bit! trie_insert() in this implementation takes care of that - note the call to tkey_sub_equals() in trie_insert(). if n is an internal node - a 'tnode' here, the various parts of its key have many different meanings. Example: _________________________________________________________________ | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C | ----------------------------------------------------------------- 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 _________________________________________________________________ | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u | ----------------------------------------------------------------- 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 tp->pos = 7 tp->bits = 3 n->pos = 15 n->bits = 4 First, let's just ignore the bits that come before the parent tp, that is the bits from 0 to (tp->pos-1). They are *known* but at this point we do not use them for anything. The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the index into the parent's child array. That is, they will be used to find 'n' among tp's children. The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits for the node n. All the bits we have seen so far are significant to the node n. The rest of the bits are really not needed or indeed known in n->key. The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into n's child array, and will of course be different for each child. The rest of the bits, from (n->pos + n->bits) onward, are completely unknown at this point.*/static inline void check_tnode(const struct tnode *tn){ WARN_ON(tn && tn->pos+tn->bits > 32);}static int halve_threshold = 25;static int inflate_threshold = 50;static int halve_threshold_root = 8;static int inflate_threshold_root = 15;static void __alias_free_mem(struct rcu_head *head){ struct fib_alias *fa = container_of(head, struct fib_alias, rcu); kmem_cache_free(fn_alias_kmem, fa);}static inline void alias_free_mem_rcu(struct fib_alias *fa){ call_rcu(&fa->rcu, __alias_free_mem);}static void __leaf_free_rcu(struct rcu_head *head){ kfree(container_of(head, struct leaf, rcu));}static void __leaf_info_free_rcu(struct rcu_head *head){ kfree(container_of(head, struct leaf_info, rcu));}static inline void free_leaf_info(struct leaf_info *leaf){ call_rcu(&leaf->rcu, __leaf_info_free_rcu);}static struct tnode *tnode_alloc(unsigned int size){ struct page *pages; if (size <= PAGE_SIZE) return kcalloc(size, 1, GFP_KERNEL); pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size)); if (!pages) return NULL; return page_address(pages);}static void __tnode_free_rcu(struct rcu_head *head){ struct tnode *tn = container_of(head, struct tnode, rcu); unsigned int size = sizeof(struct tnode) + (1 << tn->bits) * sizeof(struct node *); if (size <= PAGE_SIZE) kfree(tn); else free_pages((unsigned long)tn, get_order(size));}static inline void tnode_free(struct tnode *tn){ if (IS_LEAF(tn)) { struct leaf *l = (struct leaf *) tn; call_rcu_bh(&l->rcu, __leaf_free_rcu); } else call_rcu(&tn->rcu, __tnode_free_rcu);}static struct leaf *leaf_new(void){ struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL); if (l) { l->parent = T_LEAF; INIT_HLIST_HEAD(&l->list); } return l;}static struct leaf_info *leaf_info_new(int plen){ struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL); if (li) { li->plen = plen; INIT_LIST_HEAD(&li->falh); } return li;}static struct tnode* tnode_new(t_key key, int pos, int bits){ int nchildren = 1<<bits; int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *); struct tnode *tn = tnode_alloc(sz); if (tn) { memset(tn, 0, sz); tn->parent = T_TNODE; tn->pos = pos; tn->bits = bits; tn->key = key; tn->full_children = 0; tn->empty_children = 1<<bits; } pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode), (unsigned int) (sizeof(struct node) * 1<<bits)); return tn;}/* * Check whether a tnode 'n' is "full", i.e. it is an internal node * and no bits are skipped. See discussion in dyntree paper p. 6 */static inline int tnode_full(const struct tnode *tn, const struct node *n){ if (n == NULL || IS_LEAF(n)) return 0; return ((struct tnode *) n)->pos == tn->pos + tn->bits;}static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n){ tnode_put_child_reorg(tn, i, n, -1);} /* * Add a child at position i overwriting the old value. * Update the value of full_children and empty_children. */static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull){ struct node *chi = tn->child[i]; int isfull; BUG_ON(i >= 1<<tn->bits); /* update emptyChildren */ if (n == NULL && chi != NULL) tn->empty_children++; else if (n != NULL && chi == NULL) tn->empty_children--; /* update fullChildren */ if (wasfull == -1) wasfull = tnode_full(tn, chi); isfull = tnode_full(tn, n); if (wasfull && !isfull) tn->full_children--; else if (!wasfull && isfull) tn->full_children++; if (n) node_set_parent(n, tn); rcu_assign_pointer(tn->child[i], n);}static struct node *resize(struct trie *t, struct tnode *tn){ int i; int err = 0; struct tnode *old_tn; int inflate_threshold_use; int halve_threshold_use; int max_resize; if (!tn) return NULL; pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n", tn, inflate_threshold, halve_threshold); /* No children */ if (tn->empty_children == tnode_child_length(tn)) { tnode_free(tn); return NULL; } /* One child */ if (tn->empty_children == tnode_child_length(tn) - 1) for (i = 0; i < tnode_child_length(tn); i++) { struct node *n; n = tn->child[i]; if (!n) continue; /* compress one level */ node_set_parent(n, NULL); tnode_free(tn); return n; } /* * Double as long as the resulting node has a number of * nonempty nodes that are above the threshold. */ /* * From "Implementing a dynamic compressed trie" by Stefan Nilsson of * the Helsinki University of Technology and Matti Tikkanen of Nokia * Telecommunications, page 6: * "A node is doubled if the ratio of non-empty children to all * children in the *doubled* node is at least 'high'." * * 'high' in this instance is the variable 'inflate_threshold'. It * is expressed as a percentage, so we multiply it with * tnode_child_length() and instead of multiplying by 2 (since the * child array will be doubled by inflate()) and multiplying * the left-hand side by 100 (to handle the percentage thing) we * multiply the left-hand side by 50. * * The left-hand side may look a bit weird: tnode_child_length(tn) * - tn->empty_children is of course the number of non-null children * in the current node. tn->full_children is the number of "full" * children, that is non-null tnodes with a skip value of 0. * All of those will be doubled in the resulting inflated tnode, so * we just count them one extra time here. * * A clearer way to write this would be: * * to_be_doubled = tn->full_children; * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children - * tn->full_children; * * new_child_length = tnode_child_length(tn) * 2; * * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) / * new_child_length; * if (new_fill_factor >= inflate_threshold) * * ...and so on, tho it would mess up the while () loop. * * anyway, * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >= * inflate_threshold * * avoid a division: * 100 * (not_to_be_doubled + 2*to_be_doubled) >= * inflate_threshold * new_child_length * * expand not_to_be_doubled and to_be_doubled, and shorten: * 100 * (tnode_child_length(tn) - tn->empty_children + * tn->full_children) >= inflate_threshold * new_child_length * * expand new_child_length: * 100 * (tnode_child_length(tn) - tn->empty_children + * tn->full_children) >= * inflate_threshold * tnode_child_length(tn) * 2 * * shorten again: * 50 * (tn->full_children + tnode_child_length(tn) - * tn->empty_children) >= inflate_threshold * * tnode_child_length(tn) * */ check_tnode(tn); /* Keep root node larger */ if (!tn->parent) inflate_threshold_use = inflate_threshold_root; else inflate_threshold_use = inflate_threshold; err = 0; max_resize = 10; while ((tn->full_children > 0 && max_resize-- && 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >= inflate_threshold_use * tnode_child_length(tn))) { old_tn = tn; tn = inflate(t, tn); if (IS_ERR(tn)) { tn = old_tn;#ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.resize_node_skipped++;#endif break; } } if (max_resize < 0) { if (!tn->parent) printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n", inflate_threshold_root, tn->bits); else printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n", inflate_threshold, tn->bits); } check_tnode(tn); /* * Halve as long as the number of empty children in this * node is above threshold. */ /* Keep root node larger */ if (!tn->parent) halve_threshold_use = halve_threshold_root; else halve_threshold_use = halve_threshold; err = 0; max_resize = 10; while (tn->bits > 1 && max_resize-- && 100 * (tnode_child_length(tn) - tn->empty_children) < halve_threshold_use * tnode_child_length(tn)) { old_tn = tn; tn = halve(t, tn); if (IS_ERR(tn)) { tn = old_tn;#ifdef CONFIG_IP_FIB_TRIE_STATS t->stats.resize_node_skipped++;#endif break; } } if (max_resize < 0) { if (!tn->parent) printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n", halve_threshold_root, tn->bits); else
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -