fastfind.c

来自「一个很有名的浏览器」· C语言 代码 · 共 795 行 · 第 1/2 页

C
795
字号
/* Very fast search_keyword_in_list. *//* $Id: fastfind.c,v 1.72.4.1 2005/04/05 21:08:43 jonas Exp $ */#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <stdio.h>#include <stdlib.h>#include "elinks.h"#include "util/conv.h"#include "util/error.h"#include "util/fastfind.h"#include "util/memdebug.h"#include "util/memory.h"#ifdef USE_FASTFIND/* It replaces bsearch() + strcasecmp() + callback + ... * * Following conditions should be met: * * - list keys are C strings. * - keys should not be greater than 255 characters, and optimally < 20 *   characters. It can work with greater keys but then memory usage will *   grow a lot. * - each key must be unique and non empty. * - list do not have to be ordered. * - total number of unique characters used in all keys should be <= 128 * - idealy total number of keys should be <= 512 (but see below) * *  (c) 2003 Laurent MONIN (aka Zas) * Feel free to do whatever you want with that code. *//* These routines use a tree search. First, a big tree is composed from the * keys on input. Then, when searching we just go through the tree. If we will * end up on an 'ending' node, we've got it. * * Hm, okay. For keys { 'head', 'h1', 'body', 'bodyrock', 'bodyground' }, it * would look like: * *             [root] *          b          h *          o        e   1 *          d        a *          Y        D *        g   r *        r   o *        o   c *        u   K *        D * * (the ending nodes are upcased just for this drawing, not in real) * * To optimize this for speed, leafs of nodes are organized in per-node arrays * (so-called 'leafsets'), indexed by symbol value of the key's next character. * But to optimize that for memory, we first compose own alphabet consisting * only from the chars we ever use in the key strings. @uniq_chars holds that * alphabet and @idxtab is used to translate between it and ASCII. * * Tree building: O((L+M)*N) * 			(L: mean key length, M: alphabet size, * 			 N: number of items). * String lookup: O(N) (N: string length). *//* Define it to generate performance and memory usage statistics to stderr. */#if 0#define DEBUG_FASTFIND#endif/* Define whether to use 32 or 64 bits per compressed element. */#if 1#define USE_32_BITS#endif#define END_LEAF_BITS		1#define COMPRESSED_BITS		1#ifdef USE_32_BITS/* Use only 32 bits per element, but has very low limits. *//* Adequate for ELinks tags search. */#define POINTER_INDEX_BITS	10	/* 1024 */#define LEAFSET_INDEX_BITS	13	/* 8192 */#define COMP_CHAR_INDEX_BITS	7	/* 128	*/#define ff_node ff_node_c /* Both are 32 bits long. */#if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \     COMP_CHAR_INDEX_BITS + END_LEAF_BITS + \     COMPRESSED_BITS) > 32#error Over 32 bits in struct ff_node !!#endif#else /* !USE_32_BITS *//* Keep this one if there is more than 512 keywords in a list * it eats a bit more memory. * ELinks may need this one if fastfind is used in other * things than tags searching. *//* This will make struct ff_node_c use 64 bits. */#define POINTER_INDEX_BITS	12#define LEAFSET_INDEX_BITS	18#define COMP_CHAR_INDEX_BITS	8#if (POINTER_INDEX_BITS + LEAFSET_INDEX_BITS + \     + END_LEAF_BITS + COMPRESSED_BITS) > 32#error Over 32 bits in struct ff_node !!#endifstruct ff_node {	/* End leaf -> p is significant */	unsigned int e:END_LEAF_BITS;	/* Compressed */	unsigned int c:COMPRESSED_BITS;	/* Index in pointers */	unsigned int p:POINTER_INDEX_BITS;	/* Index in leafsets */	unsigned int l:LEAFSET_INDEX_BITS;};#endif /* USE_32_BITS */#define FF_MAX_KEYS	(1  << POINTER_INDEX_BITS)#define FF_MAX_LEAFSETS ((1 << LEAFSET_INDEX_BITS) - 1)#define FF_MAX_CHARS	(1  << COMP_CHAR_INDEX_BITS)#define FF_MAX_KEYLEN	255struct ff_node_c {	unsigned int e:END_LEAF_BITS;	unsigned int c:COMPRESSED_BITS;	unsigned int p:POINTER_INDEX_BITS;	unsigned int l:LEAFSET_INDEX_BITS;	/* Index of char when compressed. */	unsigned int ch:COMP_CHAR_INDEX_BITS;};struct ff_data {	void *pointer;	int keylen;};struct fastfind_info {	struct ff_data *data;	struct ff_node **leafsets;	struct ff_node *root_leafset;	int min_key_len;	int max_key_len;	int uniq_chars_count;	int count;	int pointers_count;	int leafsets_count;	unsigned int case_aware:1;	unsigned int compress:1;	int idxtab[FF_MAX_CHARS];	unsigned char uniq_chars[FF_MAX_CHARS];#ifdef DEBUG_FASTFIND	struct {		unsigned long searches;		unsigned long found;		unsigned long itertmp;		unsigned long iterdelta;		unsigned long itermax;		unsigned long iterations;		unsigned long tests;		unsigned long teststmp;		unsigned long testsdelta;		unsigned long testsmax;		unsigned long memory_usage;		unsigned long total_key_len;		unsigned int  compressed_nodes;		unsigned char *comment;	} debug;#endif};#ifdef DEBUG_FASTFIND/* These are for performance testing. */#define FF_DBG_mem(x, size) (x)->debug.memory_usage += (size)#define FF_DBG_test(x) (x)->debug.tests++#define FF_DBG_iter(x) (x)->debug.iterations++#define FF_DBG_cnode(x) (x)->debug.compressed_nodes++#define FF_DBG_found(x) \	do { \		unsigned long iter = (x)->debug.iterations - (x)->debug.itertmp;	\		unsigned long tests = (x)->debug.tests - (x)->debug.teststmp;		\											\		(x)->debug.iterdelta += iter;						\		(x)->debug.testsdelta += tests;						\		if (iter > (x)->debug.itermax)						\			(x)->debug.itermax = iter;					\		if (tests > (x)->debug.testsmax)					\			(x)->debug.testsmax = tests;					\		(x)->debug.found++;							\	} while (0)#define FF_DBG_comment(x, str) do { (x)->debug.comment = empty_string_or_(str); } while (0)/* Update search stats. */static voidFF_DBG_search_stats(struct fastfind_info *info, int key_len){	info->debug.searches++;	info->debug.total_key_len += key_len;	info->debug.teststmp = info->debug.tests;	info->debug.itertmp = info->debug.iterations;}/* Dump all stats. */static voidFF_DBG_dump_stats(struct fastfind_info *info){	fprintf(stderr, "------ FastFind Statistics ------\n");	fprintf(stderr, "Comment     : %s\n", info->debug.comment);	fprintf(stderr, "Case-aware  : %s\n", info->case_aware ? "yes" : "no");	fprintf(stderr, "Compress    : %s\n", info->compress ? "yes" : "no");	fprintf(stderr, "Uniq_chars  : %s\n", info->uniq_chars);	fprintf(stderr, "Uniq_chars #: %d/%d max.\n", info->uniq_chars_count, FF_MAX_CHARS);	fprintf(stderr, "Min_key_len : %d\n", info->min_key_len);	fprintf(stderr, "Max_key_len : %d\n", info->max_key_len);	fprintf(stderr, "Entries     : %d/%d max.\n", info->pointers_count, FF_MAX_KEYS);	fprintf(stderr, "Leafsets    : %d/%d max.\n", info->leafsets_count, FF_MAX_LEAFSETS);	if (info->compress)		fprintf(stderr, "C. leafsets : %d/%d (%0.2f%%)\n",			info->debug.compressed_nodes,			info->leafsets_count,			100 * (double) info->debug.compressed_nodes / info->leafsets_count);	fprintf(stderr, "Memory usage: %lu bytes (cost per entry = %0.2f bytes)\n",		info->debug.memory_usage, (double) info->debug.memory_usage / info->pointers_count);	fprintf(stderr, "Struct info : %d bytes\n", sizeof(*info) - sizeof(info->debug));	fprintf(stderr, "Struct node : %d bytes\n", sizeof(struct ff_node));	fprintf(stderr, "Struct cnode: %d bytes\n", sizeof(struct ff_node_c));	fprintf(stderr, "Searches    : %lu\n", info->debug.searches);	fprintf(stderr, "Found       : %lu (%0.2f%%)\n",		info->debug.found, 100 * (double) info->debug.found / info->debug.searches);	fprintf(stderr, "Iterations  : %lu (%0.2f per search, %0.2f before found, %lu max)\n",		info->debug.iterations, (double) info->debug.iterations / info->debug.searches,		(double) info->debug.iterdelta / info->debug.found,		info->debug.itermax);	fprintf(stderr, "Tests       : %lu (%0.2f per search, %0.2f per iter., %0.2f before found, %lu max)\n",		info->debug.tests, (double) info->debug.tests / info->debug.searches,		(double) info->debug.tests / info->debug.iterations,		(double) info->debug.testsdelta / info->debug.found,		info->debug.testsmax);	fprintf(stderr, "Total keylen: %lu bytes (%0.2f per search, %0.2f per iter.)\n",		info->debug.total_key_len, (double) info->debug.total_key_len / info->debug.searches,		(double) info->debug.total_key_len / info->debug.iterations);	fprintf(stderr, "\n");}#else /* !DEBUG_FASTFIND */#define FF_DBG_mem(x, size)#define FF_DBG_test(x)#define FF_DBG_iter(x)#define FF_DBG_cnode(x)#define FF_DBG_found(x)#define FF_DBG_comment(x, comment)#define FF_DBG_search_stats(info, key_len)#define FF_DBG_dump_stats(info)#endif /* DEBUG_FASTFIND */static struct fastfind_info *init_fastfind(struct fastfind_index *index, enum fastfind_flags flags){	struct fastfind_info *info = mem_calloc(1, sizeof(*info));	index->handle = info;	if (!info) return NULL;	info->min_key_len = FF_MAX_KEYLEN;	info->case_aware = !!(flags & FF_CASE_AWARE);	info->compress = !!(flags & FF_COMPRESS);	FF_DBG_mem(info, sizeof(*info) - sizeof(info->debug));	FF_DBG_comment(info, index->comment);	return info;}/* Return 1 on success, 0 on allocation failure */static intalloc_ff_data(struct fastfind_info *info){	struct ff_data *data;	assert(info->count < FF_MAX_KEYS);	if_assert_failed return 0;	/* On error, cleanup is done by fastfind_done(). */	data = mem_calloc(info->count, sizeof(*data));	if (!data) return 0;	info->data = data;	FF_DBG_mem(info, info->count * sizeof(*data));	return 1;}/* Add pointer and its key length to correspondant arrays, incrementing * internal counter. */static voidadd_to_ff_data(void *p, int key_len, struct fastfind_info *info){	struct ff_data *data = &info->data[info->pointers_count++];	/* Record new pointer and key len, used in search */	data->pointer = p;	data->keylen = key_len;}/* Return 1 on success, 0 on allocation failure */static intalloc_leafset(struct fastfind_info *info){	struct ff_node **leafsets;	struct ff_node *leafset;	assert(info->leafsets_count < FF_MAX_LEAFSETS);	if_assert_failed return 0;	/* info->leafsets[0] is never used since l=0 marks no leaf	 * in struct ff_node. That's the reason of that + 2. */	leafsets = mem_realloc(info->leafsets,			       sizeof(*leafsets) * (info->leafsets_count + 2));	if (!leafsets) return 0;	info->leafsets = leafsets;	leafset = mem_calloc(info->uniq_chars_count, sizeof(*leafset));	if (!leafset) return 0;	FF_DBG_mem(info, sizeof(*leafsets));	FF_DBG_mem(info, sizeof(*leafset) * info->uniq_chars_count);	info->leafsets_count++;	info->leafsets[info->leafsets_count] = leafset;	return 1;}static inline intchar2idx(unsigned char c, struct fastfind_info *info){	unsigned char *idx = memchr(info->uniq_chars, c, info->uniq_chars_count);	if (idx) return (idx - info->uniq_chars);	return -1;}static inline voidinit_idxtab(struct fastfind_info *info){	int i;	for (i = 0; i < FF_MAX_CHARS; i++)		info->idxtab[i] = char2idx((unsigned char) i, info);}static inline voidcompress_node(struct ff_node *leafset, struct fastfind_info *info,	      int i, int pos){	struct ff_node_c *new = mem_alloc(sizeof(*new));	if (!new) return;	new->c = 1;	new->e = leafset[pos].e;	new->p = leafset[pos].p;	new->l = leafset[pos].l;	new->ch = pos;	mem_free_set(&info->leafsets[i], (struct ff_node *) new);	FF_DBG_cnode(info);	FF_DBG_mem(info, sizeof(*new));	FF_DBG_mem(info, sizeof(*leafset) * -info->uniq_chars_count);}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?