⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ac_bm.c

📁 AC-BM算法的实现的压缩包
💻 C
字号:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ac_bm.h"

#include <ctype.h>

//#define CASE_SENSITIVE
//#define DEBUG_SEARCH

static int ACtree_build(pattern_tree *ptree, \
		pattern_data *patterns, \
		int npattern);
static int ACtree_compute_BCshifts(pattern_tree *ptree);
static int set_GSshift(pattern_tree *ptree, unsigned char *pat, int depth, int shift);
static int compute_GSshift(pattern_tree *ptree, unsigned char *pat1, \
		int pat1_len, unsigned char *pat2, int pat2_len);
static int ACtree_compute_shifts(pattern_tree *ptree);
static int ACtree_compute_GSshifts(pattern_tree *ptree);
static int _init_GSshifts(pattern_tree_node *root, int shift);
static int ACtree_init_GSshifts(pattern_tree *ptree);
//static void ACtree_print_tree(pattern_tree *ptree);
static void _print_tree(pattern_tree_node *root);
static void _clean_tree(pattern_tree_node *root);

//
// ACtree_build
//
static int ACtree_build(pattern_tree *ptree, \
		pattern_data *patterns, \
		int npattern)
{
	int i;
	pattern_tree_node *root = NULL, *parent = NULL;
	unsigned char ch;
	int max_pattern_len = 0, min_pattern_len = PATTERN_LEN;

	if (ptree == NULL || patterns == NULL || npattern < 0)
		goto err;
	
	// create root tree node
	root = malloc(sizeof(pattern_tree_node));
	if (root == NULL)
		goto err;
	bzero(root, sizeof(pattern_tree_node));
	root->label = -2;
	root->depth = 0;
	
	// add pattern node to tree
	for (i = 0; i < npattern; i++) 
	{
		int pat_len;
		int ch_i;
		
		pat_len = (patterns+i)->len;
		if (pat_len == 0)
			continue;
		else if (pat_len > PATTERN_LEN)
			pat_len = PATTERN_LEN;
		
		
		if (pat_len > max_pattern_len)
			max_pattern_len = pat_len;
		if (pat_len < min_pattern_len)
			min_pattern_len = pat_len;

		/* search branch adding point */
		parent = root;
		for (ch_i = 0; ch_i < pat_len; ch_i++)
		{
			ch = ((patterns+i)->data)[ch_i]; 
#ifndef CASE_SENSITIVE
			ch = tolower(ch);
#endif

			if (parent->childs[ch] == NULL) { /* find adding point */
				break;
			}
			
			parent = parent->childs[ch];
		}
		
		if (ch_i < pat_len) {
			/* add the branch under 'parent' */
			for (; ch_i < pat_len; ch_i++)
			{
				pattern_tree_node *node = NULL;

				ch = ((patterns+i)->data)[ch_i];
#ifndef CASE_SENSITIVE
				ch = tolower(ch);
#endif
				
				/* alloc node */
				node = malloc(sizeof(pattern_tree_node));
				if (node == NULL)
					goto err;
				bzero(node, sizeof(pattern_tree_node));
				node->depth = ch_i + 1;  // from 1
				node->ch = ch;
				node->label = -1;
				
				/* add to tree */
				parent->childs[ch] = node;
#ifndef CASE_SENSITIVE
				/* no case sensitive */
				if (ch >= 'a' && ch <= 'z') {
					parent->childs[ch-32] = node;
				}
#endif

				parent->nchild++;
				parent->one_child = ch;
				node->parent = parent;
				
				parent = node;
			}
		}

		/* last node remember which pattern where it from */
		parent->label = i;
	}
	
	ptree->pattern_list = patterns;
	ptree->pattern_count = npattern;
	ptree->root = root;
	ptree->max_depth = max_pattern_len;
	ptree->min_pattern_size = min_pattern_len;

	return 0;
err:
	/* free tree */
	if (ptree->root != NULL)
	{
		_clean_tree(ptree->root);
		free(ptree->root);
		ptree->root = NULL;
	}
	return -1;
}


/*
 * _print_tree
 */
static void _print_tree(pattern_tree_node *root)
{
	int i;

	if (root == NULL)
		return;
	// print this node
	printf("ch:%c shift:%d label:%d depth:%d childs:", root->ch, root->GSshift, root->label, root->depth);
	
	for (i = 0; i < 256; i++) {
#ifndef CASE_SENSITIVE
		if (i >= 'A' && i <= 'Z')
			continue;
#endif
		if (root->childs[i])
			printf("%c ", (root->childs[i])->ch);
	}
	printf("\n");

	// print child tree recursively
	for (i = 0; i < 256; i++) {
		if (root->childs[i])
			_print_tree(root->childs[i]);
	}

	return;
}

#if 0 /* unused */
/*
 * ACtree_print_tree
 */
static void ACtree_print_tree(pattern_tree *ptree)
{
	if (ptree == NULL)
		return;

	if (ptree->root)
		_print_tree(ptree->root);

	return;
}
#endif 

/*
 * ACtree_compute_BCshifts
 */
static int ACtree_compute_BCshifts(pattern_tree *ptree)
{
	int i, j = 0;

	for (i = 0; i < 256; i++)
		ptree->BCshift[i] = ptree->min_pattern_size;

	for (i = ptree->min_pattern_size - 1; i > 0; i--)
		for (j = 0; j < ptree->pattern_count; j++) {
			unsigned char ch;

			ch = (ptree->pattern_list+j)->data[i];
#ifndef CASE_SENSITIVE
			ch = tolower(ch);
#endif
			ptree->BCshift[ch] = i;
#ifndef CASE_SENSITIVE
			if (ch > 'a' && ch <'z')
				ptree->BCshift[ch-32] = i;
#endif
		}

	return 0;
}

/*
 * set_GSshift
 */
static int set_GSshift(pattern_tree *ptree, unsigned char *pat, int depth, int shift)
{
	int i;
	pattern_tree_node *node;

	if (ptree == NULL || ptree->root == NULL)
		goto err;

	node = ptree->root;
	for (i = 0; i < depth; i++) {
		node = node->childs[pat[i]];
		if (node == NULL)
			goto err;
	}
	
	node->GSshift = node->GSshift < shift ? node->GSshift : shift;

	return 0;
err:
	return -1;
}

/*
 * compute_GSshift
 */
static int compute_GSshift(pattern_tree *ptree, unsigned char *pat1, \
		int pat1_len, unsigned char *pat2, int pat2_len)
{
	unsigned char first_char;
	int i;
	int pat1_index, pat2_index, offset;


	if (pat1 == NULL || pat2 == NULL || pat1_len < 0 || pat2_len < 0)
		goto err;

	if (pat1_len == 0 || pat2_len == 0)
		return 0;

	first_char = pat1[0];
#ifndef CASE_SENSITIVE
	first_char = tolower(first_char);
#endif
	/* handle char 0 separately */
	for (i = 1; i < pat2_len; i++) {
#ifndef CASE_SENSITIVE
		if (tolower(pat2[i]) != first_char)
#else
		if (pat2[i] != first_char)
#endif
			break;
	}
	set_GSshift(ptree, pat1, 1, i);

	i = 1;
	while (1) {
		// search first char in pat2
#ifndef CASE_SENSITIVE
		while (i < pat2_len && tolower(pat2[i]) != first_char)
#else
		while (i < pat2_len && pat2[i] != first_char)
#endif
			i++;
		
		if (i == pat2_len) // not found the first char
			break;
		
		pat2_index = i;
		pat1_index = 0;
		offset = i;
		
		/* optimization: >min_pattern_size, not check*/
		if (offset > ptree->min_pattern_size)
			break;
		
		while (pat2_index < pat2_len && pat1_index < pat1_len)
		{
#ifndef CASE_SENSITIVE
			if (tolower(pat1[pat1_index]) != tolower(pat2[pat2_index])) /*found different*/
#else
			if (pat1[pat1_index] != pat2[pat2_index]) /*found different*/
#endif
				break;
			pat1_index++;
			pat2_index++;
		}
		
		if (pat1_index == pat1_len) {
			/* pat2 contained pat1 ??? how to do */
		} else if (pat2_index == pat2_len) {
			int j;
			/* 1.pat2 contained some part of pat1 */
			for (j = pat1_index; j < pat1_len; j++) {
				set_GSshift(ptree, pat1, j+1, offset);
			}
		} else /* 2.found one substring in pat2 */ 
			set_GSshift(ptree, pat1, pat1_index+1, offset);
		
		i++;
	}

	return 0;
	
err:
	return -1;
}

/*
 * ACtree_compute_GSshifts
 */
static int ACtree_compute_GSshifts(pattern_tree *ptree)
{
	int pat_i = 0, pat_j = 0;
	
	for (pat_i = 0; pat_i < ptree->pattern_count; pat_i++)
	{
		for (pat_j = 0; pat_j < ptree->pattern_count; pat_j++) {
			unsigned char *ppat_i = (ptree->pattern_list+pat_i)->data;
			int patlen_i = (ptree->pattern_list+pat_i)->len;
			unsigned char *ppat_j = (ptree->pattern_list+pat_j)->data;
			int patlen_j = (ptree->pattern_list+pat_j)->len;

			compute_GSshift(ptree, ppat_i, patlen_i, ppat_j, patlen_j);
		}
	}

	return 0;
}

/*
 * _init_GSshifts
 */
static int _init_GSshifts(pattern_tree_node *root, int shift)
{
	int i;

	if (root->label != -2)
		root->GSshift = shift;
			
	for (i = 0; i < 256; i++)
	{
#ifndef CASE_SENSITIVE
		if (i >= 'A' && i <= 'Z')
			continue;
#endif

		if (root->childs[i] != NULL) {
			_init_GSshifts(root->childs[i], shift);
		}
	}

	return 0;
}

/*
 * ACtree_init_GSshifts
 */
static int ACtree_init_GSshifts(pattern_tree *ptree)
{
	_init_GSshifts(ptree->root, ptree->min_pattern_size);

	return 0;
}

/*
 * ACtree_compute_shifts
 */
static int ACtree_compute_shifts(pattern_tree *ptree)
{
	ACtree_compute_BCshifts(ptree);
	
	ACtree_init_GSshifts(ptree);
	ACtree_compute_GSshifts(ptree);

	return 0;
}


pattern_tree *acbm_init(pattern_data *patterns, int npattern)
{
	pattern_tree *ptree = NULL;
	
	ptree = (pattern_tree *)malloc(sizeof(pattern_tree));
	if (!ptree)
		goto err;

	bzero(ptree, sizeof(pattern_tree));

	ACtree_build(ptree, patterns, npattern);
	
	ACtree_compute_shifts(ptree);

//	ACtree_print_tree(ptree);

	return ptree;

err:
	return NULL;
}

/*
 * acbm_search
 * return value:
 * n: matched n patterns
 * 0: not matched
 * <0: error
 */
inline int acbm_search(pattern_tree *ptree, unsigned char *text, int text_len, \
		unsigned int matched_indexs[], int nmax_index)
{
	int nmatched = 0;
	register int base_index = 0, cur_index = 0;
	register int real_shift = 0, gs_shift = 0, bc_shift = 0;
	pattern_tree_node *node = NULL;
	
	/*
	if (ptree == NULL || text == NULL || text_len < 0)
		goto err;
	
	if (ptree->root == NULL)
		goto err;
	
	*/
	if (text_len < ptree->min_pattern_size)
		goto ret;
	
	base_index = text_len - ptree->min_pattern_size;
	
	while (base_index >= 0)
	{
		cur_index = base_index;

		node = ptree->root;
		
#ifdef DEBUG_SEARCH
		printf("Checking pattern tree at %d...", base_index);
#endif
		while (node->childs[text[cur_index]] != NULL)
		{
			node = node->childs[text[cur_index]];
			
			if (node->label >= 0) {
				/* matched a pattern! */
#ifdef DEBUG_SEARCH
				printf("Matched(%d) ", node->label);
#endif
				matched_indexs[nmatched] = node->label;
				nmatched++;
				if (nmatched == nmax_index)
					goto ret;
			}

			cur_index++;

			if (cur_index >= text_len)
				break;
		}
		
		if (node->nchild > 0) {
			/* match fail */
			gs_shift = node->childs[node->one_child]->GSshift;
			if (cur_index < text_len)
				bc_shift = ptree->BCshift[text[cur_index]]+base_index-cur_index;
			else
				bc_shift = 1;
			real_shift = gs_shift > bc_shift ? gs_shift : bc_shift;
			base_index -= real_shift;
#ifdef DEBUG_SEARCH
			printf("Failed, GSshift:%d, BCshift:%d Realshift%d", gs_shift, bc_shift, real_shift);
#endif
		} else {
			/* match successful, ??? to be continued */
			base_index--;
#ifdef DEBUG_SEARCH
			printf("Matched, shift %d", 1);
#endif
		}
#ifdef DEBUG_SEARCH
		printf("\n");
#endif
	}
	
ret:
	return nmatched;
}

static void _clean_tree(pattern_tree_node *root)
{
	int i;
	for (i = 0; i < 256; i++)
	{
#ifndef CASE_SENSITIVE
		if (i >= 'A' && i <= 'Z')
			continue;
#endif
		if (root->childs[i]) {
			_clean_tree(root->childs[i]);

			free(root->childs[i]);
			root->childs[i] = NULL;
		}
	}

	return;
}

/*
 * acbm_clean
 */
void acbm_clean(pattern_tree *ptree)
{
	if (ptree == NULL)
		return;

	if (ptree->root) 
	{
		_clean_tree(ptree->root);
		free(ptree->root);
		ptree->root = NULL;
	}

	free(ptree);

//	ACtree_print_tree(ptree);
	return;
}


inline int acbm_search_ex(pattern_tree *ptree, unsigned char *text, int text_len, \
		matched_info_t matched_items[], int nmax_index)
{
	int nmatched = 0;
	register int base_index = 0, cur_index = 0;
	register int real_shift = 0, gs_shift = 0, bc_shift = 0;
	pattern_tree_node *node = NULL;
	
	/*
	if (ptree == NULL || text == NULL || text_len < 0)
		goto err;
	
	if (ptree->root == NULL)
		goto err;
	
	*/
	if (text_len < ptree->min_pattern_size)
		goto ret;
	
	base_index = text_len - ptree->min_pattern_size;
	
	while (base_index >= 0)
	{
		cur_index = base_index;

		node = ptree->root;
		
#ifdef DEBUG_SEARCH
		printf("Checking pattern tree at %d...", base_index);
#endif
		while (node->childs[text[cur_index]] != NULL) {
			node = node->childs[text[cur_index]];
			
			if (node->label >= 0) {
				/* matched a pattern! */
#ifdef DEBUG_SEARCH
				printf("Matched(%d) ", node->label);
#endif
				matched_items[nmatched].pattern_i = node->label;
				matched_items[nmatched].offset = base_index; /* postion */
#ifdef DEBUG_SEARCH
				printf("%s\n", text + matched_items[nmatched].offset);
#endif
				nmatched++;


				if (nmatched == nmax_index)
					goto ret;
			}

			cur_index++;

			if (cur_index >= text_len)
				break;
		}
		
		if (node->nchild > 0) {
			/* match fail */
			gs_shift = node->childs[node->one_child]->GSshift;
			if (cur_index < text_len)
				bc_shift = ptree->BCshift[text[cur_index]]+base_index-cur_index;
			else
				bc_shift = 1;
			real_shift = gs_shift > bc_shift ? gs_shift : bc_shift;
			base_index -= real_shift;
#ifdef DEBUG_SEARCH
			printf("Failed, GSshift:%d, BCshift:%d Realshift%d", gs_shift, bc_shift, real_shift);
#endif
		} else {
			/* match successful, ??? to be continued */
			base_index--;
#ifdef DEBUG_SEARCH
			printf("Matched, shift %d", 1);
#endif
		}
#ifdef DEBUG_SEARCH
		printf("\n");
#endif
	}
	
ret:
	return nmatched;
}



⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -