📄 kwset.c

📁 linux平台中
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12
     delta entry for a given character is the smallest depth of any     node at which an outgoing edge is labeled by that character. */  if (kwset->mind < 256)    for (i = 0; i < NCHAR; ++i)      delta[i] = kwset->mind;  else    for (i = 0; i < NCHAR; ++i)      delta[i] = 255;  /* Check if we can use the simple boyer-moore algorithm, instead     of the hairy commentz-walter algorithm. */  if (kwset->words == 1 && kwset->trans == 0)    {      /* Looking for just one string.  Extract it from the trie. */      kwset->target = obstack_alloc(&kwset->obstack, kwset->mind);      for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i)	{	  kwset->target[i] = curr->links->label;	  curr = curr->links->trie;	}      /* Build the Boyer Moore delta.  Boy that's easy compared to CW. */      for (i = 0; i < kwset->mind; ++i)	delta[(unsigned char) kwset->target[i]] = kwset->mind - (i + 1);      kwset->mind2 = kwset->mind;      /* Find the minimal delta2 shift that we might make after	 a backwards match has failed. */      for (i = 0; i < kwset->mind - 1; ++i)	if (kwset->target[i] == kwset->target[kwset->mind - 1])	  kwset->mind2 = kwset->mind - (i + 1);    }  else    {      /* Traverse the nodes of the trie in level order, simultaneously	 computing the delta table, failure function, and shift function. */      for (curr = last = kwset->trie; curr; curr = curr->next)	{	  /* Enqueue the immediate descendents in the level order queue. */	  enqueue(curr->links, &last);	  curr->shift = kwset->mind;	  curr->maxshift = kwset->mind;	  /* Update the delta table for the descendents of this node. */	  treedelta(curr->links, curr->depth, delta);	  /* Compute the failure function for the decendents of this node. */	  treefails(curr->links, curr->fail, kwset->trie);	  /* Update the shifts at each node in the current node's chain	     of fails back to the root. */	  for (fail = curr->fail; fail; fail = fail->fail)	    {	      /* If the current node has some outgoing edge that the fail		 doesn't, then the shift at the fail should be no larger		 than the difference of their depths. */	      if (!hasevery(fail->links, curr->links))		if (curr->depth - fail->depth < fail->shift)		  fail->shift = curr->depth - fail->depth;	      /* If the current node is accepting then the shift at the		 fail and its descendents should be no larger than the		 difference of their depths. */	      if (curr->accepting && fail->maxshift > curr->depth - fail->depth)		fail->maxshift = curr->depth - fail->depth;	    }	}      /* Traverse the trie in level order again, fixing up all nodes whose	 shift exceeds their inherited maxshift. */      for (curr = kwset->trie->next; curr; curr = curr->next)	{	  if (curr->maxshift > curr->parent->maxshift)	    curr->maxshift = curr->parent->maxshift;	  if (curr->shift > curr->maxshift)	    curr->shift = curr->maxshift;	}      /* Create a vector, indexed by character code, of the outgoing links	 from the root node. */      for (i = 0; i < NCHAR; ++i)	next[i] = 0;      treenext(kwset->trie->links, next);      if ((trans = kwset->trans) != 0)	for (i = 0; i < NCHAR; ++i)	  kwset->next[i] = next[(unsigned char) trans[i]];      else	for (i = 0; i < NCHAR; ++i)	  kwset->next[i] = next[i];    }  /* Fix things up for any translation table. */  if ((trans = kwset->trans) != 0)    for (i = 0; i < NCHAR; ++i)      kwset->delta[i] = delta[(unsigned char) trans[i]];  else    for (i = 0; i < NCHAR; ++i)      kwset->delta[i] = delta[i];  return 0;}#define U(C) ((unsigned char) (C))/* Fast boyer-moore search. */static size_tbmexec (kwset_t kws, char const *text, size_t size){  struct kwset const *kwset;  register unsigned char const *d1;  register char const *ep, *sp, *tp;  register int d, gc, i, len, md2;  kwset = (struct kwset const *) kws;  len = kwset->mind;  if (len == 0)    return 0;  if (len > size)    return -1;  if (len == 1)    {      tp = memchr (text, kwset->target[0], size);      return tp ? tp - text : -1;    }  d1 = kwset->delta;  sp = kwset->target + len;  gc = U(sp[-2]);  md2 = kwset->mind2;  tp = text + len;  /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */  if (size > 12 * len)    /* 11 is not a bug, the initial offset happens only once. */    for (ep = text + size - 11 * len;;)      {	while (tp <= ep)	  {	    d = d1[U(tp[-1])], tp += d;	    d = d1[U(tp[-1])], tp += d;	    if (d == 0)	      goto found;	    d = d1[U(tp[-1])], tp += d;	    d = d1[U(tp[-1])], tp += d;	    d = d1[U(tp[-1])], tp += d;	    if (d == 0)	      goto found;	    d = d1[U(tp[-1])], tp += d;	    d = d1[U(tp[-1])], tp += d;	    d = d1[U(tp[-1])], tp += d;	    if (d == 0)	      goto found;	    d = d1[U(tp[-1])], tp += d;	    d = d1[U(tp[-1])], tp += d;	  }	break;      found:	if (U(tp[-2]) == gc)	  {	    for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)	      ;	    if (i > len)	      return tp - len - text;	  }	tp += md2;      }  /* Now we have only a few characters left to search.  We     carefully avoid ever producing an out-of-bounds pointer. */  ep = text + size;  d = d1[U(tp[-1])];  while (d <= ep - tp)    {      d = d1[U((tp += d)[-1])];      if (d != 0)	continue;      if (U(tp[-2]) == gc)	{	  for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i)	    ;	  if (i > len)	    return tp - len - text;	}      d = md2;    }  return -1;}/* Hairy multiple string search. */static size_tcwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch){  struct kwset const *kwset;  struct trie * const *next;  struct trie const *trie;  struct trie const *accept;  char const *beg, *lim, *mch, *lmch;  register unsigned char c;  register unsigned char const *delta;  register int d;  register char const *end, *qlim;  register struct tree const *tree;  register char const *trans;#ifdef lint  accept = NULL;#endif  /* Initialize register copies and look for easy ways out. */  kwset = (struct kwset *) kws;  if (len < kwset->mind)    return -1;  next = kwset->next;  delta = kwset->delta;  trans = kwset->trans;  lim = text + len;  end = text;  if ((d = kwset->mind) != 0)    mch = 0;  else    {      mch = text, accept = kwset->trie;      goto match;    }  if (len >= 4 * kwset->mind)    qlim = lim - 4 * kwset->mind;  else    qlim = 0;  while (lim - end >= d)    {      if (qlim && end <= qlim)	{	  end += d - 1;	  while ((d = delta[c = *end]) && end < qlim)	    {	      end += d;	      end += delta[(unsigned char) *end];	      end += delta[(unsigned char) *end];	    }	  ++end;	}      else	d = delta[c = (end += d)[-1]];      if (d)	continue;      beg = end - 1;      trie = next[c];      if (trie->accepting)	{	  mch = beg;	  accept = trie;	}      d = trie->shift;      while (beg > text)	{	  c = trans ? trans[(unsigned char) *--beg] : *--beg;	  tree = trie->links;	  while (tree && c != tree->label)	    if (c < tree->label)	      tree = tree->llink;	    else	      tree = tree->rlink;	  if (tree)	    {	      trie = tree->trie;	      if (trie->accepting)		{		  mch = beg;		  accept = trie;		}	    }	  else	    break;	  d = trie->shift;	}      if (mch)	goto match;    }  return -1; match:  /* Given a known match, find the longest possible match anchored     at or before its starting point.  This is nearly a verbatim     copy of the preceding main search loops. */  if (lim - mch > kwset->maxd)    lim = mch + kwset->maxd;  lmch = 0;  d = 1;  while (lim - end >= d)    {      if ((d = delta[c = (end += d)[-1]]) != 0)	continue;      beg = end - 1;      if (!(trie = next[c]))	{	  d = 1;	  continue;	}      if (trie->accepting && beg <= mch)	{	  lmch = beg;	  accept = trie;	}      d = trie->shift;      while (beg > text)	{	  c = trans ? trans[(unsigned char) *--beg] : *--beg;	  tree = trie->links;	  while (tree && c != tree->label)	    if (c < tree->label)	      tree = tree->llink;	    else	      tree = tree->rlink;	  if (tree)	    {	      trie = tree->trie;	      if (trie->accepting && beg <= mch)		{		  lmch = beg;		  accept = trie;		}	    }	  else	    break;	  d = trie->shift;	}      if (lmch)	{	  mch = lmch;	  goto match;	}      if (!d)	d = 1;    }  if (kwsmatch)    {      kwsmatch->index = accept->accepting / 2;      kwsmatch->offset[0] = mch - text;      kwsmatch->size[0] = accept->depth;    }  return mch - text;}/* Search through the given text for a match of any member of the   given keyword set.  Return a pointer to the first character of   the matching substring, or NULL if no match is found.  If FOUNDLEN   is non-NULL store in the referenced location the length of the   matching substring.  Similarly, if FOUNDIDX is non-NULL, store   in the referenced location the index number of the particular   keyword matched. */size_tkwsexec (kwset_t kws, char const *text, size_t size,	 struct kwsmatch *kwsmatch){  struct kwset const *kwset = (struct kwset *) kws;  if (kwset->words == 1 && kwset->trans == 0)    {      size_t ret = bmexec (kws, text, size);      if (kwsmatch != 0 && ret != (size_t) -1)	{	  kwsmatch->index = 0;	  kwsmatch->offset[0] = ret;	  kwsmatch->size[0] = kwset->mind;	}      return ret;    }  else    return cwexec(kws, text, size, kwsmatch);}/* Free the components of the given keyword set. */voidkwsfree (kwset_t kws){  struct kwset *kwset;  kwset = (struct kwset *) kws;  obstack_free(&kwset->obstack, 0);  free(kws);}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -