📄 kwset.c
字号:
delta entry for a given character is the smallest depth of any node at which an outgoing edge is labeled by that character. */ if (kwset->mind < 256) for (i = 0; i < NCHAR; ++i) delta[i] = kwset->mind; else for (i = 0; i < NCHAR; ++i) delta[i] = 255; /* Check if we can use the simple boyer-moore algorithm, instead of the hairy commentz-walter algorithm. */ if (kwset->words == 1 && kwset->trans == 0) { /* Looking for just one string. Extract it from the trie. */ kwset->target = obstack_alloc(&kwset->obstack, kwset->mind); for (i = kwset->mind - 1, curr = kwset->trie; i >= 0; --i) { kwset->target[i] = curr->links->label; curr = curr->links->trie; } /* Build the Boyer Moore delta. Boy that's easy compared to CW. */ for (i = 0; i < kwset->mind; ++i) delta[(unsigned char) kwset->target[i]] = kwset->mind - (i + 1); kwset->mind2 = kwset->mind; /* Find the minimal delta2 shift that we might make after a backwards match has failed. */ for (i = 0; i < kwset->mind - 1; ++i) if (kwset->target[i] == kwset->target[kwset->mind - 1]) kwset->mind2 = kwset->mind - (i + 1); } else { /* Traverse the nodes of the trie in level order, simultaneously computing the delta table, failure function, and shift function. */ for (curr = last = kwset->trie; curr; curr = curr->next) { /* Enqueue the immediate descendents in the level order queue. */ enqueue(curr->links, &last); curr->shift = kwset->mind; curr->maxshift = kwset->mind; /* Update the delta table for the descendents of this node. */ treedelta(curr->links, curr->depth, delta); /* Compute the failure function for the decendents of this node. */ treefails(curr->links, curr->fail, kwset->trie); /* Update the shifts at each node in the current node's chain of fails back to the root. */ for (fail = curr->fail; fail; fail = fail->fail) { /* If the current node has some outgoing edge that the fail doesn't, then the shift at the fail should be no larger than the difference of their depths. */ if (!hasevery(fail->links, curr->links)) if (curr->depth - fail->depth < fail->shift) fail->shift = curr->depth - fail->depth; /* If the current node is accepting then the shift at the fail and its descendents should be no larger than the difference of their depths. */ if (curr->accepting && fail->maxshift > curr->depth - fail->depth) fail->maxshift = curr->depth - fail->depth; } } /* Traverse the trie in level order again, fixing up all nodes whose shift exceeds their inherited maxshift. */ for (curr = kwset->trie->next; curr; curr = curr->next) { if (curr->maxshift > curr->parent->maxshift) curr->maxshift = curr->parent->maxshift; if (curr->shift > curr->maxshift) curr->shift = curr->maxshift; } /* Create a vector, indexed by character code, of the outgoing links from the root node. */ for (i = 0; i < NCHAR; ++i) next[i] = 0; treenext(kwset->trie->links, next); if ((trans = kwset->trans) != 0) for (i = 0; i < NCHAR; ++i) kwset->next[i] = next[(unsigned char) trans[i]]; else for (i = 0; i < NCHAR; ++i) kwset->next[i] = next[i]; } /* Fix things up for any translation table. */ if ((trans = kwset->trans) != 0) for (i = 0; i < NCHAR; ++i) kwset->delta[i] = delta[(unsigned char) trans[i]]; else for (i = 0; i < NCHAR; ++i) kwset->delta[i] = delta[i]; return 0;}#define U(C) ((unsigned char) (C))/* Fast boyer-moore search. */static size_tbmexec (kwset_t kws, char const *text, size_t size){ struct kwset const *kwset; register unsigned char const *d1; register char const *ep, *sp, *tp; register int d, gc, i, len, md2; kwset = (struct kwset const *) kws; len = kwset->mind; if (len == 0) return 0; if (len > size) return -1; if (len == 1) { tp = memchr (text, kwset->target[0], size); return tp ? tp - text : -1; } d1 = kwset->delta; sp = kwset->target + len; gc = U(sp[-2]); md2 = kwset->mind2; tp = text + len; /* Significance of 12: 1 (initial offset) + 10 (skip loop) + 1 (md2). */ if (size > 12 * len) /* 11 is not a bug, the initial offset happens only once. */ for (ep = text + size - 11 * len;;) { while (tp <= ep) { d = d1[U(tp[-1])], tp += d; d = d1[U(tp[-1])], tp += d; if (d == 0) goto found; d = d1[U(tp[-1])], tp += d; d = d1[U(tp[-1])], tp += d; d = d1[U(tp[-1])], tp += d; if (d == 0) goto found; d = d1[U(tp[-1])], tp += d; d = d1[U(tp[-1])], tp += d; d = d1[U(tp[-1])], tp += d; if (d == 0) goto found; d = d1[U(tp[-1])], tp += d; d = d1[U(tp[-1])], tp += d; } break; found: if (U(tp[-2]) == gc) { for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) ; if (i > len) return tp - len - text; } tp += md2; } /* Now we have only a few characters left to search. We carefully avoid ever producing an out-of-bounds pointer. */ ep = text + size; d = d1[U(tp[-1])]; while (d <= ep - tp) { d = d1[U((tp += d)[-1])]; if (d != 0) continue; if (U(tp[-2]) == gc) { for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) ; if (i > len) return tp - len - text; } d = md2; } return -1;}/* Hairy multiple string search. */static size_tcwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch){ struct kwset const *kwset; struct trie * const *next; struct trie const *trie; struct trie const *accept; char const *beg, *lim, *mch, *lmch; register unsigned char c; register unsigned char const *delta; register int d; register char const *end, *qlim; register struct tree const *tree; register char const *trans;#ifdef lint accept = NULL;#endif /* Initialize register copies and look for easy ways out. */ kwset = (struct kwset *) kws; if (len < kwset->mind) return -1; next = kwset->next; delta = kwset->delta; trans = kwset->trans; lim = text + len; end = text; if ((d = kwset->mind) != 0) mch = 0; else { mch = text, accept = kwset->trie; goto match; } if (len >= 4 * kwset->mind) qlim = lim - 4 * kwset->mind; else qlim = 0; while (lim - end >= d) { if (qlim && end <= qlim) { end += d - 1; while ((d = delta[c = *end]) && end < qlim) { end += d; end += delta[(unsigned char) *end]; end += delta[(unsigned char) *end]; } ++end; } else d = delta[c = (end += d)[-1]]; if (d) continue; beg = end - 1; trie = next[c]; if (trie->accepting) { mch = beg; accept = trie; } d = trie->shift; while (beg > text) { c = trans ? trans[(unsigned char) *--beg] : *--beg; tree = trie->links; while (tree && c != tree->label) if (c < tree->label) tree = tree->llink; else tree = tree->rlink; if (tree) { trie = tree->trie; if (trie->accepting) { mch = beg; accept = trie; } } else break; d = trie->shift; } if (mch) goto match; } return -1; match: /* Given a known match, find the longest possible match anchored at or before its starting point. This is nearly a verbatim copy of the preceding main search loops. */ if (lim - mch > kwset->maxd) lim = mch + kwset->maxd; lmch = 0; d = 1; while (lim - end >= d) { if ((d = delta[c = (end += d)[-1]]) != 0) continue; beg = end - 1; if (!(trie = next[c])) { d = 1; continue; } if (trie->accepting && beg <= mch) { lmch = beg; accept = trie; } d = trie->shift; while (beg > text) { c = trans ? trans[(unsigned char) *--beg] : *--beg; tree = trie->links; while (tree && c != tree->label) if (c < tree->label) tree = tree->llink; else tree = tree->rlink; if (tree) { trie = tree->trie; if (trie->accepting && beg <= mch) { lmch = beg; accept = trie; } } else break; d = trie->shift; } if (lmch) { mch = lmch; goto match; } if (!d) d = 1; } if (kwsmatch) { kwsmatch->index = accept->accepting / 2; kwsmatch->offset[0] = mch - text; kwsmatch->size[0] = accept->depth; } return mch - text;}/* Search through the given text for a match of any member of the given keyword set. Return a pointer to the first character of the matching substring, or NULL if no match is found. If FOUNDLEN is non-NULL store in the referenced location the length of the matching substring. Similarly, if FOUNDIDX is non-NULL, store in the referenced location the index number of the particular keyword matched. */size_tkwsexec (kwset_t kws, char const *text, size_t size, struct kwsmatch *kwsmatch){ struct kwset const *kwset = (struct kwset *) kws; if (kwset->words == 1 && kwset->trans == 0) { size_t ret = bmexec (kws, text, size); if (kwsmatch != 0 && ret != (size_t) -1) { kwsmatch->index = 0; kwsmatch->offset[0] = ret; kwsmatch->size[0] = kwset->mind; } return ret; } else return cwexec(kws, text, size, kwsmatch);}/* Free the components of the given keyword set. */voidkwsfree (kwset_t kws){ struct kwset *kwset; kwset = (struct kwset *) kws; obstack_free(&kwset->obstack, 0); free(kws);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -