📄 dfa.c
字号:
if (!regcopy) reg_error("out of memory"); /* This is a complete kludge and could potentially break \<letter> escapes . . . */ case_fold = 0; for (i = 0; i < len; ++i) if (ISUPPER(s[i])) regcopy[i] = tolower(s[i]); else regcopy[i] = s[i]; reginit(r); r->mustn = 0; r->must[0] = '\0'; regparse(regcopy, len, r); free(regcopy); regmust(r); reganalyze(r, searchflag); case_fold = 1; reginit(r); regparse(s, len, r); reganalyze(r, searchflag); } else { reginit(r); regparse(s, len, r); regmust(r); reganalyze(r, searchflag); }}/* Free the storage held by the components of a regexp. */voidreg_free(r) struct regexp *r;{ int i; free((ptr_t) r->charsets); free((ptr_t) r->tokens); for (i = 0; i < r->sindex; ++i) free((ptr_t) r->states[i].elems.elems); free((ptr_t) r->states); for (i = 0; i < r->tindex; ++i) if (r->follows[i].elems) free((ptr_t) r->follows[i].elems); free((ptr_t) r->follows); for (i = 0; i < r->tralloc; ++i) if (r->trans[i]) free((ptr_t) r->trans[i]); else if (r->fails[i]) free((ptr_t) r->fails[i]); if (r->realtrans) free((ptr_t) r->realtrans); if (r->fails) free((ptr_t) r->fails); if (r->newlines) free((ptr_t) r->newlines);}/*Having found the postfix representation of the regular expression,try to find a long sequence of characters that must appear in any linecontaining the r.e.Finding a "longest" sequence is beyond the scope here;we take an easy way out and hope for the best.(Take "(ab|a)b"--please.)We do a bottom-up calculation of sequences of characters that must appearin matches of r.e.'s represented by trees rooted at the nodes of the postfixrepresentation: sequences that must appear at the left of the match ("left") sequences that must appear at the right of the match ("right") lists of sequences that must appear somewhere in the match ("in") sequences that must constitute the match ("is")When we get to the root of the tree, we use one of the longest of itscalculated "in" sequences as our answer. The sequence we find is returned inr->must (where "r" is the single argument passed to "regmust");the length of the sequence is returned in r->mustn.The sequences calculated for the various types of node (in pseudo ANSI c)are shown below. "p" is the operand of unary operators (and the left-handoperand of binary operators); "q" is the right-hand operand of binary operators."ZERO" means "a zero-length sequence" below.Type left right is in---- ---- ----- -- --char c # c # c # c # cSET ZERO ZERO ZERO ZEROSTAR ZERO ZERO ZERO ZEROQMARK ZERO ZERO ZERO ZEROPLUS p->left p->right ZERO p->inCAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus p->left : q->right : q->is!=ZERO) ? q->in plus p->is##q->left p->right##q->is p->is##q->is : p->right##q->left ZEROOR longest common longest common (do p->is and substrings common to leading trailing q->is have same p->in and q->in (sub)sequence (sub)sequence length and of p->left of p->right content) ? and q->left and q->right p->is : NULL If there's anything else we recognize in the tree, all four sequences get setto zero-length sequences. If there's something we don't recognize in the tree,we just return a zero-length sequence.Break ties in favor of infrequent letters (choosing 'zzz' in preference to'aaa')?And. . .is it here or someplace that we might ponder "optimizations" such as egrep 'psi|epsilon' -> egrep 'psi' egrep 'pepsi|epsilon' -> egrep 'epsi' (Yes, we now find "epsi" as a "string that must occur", but we might also simplify the *entire* r.e. being sought) grep '[c]' -> grep 'c' grep '(ab|a)b' -> grep 'ab' grep 'ab*' -> grep 'a' grep 'a*b' -> grep 'b'There are several issues: Is optimization easy (enough)? Does optimization actually accomplish anything, or is the automaton you get from "psi|epsilon" (for example) the same as the one you get from "psi" (for example)? Are optimizable r.e.'s likely to be used in real-life situations (something like 'ab*' is probably unlikely; something like is 'psi|epsilon' is likelier)?*/static char *icatalloc(old, new)char * old;const char * new;{ register char * result; register int oldsize, newsize; newsize = (new == NULL) ? 0 : strlen(new); if (old == NULL) oldsize = 0; else if (newsize == 0) return old; else oldsize = strlen(old); if (old == NULL) result = (char *) malloc(newsize + 1); else result = (char *) realloc((void *) old, oldsize + newsize + 1); if (result != NULL && new != NULL) (void) strcpy(result + oldsize, new); return result;}static char *icpyalloc(string)const char * string;{ return icatalloc((char *) NULL, string);}static char *istrstr(lookin, lookfor)char * lookin;register char * lookfor;{ register char * cp; register int len; len = strlen(lookfor); for (cp = lookin; *cp != '\0'; ++cp) if (strncmp(cp, lookfor, len) == 0) return cp; return NULL;}static voidifree(cp)char * cp;{ if (cp != NULL) free(cp);}static voidfreelist(cpp)register char ** cpp;{ register int i; if (cpp == NULL) return; for (i = 0; cpp[i] != NULL; ++i) { free(cpp[i]); cpp[i] = NULL; }}static char **enlist(cpp, new, len)register char ** cpp;register char * new;#ifdef __STDC__size_t len;#elseint len;#endif{ register int i, j; if (cpp == NULL) return NULL; if ((new = icpyalloc(new)) == NULL) { freelist(cpp); return NULL; } new[len] = '\0'; /* ** Is there already something in the list that's new (or longer)? */ for (i = 0; cpp[i] != NULL; ++i) if (istrstr(cpp[i], new) != NULL) { free(new); return cpp; } /* ** Eliminate any obsoleted strings. */ j = 0; while (cpp[j] != NULL) if (istrstr(new, cpp[j]) == NULL) ++j; else { free(cpp[j]); if (--i == j) break; cpp[j] = cpp[i]; } /* ** Add the new string. */ cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); if (cpp == NULL) return NULL; cpp[i] = new; cpp[i + 1] = NULL; return cpp;}/*** Given pointers to two strings,** return a pointer to an allocated list of their distinct common substrings.** Return NULL if something seems wild.*/static char **comsubs(left, right)char * left;char * right;{ register char ** cpp; register char * lcp; register char * rcp; register int i, len; if (left == NULL || right == NULL) return NULL; cpp = (char **) malloc(sizeof *cpp); if (cpp == NULL) return NULL; cpp[0] = NULL; for (lcp = left; *lcp != '\0'; ++lcp) { len = 0; rcp = strchr(right, *lcp); while (rcp != NULL) { for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) ; if (i > len) len = i; rcp = strchr(rcp + 1, *lcp); } if (len == 0) continue;#ifdef __STDC__ if ((cpp = enlist(cpp, lcp, (size_t)len)) == NULL)#else if ((cpp = enlist(cpp, lcp, len)) == NULL)#endif break; } return cpp;}static char **addlists(old, new)char ** old;char ** new;{ register int i; if (old == NULL || new == NULL) return NULL; for (i = 0; new[i] != NULL; ++i) { old = enlist(old, new[i], strlen(new[i])); if (old == NULL) break; } return old;}/*** Given two lists of substrings,** return a new list giving substrings common to both.*/static char **inboth(left, right)char ** left;char ** right;{ register char ** both; register char ** temp; register int lnum, rnum; if (left == NULL || right == NULL) return NULL; both = (char **) malloc(sizeof *both); if (both == NULL) return NULL; both[0] = NULL; for (lnum = 0; left[lnum] != NULL; ++lnum) { for (rnum = 0; right[rnum] != NULL; ++rnum) { temp = comsubs(left[lnum], right[rnum]); if (temp == NULL) { freelist(both); return NULL; } both = addlists(both, temp); freelist(temp); if (both == NULL) return NULL; } } return both;}/*typedef struct { char ** in; char * left; char * right; char * is;} must; */static voidresetmust(mp)register must * mp;{ mp->left[0] = mp->right[0] = mp->is[0] = '\0'; freelist(mp->in);}static voidregmust(r)register struct regexp * r;{ register must * musts; register must * mp; register char * result = ""; register int ri; register int i; register _token t; static must must0; reg->mustn = 0; reg->must[0] = '\0'; musts = (must *) malloc((reg->tindex + 1) * sizeof *musts); if (musts == NULL) return; mp = musts; for (i = 0; i <= reg->tindex; ++i) mp[i] = must0; for (i = 0; i <= reg->tindex; ++i) { mp[i].in = (char **) malloc(sizeof *mp[i].in); mp[i].left = malloc(2); mp[i].right = malloc(2); mp[i].is = malloc(2); if (mp[i].in == NULL || mp[i].left == NULL || mp[i].right == NULL || mp[i].is == NULL) goto done; mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; mp[i].in[0] = NULL; } for (ri = 0; ri < reg->tindex; ++ri) { switch (t = reg->tokens[ri]) { case _ALLBEGLINE: case _ALLENDLINE: case _LPAREN: case _RPAREN: goto done; /* "cannot happen" */ case _EMPTY: case _BEGLINE: case _ENDLINE: case _BEGWORD: case _ENDWORD: case _LIMWORD: case _NOTLIMWORD: case _BACKREF: resetmust(mp); break; case _STAR: case _QMARK: if (mp <= musts) goto done; /* "cannot happen" */ --mp; resetmust(mp); break; case _OR: if (mp < &musts[2]) goto done; /* "cannot happen" */ { register char ** new; register must * lmp; register must * rmp; register int j, ln, rn, n; rmp = --mp; lmp = --mp; /* Guaranteed to be. Unlikely, but. . . */ if (strcmp(lmp->is, rmp->is) != 0) lmp->is[0] = '\0'; /* Left side--easy */ i = 0; while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) ++i; lmp->left[i] = '\0'; /* Right side */ ln = strlen(lmp->right); rn = strlen(rmp->right); n = ln; if (n > rn) n = rn; for (i = 0; i < n; ++i) if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) break; for (j = 0; j < i; ++j) lmp->right[j] = lmp->right[(ln - i) + j]; lmp->right[j] = '\0'; new = inboth(lmp->in, rmp->in); if (new == NULL) goto done; freelist(lmp->in); free((char *) lmp->in); lmp->in = new; } break; case _PLUS: if (mp <= musts) goto done; /* "cannot happen" */ --mp; mp->is[0] = '\0'; break; case _END: if (mp != &musts[1]) goto done; /* "cannot happen" */ for (i = 0; musts[0].in[i] != NULL; ++i) if (strlen(musts[0].in[i]) > strlen(result)) result = musts[0].in[i]; goto done; case _CAT: if (mp < &musts[2]) goto done; /* "cannot happen" */ { register must * lmp; register must * rmp; rmp = --mp; lmp = --mp; /* ** In. Everything in left, plus everything in ** right, plus catenation of ** left's right and right's left. */ lmp->in = addlists(lmp->in, rmp->in); if (lmp->in == NULL) goto done; if (lmp->right[0] != '\0' && rmp->left[0] != '\0') { register char * tp; tp = icpyalloc(lmp->right); if (tp == NULL) goto done; tp = icatalloc(tp, rmp->left); if (tp == NULL) goto done; lmp->in = enlist(lmp->in, tp, strlen(tp)); free(tp); if (lmp->in == NULL) goto done; } /* Left-hand */ if (lmp->is[0] != '\0') { lmp->left = icatalloc(lmp->left, rmp->left); if (lmp->left == NULL) goto done; } /* Right-hand */ if (rmp->is[0] == '\0') lmp->right[0] = '\0'; lmp->right = icatalloc(lmp->right, rmp->right); if (lmp->right == NULL) goto done; /* Guaranteed to be */ if (lmp->is[0] != '\0' && rmp->is[0] != '\0') { lmp->is = icatalloc(lmp->is, rmp->is); if (lmp->is == NULL) goto done; } } break; default: if (t < _END) { /* "cannot happen" */ goto done; } else if (t == '\0') { /* not on *my* shift */ goto done; } else if (t >= _SET) { /* easy enough */ resetmust(mp); } else { /* plain character */ resetmust(mp); mp->is[0] = mp->left[0] = mp->right[0] = t; mp->is[1] = mp->left[1] = mp->right[1] = '\0'; mp->in = enlist(mp->in, mp->is, 1); if (mp->in == NULL) goto done; } break; } ++mp; }done: (void) strncpy(reg->must, result, MUST_MAX - 1); reg->must[MUST_MAX - 1] = '\0'; reg->mustn = strlen(reg->must); mp = musts; for (i = 0; i <= reg->tindex; ++i) { freelist(mp[i].in); ifree((char *) mp[i].in); ifree(mp[i].left); ifree(mp[i].right); ifree(mp[i].is); } free((char *) mp);}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -