📄 dfa.c

📁 linux平台中
💻 C
📖 第 1 页 / 共 5 页
字号:
	      || !(syntax_bits & RE_NEWLINE_ALT))	    goto normal_char;	  laststart = 1;	  return lasttok = OR;	case '(':	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))	    goto normal_char;	  ++parens;	  laststart = 1;	  return lasttok = LPAREN;	case ')':	  if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0))	    goto normal_char;	  if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD)	    goto normal_char;	  --parens;	  laststart = 0;	  return lasttok = RPAREN;	case '.':	  if (backslash)	    goto normal_char;#ifdef MBS_SUPPORT	  if (MB_CUR_MAX > 1)	    {	      /* In multibyte environment period must match with a single		 character not a byte.  So we use ANYCHAR.  */	      laststart = 0;	      return lasttok = ANYCHAR;	    }#endif /* MBS_SUPPORT */	  zeroset(ccl);	  notset(ccl);	  if (!(syntax_bits & RE_DOT_NEWLINE))	    clrbit(eolbyte, ccl);	  if (syntax_bits & RE_DOT_NOT_NULL)	    clrbit('\0', ccl);	  laststart = 0;	  return lasttok = CSET + charclass_index(ccl);	case 'w':	case 'W':	  if (!backslash || (syntax_bits & RE_NO_GNU_OPS))	    goto normal_char;	  zeroset(ccl);	  for (c2 = 0; c2 < NOTCHAR; ++c2)	    if (IS_WORD_CONSTITUENT(c2))	      setbit(c2, ccl);	  if (c == 'W')	    notset(ccl);	  laststart = 0;	  return lasttok = CSET + charclass_index(ccl);	case '[':	  if (backslash)	    goto normal_char;	  laststart = 0;#ifdef MBS_SUPPORT	  if (MB_CUR_MAX > 1)	    {	      /* In multibyte environment a bracket expression may contain		 multibyte characters, which must be treated as characters		 (not bytes).  So we parse it by parse_bracket_exp_mb().  */	      parse_bracket_exp_mb();	      return lasttok = MBCSET;	    }#endif	  zeroset(ccl);	  FETCH(c, _("Unbalanced ["));	  if (c == '^')	    {	      FETCH(c, _("Unbalanced ["));	      invert = 1;	    }	  else	    invert = 0;	  do	    {	      /* Nobody ever said this had to be fast. :-)		 Note that if we're looking at some other [:...:]		 construct, we just treat it as a bunch of ordinary		 characters.  We can do this because we assume		 regex has checked for syntax errors before		 dfa is ever called. */	      if (c == '[' && (syntax_bits & RE_CHAR_CLASSES))		for (c1 = 0; prednames[c1].name; ++c1)		  if (looking_at(prednames[c1].name))		    {		      int (*pred) PARAMS ((int)) = prednames[c1].pred;		      for (c2 = 0; c2 < NOTCHAR; ++c2)			if ((*pred)(c2))			  setbit_case_fold (c2, ccl);		      lexptr += strlen(prednames[c1].name);		      lexleft -= strlen(prednames[c1].name);		      FETCH(c1, _("Unbalanced ["));		      goto skip;		    }	      if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))		FETCH(c, _("Unbalanced ["));	      FETCH(c1, _("Unbalanced ["));	      if (c1 == '-')		{		  FETCH(c2, _("Unbalanced ["));		  if (c2 == ']')		    {		      /* In the case [x-], the - is an ordinary hyphen,			 which is left in c1, the lookahead character. */		      --lexptr;		      ++lexleft;		    }		  else		    {		      if (c2 == '\\'			  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))			FETCH(c2, _("Unbalanced ["));		      FETCH(c1, _("Unbalanced ["));		      if (!hard_LC_COLLATE) {		        for (; c <= c2; c++)			  setbit_case_fold (c, ccl);		      } else {			/* POSIX locales are painful - leave the decision to libc */			char expr[6] = { '[', c, '-', c2, ']', '\0' };			regex_t re;			if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) {			  for (c = 0; c < NOTCHAR; ++c) {			    char buf[2] = { c, '\0' };			    regmatch_t mat;			    if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR                               && mat.rm_so == 0 && mat.rm_eo == 1)                              setbit_case_fold (c, ccl);			  }			  regfree (&re);			}		      }		      continue;		    }		}	      setbit_case_fold (c, ccl);	    skip:	      ;	    }	  while ((c = c1) != ']');	  if (invert)	    {	      notset(ccl);	      if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)		clrbit(eolbyte, ccl);	    }	  return lasttok = CSET + charclass_index(ccl);	default:	normal_char:	  laststart = 0;	  if (case_fold && ISALPHA(c))	    {	      zeroset(ccl);	      setbit_case_fold (c, ccl);	      return lasttok = CSET + charclass_index(ccl);	    }	  return c;	}    }  /* The above loop should consume at most a backslash     and some other character. */  abort();  return END;	/* keeps pedantic compilers happy. */}/* Recursive descent parser for regular expressions. */static token tok;		/* Lookahead token. */static int depth;		/* Current depth of a hypothetical stack				   holding deferred productions.  This is				   used to determine the depth that will be				   required of the real stack later on in				   dfaanalyze(). *//* Add the given token to the parse tree, maintaining the depth count and   updating the maximum depth if necessary. */static voidaddtok (token t){#ifdef MBS_SUPPORT  if (MB_CUR_MAX > 1)    {      REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop,			   dfa->tindex);      /* Set dfa->multibyte_prop.  See struct dfa in dfa.h.  */      if (t == MBCSET)	dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3;      else if (t < NOTCHAR)	dfa->multibyte_prop[dfa->tindex]	  = (cur_mb_len == 1)? 3 /* single-byte char */	  : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */	     + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */      else	/* It may be unnecesssary, but it is safer to treat other	   symbols as singlebyte characters.  */	dfa->multibyte_prop[dfa->tindex] = 3;    }#endif  REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);  dfa->tokens[dfa->tindex++] = t;  switch (t)    {    case QMARK:    case STAR:    case PLUS:      break;    case CAT:    case OR:    case ORTOP:      --depth;      break;    default:      ++dfa->nleaves;    case EMPTY:      ++depth;      break;    }  if (depth > dfa->depth)    dfa->depth = depth;}/* The grammar understood by the parser is as follows.   regexp:     regexp OR branch     branch   branch:     branch closure     closure   closure:     closure QMARK     closure STAR     closure PLUS     closure REPMN     atom   atom:     <normal character>     <multibyte character>     ANYCHAR     MBCSET     CSET     BACKREF     BEGLINE     ENDLINE     BEGWORD     ENDWORD     LIMWORD     NOTLIMWORD     CRANGE     LPAREN regexp RPAREN     <empty>   The parser builds a parse tree in postfix form in an array of tokens. */static voidatom (void){  if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF      || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD#ifdef MBS_SUPPORT      || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */#endif /* MBS_SUPPORT */      || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)    {      addtok(tok);      tok = lex();#ifdef MBS_SUPPORT      /* We treat a multibyte character as a single atom, so that DFA	 can treat a multibyte character as a single expression.         e.g. We construct following tree from "<mb1><mb2>".              <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT>              <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT>      */      if (MB_CUR_MAX > 1)	{	  while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR)	    {	      addtok(tok);	      addtok(CAT);	      tok = lex();	    }	}#endif /* MBS_SUPPORT  */    }  else if (tok == CRANGE)    {      /* A character range like "[a-z]" in a locale other than "C" or	 "POSIX".  This range might any sequence of one or more	 characters.  Unfortunately the POSIX locale primitives give	 us no practical way to find what character sequences might be	 matched.  Treat this approximately like "(.\1)" -- i.e. match	 one character, and then punt to the full matcher.  */      charclass ccl;      zeroset (ccl);      notset (ccl);      addtok (CSET + charclass_index (ccl));      addtok (BACKREF);      addtok (CAT);      tok = lex ();    }  else if (tok == LPAREN)    {      tok = lex();      regexp(0);      if (tok != RPAREN)	dfaerror(_("Unbalanced ("));      tok = lex();    }  else    addtok(EMPTY);}/* Return the number of tokens in the given subexpression. */static intnsubtoks (int tindex){  int ntoks1;  switch (dfa->tokens[tindex - 1])    {    default:      return 1;    case QMARK:    case STAR:    case PLUS:      return 1 + nsubtoks(tindex - 1);    case CAT:    case OR:    case ORTOP:      ntoks1 = nsubtoks(tindex - 1);      return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);    }}/* Copy the given subexpression to the top of the tree. */static voidcopytoks (int tindex, int ntokens){  int i;  for (i = 0; i < ntokens; ++i)    addtok(dfa->tokens[tindex + i]);}static voidclosure (void){  int tindex, ntokens, i;  atom();  while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN)    if (tok == REPMN)      {	ntokens = nsubtoks(dfa->tindex);	tindex = dfa->tindex - ntokens;	if (maxrep < 0)	  addtok(PLUS);	if (minrep == 0)	  addtok(QMARK);	for (i = 1; i < minrep; ++i)	  {	    copytoks(tindex, ntokens);	    addtok(CAT);	  }	for (; i < maxrep; ++i)	  {	    copytoks(tindex, ntokens);	    addtok(QMARK);	    addtok(CAT);	  }	tok = lex();      }    else      {	addtok(tok);	tok = lex();      }}static voidbranch (void){  closure();  while (tok != RPAREN && tok != OR && tok >= 0)    {      closure();      addtok(CAT);    }}static voidregexp (int toplevel){  branch();  while (tok == OR)    {      tok = lex();      branch();      if (toplevel)	addtok(ORTOP);      else	addtok(OR);    }}/* Main entry point for the parser.  S is a string to be parsed, len is the   length of the string, so s can include NUL characters.  D is a pointer to   the struct dfa to parse into. */voiddfaparse (char const *s, size_t len, struct dfa *d){  dfa = d;  lexstart = lexptr = s;  lexleft = len;  lasttok = END;  laststart = 1;  parens = 0;#if ENABLE_NLS  hard_LC_COLLATE = hard_locale (LC_COLLATE);#endif#ifdef MBS_SUPPORT  if (MB_CUR_MAX > 1)    {      cur_mb_index = 0;      cur_mb_len = 0;      memset(&mbs, 0, sizeof(mbstate_t));    }#endif /* MBS_SUPPORT  */  if (! syntax_bits_set)    dfaerror(_("No syntax specified"));  tok = lex();  depth = d->depth;  regexp(1);  if (tok != END)    dfaerror(_("Unbalanced )"));  addtok(END - d->nregexps);  addtok(CAT);  if (d->nregexps)    addtok(ORTOP);  ++d->nregexps;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -