📄 dfa.c

📁 linux平台中
💻 C
📖 第 1 页 / 共 5 页
字号:
  int chars_al, range_sts_al, range_ends_al, ch_classes_al,    equivs_al, coll_elems_al;  REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes,		       dfa->mbcsets_alloc, dfa->nmbcsets + 1);  /* dfa->multibyte_prop[] hold the index of dfa->mbcsets.     We will update dfa->multibyte_prop in addtok(), because we can't     decide the index in dfa->tokens[].  */  /* Initialize work are */  work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]);  chars_al = 1;  range_sts_al = range_ends_al = 0;  ch_classes_al = equivs_al = coll_elems_al = 0;  MALLOC(work_mbc->chars, wchar_t, chars_al);  work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0;  work_mbc->nequivs = work_mbc->ncoll_elems = 0;  work_mbc->chars = work_mbc->ch_classes = NULL;  work_mbc->range_sts = work_mbc->range_ends = NULL;  work_mbc->equivs = work_mbc->coll_elems = NULL;  wc = fetch_wc(_("Unbalanced ["));  if (wc == L'^')    {      wc = fetch_wc(_("Unbalanced ["));      work_mbc->invert = 1;    }  else    work_mbc->invert = 0;  do    {      wc1 = -1; /* mark wc1 is not initialized".  */      /* Note that if we're looking at some other [:...:] construct,	 we just treat it as a bunch of ordinary characters.  We can do	 this because we assume regex has checked for syntax errors before	 dfa is ever called. */      if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES))	{#define BRACKET_BUFFER_SIZE 128	  char str[BRACKET_BUFFER_SIZE];	  wc1 = wc;	  wc = fetch_wc(_("Unbalanced ["));	  /* If pattern contains `[[:', `[[.', or `[[='.  */	  if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'='))	    {	      unsigned char c;	      unsigned char delim = (unsigned char)wc;	      int len = 0;	      for (;;)		{		  if (! lexleft)		    dfaerror (_("Unbalanced ["));		  c = (unsigned char) *lexptr++;		  --lexleft;		  if ((c == delim && *lexptr == ']') || lexleft == 0)		    break;		  if (len < BRACKET_BUFFER_SIZE)		    str[len++] = c;		  else		    /* This is in any case an invalid class name.  */		    str[0] = '\0';		}	      str[len] = '\0';	      if (lexleft == 0)		{		  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,				       work_mbc->nchars + 2);		  work_mbc->chars[work_mbc->nchars++] = L'[';		  work_mbc->chars[work_mbc->nchars++] = delim;		  break; 		}	      if (--lexleft, *lexptr++ != ']')		dfaerror (_("Unbalanced ["));	      if (delim == ':')		/* build character class.  */		{		  wctype_t wt;		  /* Query the character class as wctype_t.  */		  wt = wctype (str);		  if (ch_classes_al == 0)		    MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al);		  REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t,				       ch_classes_al,				       work_mbc->nch_classes + 1);		  work_mbc->ch_classes[work_mbc->nch_classes++] = wt; 		}	      else if (delim == '=' || delim == '.')		{		  char *elem;		  MALLOC(elem, char, len + 1);		  strncpy(elem, str, len + 1);		  if (delim == '=')		    /* build equivalent class.  */		    {		      if (equivs_al == 0)			MALLOC(work_mbc->equivs, char*, ++equivs_al);		      REALLOC_IF_NECESSARY(work_mbc->equivs, char*,					   equivs_al,					   work_mbc->nequivs + 1);		      work_mbc->equivs[work_mbc->nequivs++] = elem;		    }		  if (delim == '.')		    /* build collating element.  */		    {		      if (coll_elems_al == 0)			MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al);		      REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*,					   coll_elems_al,					   work_mbc->ncoll_elems + 1);		      work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem;		    } 		}	      wc = -1;	    }	  else	    /* We treat '[' as a normal character here.  */	    {	      wc2 = wc1; wc1 = wc; wc = wc2; /* swap */	    }	}      else	{	  if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))	    wc = fetch_wc(("Unbalanced ["));	}      if (wc1 == -1)	wc1 = fetch_wc(_("Unbalanced ["));      if (wc1 == L'-')	/* build range characters.  */	{	  wc2 = fetch_wc(_("Unbalanced ["));	  if (wc2 == L']')	    {	      /* In the case [x-], the - is an ordinary hyphen,		 which is left in c1, the lookahead character. */	      lexptr -= cur_mb_len;	      lexleft += cur_mb_len;	      wc2 = wc;	    }	  else	    {	      if (wc2 == L'\\'		  && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))		wc2 = fetch_wc(_("Unbalanced ["));	      wc1 = fetch_wc(_("Unbalanced ["));	    }	  if (range_sts_al == 0)	    {	      MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al);	      MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al);	    }	  REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t,			       range_sts_al, work_mbc->nranges + 1);	  work_mbc->range_sts[work_mbc->nranges] = wc;	  REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t,			       range_ends_al, work_mbc->nranges + 1);	  work_mbc->range_ends[work_mbc->nranges++] = wc2;	}      else if (wc != -1)	/* build normal characters.  */	{	  REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al,			       work_mbc->nchars + 1);	  work_mbc->chars[work_mbc->nchars++] = wc;	}    }  while ((wc = wc1) != L']');}#endif /* MBS_SUPPORT */#ifdef __STDC__#define FUNC(F, P) static int F(int c) { return P(c); }#else#define FUNC(F, P) static int F(c) int c; { return P(c); }#endifFUNC(is_alpha, ISALPHA)FUNC(is_upper, ISUPPER)FUNC(is_lower, ISLOWER)FUNC(is_digit, ISDIGIT)FUNC(is_xdigit, ISXDIGIT)FUNC(is_space, ISSPACE)FUNC(is_punct, ISPUNCT)FUNC(is_alnum, ISALNUM)FUNC(is_print, ISPRINT)FUNC(is_graph, ISGRAPH)FUNC(is_cntrl, ISCNTRL)static intis_blank (int c){   return (c == ' ' || c == '\t');}/* The following list maps the names of the Posix named character classes   to predicate functions that determine whether a given character is in   the class.  The leading [ has already been eaten by the lexical analyzer. */static struct {  const char *name;  int (*pred) PARAMS ((int));} const prednames[] = {  { ":alpha:]", is_alpha },  { ":upper:]", is_upper },  { ":lower:]", is_lower },  { ":digit:]", is_digit },  { ":xdigit:]", is_xdigit },  { ":space:]", is_space },  { ":punct:]", is_punct },  { ":alnum:]", is_alnum },  { ":print:]", is_print },  { ":graph:]", is_graph },  { ":cntrl:]", is_cntrl },  { ":blank:]", is_blank },  { 0 }};/* Return non-zero if C is a `word-constituent' byte; zero otherwise.  */#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')static intlooking_at (char const *s){  size_t len;  len = strlen(s);  if (lexleft < len)    return 0;  return strncmp(s, lexptr, len) == 0;}static tokenlex (void){  unsigned c, c1, c2;  int backslash = 0, invert;  charclass ccl;  int i;  /* Basic plan: We fetch a character.  If it's a backslash,     we set the backslash flag and go through the loop again.     On the plus side, this avoids having a duplicate of the     main switch inside the backslash case.  On the minus side,     it means that just about every case begins with     "if (backslash) ...".  */  for (i = 0; i < 2; ++i)    {      FETCH(c, 0);#ifdef MBS_SUPPORT      if (MB_CUR_MAX > 1 && cur_mb_index)	/* If this is a part of a multi-byte character, we must treat	   this byte data as a normal character.	   e.g. In case of SJIS encoding, some character contains '\',	        but they must not be backslash.  */	goto normal_char;#endif /* MBS_SUPPORT  */      switch (c)	{	case '\\':	  if (backslash)	    goto normal_char;	  if (lexleft == 0)	    dfaerror(_("Unfinished \\ escape"));	  backslash = 1;	  break;	case '^':	  if (backslash)	    goto normal_char;	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS	      || lasttok == END	      || lasttok == LPAREN	      || lasttok == OR)	    return lasttok = BEGLINE;	  goto normal_char;	case '$':	  if (backslash)	    goto normal_char;	  if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS	      || lexleft == 0	      || (syntax_bits & RE_NO_BK_PARENS		  ? lexleft > 0 && *lexptr == ')'		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')')	      || (syntax_bits & RE_NO_BK_VBAR		  ? lexleft > 0 && *lexptr == '|'		  : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|')	      || ((syntax_bits & RE_NEWLINE_ALT)	          && lexleft > 0 && *lexptr == '\n'))	    return lasttok = ENDLINE;	  goto normal_char;	case '1':	case '2':	case '3':	case '4':	case '5':	case '6':	case '7':	case '8':	case '9':	  if (backslash && !(syntax_bits & RE_NO_BK_REFS))	    {	      laststart = 0;	      return lasttok = BACKREF;	    }	  goto normal_char;	case '`':	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))	    return lasttok = BEGLINE;	/* FIXME: should be beginning of string */	  goto normal_char;	case '\'':	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))	    return lasttok = ENDLINE;	/* FIXME: should be end of string */	  goto normal_char;	case '<':	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))	    return lasttok = BEGWORD;	  goto normal_char;	case '>':	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))	    return lasttok = ENDWORD;	  goto normal_char;	case 'b':	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))	    return lasttok = LIMWORD;	  goto normal_char;	case 'B':	  if (backslash && !(syntax_bits & RE_NO_GNU_OPS))	    return lasttok = NOTLIMWORD;	  goto normal_char;	case '?':	  if (syntax_bits & RE_LIMITED_OPS)	    goto normal_char;	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))	    goto normal_char;	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)	    goto normal_char;	  return lasttok = QMARK;	case '*':	  if (backslash)	    goto normal_char;	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)	    goto normal_char;	  return lasttok = STAR;	case '+':	  if (syntax_bits & RE_LIMITED_OPS)	    goto normal_char;	  if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0))	    goto normal_char;	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)	    goto normal_char;	  return lasttok = PLUS;	case '{':	  if (!(syntax_bits & RE_INTERVALS))	    goto normal_char;	  if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))	    goto normal_char;	  if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)	    goto normal_char;	  if (syntax_bits & RE_NO_BK_BRACES)	    {	      /* Scan ahead for a valid interval; if it's not valid,		 treat it as a literal '{'.  */	      int lo = -1, hi = -1;	      char const *p = lexptr;	      char const *lim = p + lexleft;	      for (;  p != lim && ISASCIIDIGIT (*p);  p++)		lo = (lo < 0 ? 0 : lo * 10) + *p - '0';	      if (p != lim && *p == ',')		while (++p != lim && ISASCIIDIGIT (*p))		  hi = (hi < 0 ? 0 : hi * 10) + *p - '0';	      else		hi = lo;	      if (p == lim || *p != '}'		  || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))		goto normal_char;	    }	  minrep = 0;	  /* Cases:	     {M} - exact count	     {M,} - minimum count, maximum is infinity	     {M,N} - M through N */	  FETCH(c, _("unfinished repeat count"));	  if (ISASCIIDIGIT (c))	    {	      minrep = c - '0';	      for (;;)		{		  FETCH(c, _("unfinished repeat count"));		  if (! ISASCIIDIGIT (c))		    break;		  minrep = 10 * minrep + c - '0';		}	    }	  else	    dfaerror(_("malformed repeat count"));	  if (c == ',')	    {	      FETCH (c, _("unfinished repeat count"));	      if (! ISASCIIDIGIT (c))		maxrep = -1;	      else		{		  maxrep = c - '0';		  for (;;)		    {		      FETCH (c, _("unfinished repeat count"));		      if (! ISASCIIDIGIT (c))			break;		      maxrep = 10 * maxrep + c - '0';		    }		  if (0 <= maxrep && maxrep < minrep)		    dfaerror (_("malformed repeat count"));		}	    }	  else	    maxrep = minrep;	  if (!(syntax_bits & RE_NO_BK_BRACES))	    {	      if (c != '\\')		dfaerror(_("malformed repeat count"));	      FETCH(c, _("unfinished repeat count"));	    }	  if (c != '}')	    dfaerror(_("malformed repeat count"));	  laststart = 0;	  return lasttok = REPMN;	case '|':	  if (syntax_bits & RE_LIMITED_OPS)	    goto normal_char;	  if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0))	    goto normal_char;	  laststart = 1;	  return lasttok = OR;	case '\n':	  if (syntax_bits & RE_LIMITED_OPS	      || backslash
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -