📄 dfa.c

📁 linux平台中
💻 C
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/* dfa.c - deterministic extended regexp routines for GNU   Copyright 1988, 1998, 2000 Free Software Foundation, Inc.   This program is free software; you can redistribute it and/or modify   it under the terms of the GNU General Public License as published by   the Free Software Foundation; either version 2, or (at your option)   any later version.   This program is distributed in the hope that it will be useful,   but WITHOUT ANY WARRANTY; without even the implied warranty of   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the   GNU General Public License for more details.   You should have received a copy of the GNU General Public License   along with this program; if not, write to the Free Software   Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA *//* Written June, 1988 by Mike Haertel   Modified July, 1988 by Arthur David Olson to assist BMG speedups  */#ifdef HAVE_CONFIG_H#include <config.h>#endif#include <assert.h>#include <ctype.h>#include <stdio.h>#include <sys/types.h>#ifdef STDC_HEADERS#include <stdlib.h>#elseextern char *calloc(), *malloc(), *realloc();extern void free();#endif#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)#include <string.h>#else#include <strings.h>#endif#if HAVE_SETLOCALE# include <locale.h>#endif#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC/* We can handle multibyte string.  */# define MBS_SUPPORT#endif#ifdef MBS_SUPPORT# include <wchar.h># include <wctype.h>#endif#ifndef DEBUG	/* use the same approach as regex.c */#undef assert#define assert(e)#endif /* DEBUG */#ifndef isgraph#define isgraph(C) (isprint(C) && !isspace(C))#endif#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))#define ISALPHA(C) isalpha(C)#define ISUPPER(C) isupper(C)#define ISLOWER(C) islower(C)#define ISDIGIT(C) isdigit(C)#define ISXDIGIT(C) isxdigit(C)#define ISSPACE(C) isspace(C)#define ISPUNCT(C) ispunct(C)#define ISALNUM(C) isalnum(C)#define ISPRINT(C) isprint(C)#define ISGRAPH(C) isgraph(C)#define ISCNTRL(C) iscntrl(C)#else#define ISALPHA(C) (isascii(C) && isalpha(C))#define ISUPPER(C) (isascii(C) && isupper(C))#define ISLOWER(C) (isascii(C) && islower(C))#define ISDIGIT(C) (isascii(C) && isdigit(C))#define ISXDIGIT(C) (isascii(C) && isxdigit(C))#define ISSPACE(C) (isascii(C) && isspace(C))#define ISPUNCT(C) (isascii(C) && ispunct(C))#define ISALNUM(C) (isascii(C) && isalnum(C))#define ISPRINT(C) (isascii(C) && isprint(C))#define ISGRAPH(C) (isascii(C) && isgraph(C))#define ISCNTRL(C) (isascii(C) && iscntrl(C))#endif/* ISASCIIDIGIT differs from ISDIGIT, as follows:   - Its arg may be any int or unsigned int; it need not be an unsigned char.   - It's guaranteed to evaluate its argument exactly once.   - It's typically faster.   Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that   only '0' through '9' are digits.  Prefer ISASCIIDIGIT to ISDIGIT unless   it's important to use the locale's definition of `digit' even when the   host does not conform to Posix.  */#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)/* If we (don't) have I18N.  *//* glibc defines _ */#ifndef _# ifdef HAVE_LIBINTL_H#  include <libintl.h>#  ifndef _#   define _(Str) gettext (Str)#  endif# else#  define _(Str) (Str)# endif#endif#include "regex.h"#include "dfa.h"#include "hard-locale.h"/* HPUX, define those as macros in sys/param.h */#ifdef setbit# undef setbit#endif#ifdef clrbit# undef clrbit#endifstatic void dfamust PARAMS ((struct dfa *dfa));static void regexp PARAMS ((int toplevel));static ptr_txcalloc (size_t n, size_t s){  ptr_t r = calloc(n, s);  if (!r)    dfaerror(_("Memory exhausted"));  return r;}static ptr_txmalloc (size_t n){  ptr_t r = malloc(n);  assert(n != 0);  if (!r)    dfaerror(_("Memory exhausted"));  return r;}static ptr_txrealloc (ptr_t p, size_t n){  ptr_t r = realloc(p, n);  assert(n != 0);  if (!r)    dfaerror(_("Memory exhausted"));  return r;}#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))/* Reallocate an array of type t if nalloc is too small for index. */#define REALLOC_IF_NECESSARY(p, t, nalloc, index) \  if ((index) >= (nalloc))			  \    {						  \      do					  \	(nalloc) *= 2;				  \      while ((index) >= (nalloc));		  \      REALLOC(p, t, nalloc);			  \    }#ifdef DEBUGstatic voidprtok (token t){  char const *s;  if (t < 0)    fprintf(stderr, "END");  else if (t < NOTCHAR)    fprintf(stderr, "%c", t);  else    {      switch (t)	{	case EMPTY: s = "EMPTY"; break;	case BACKREF: s = "BACKREF"; break;	case BEGLINE: s = "BEGLINE"; break;	case ENDLINE: s = "ENDLINE"; break;	case BEGWORD: s = "BEGWORD"; break;	case ENDWORD: s = "ENDWORD"; break;	case LIMWORD: s = "LIMWORD"; break;	case NOTLIMWORD: s = "NOTLIMWORD"; break;	case QMARK: s = "QMARK"; break;	case STAR: s = "STAR"; break;	case PLUS: s = "PLUS"; break;	case CAT: s = "CAT"; break;	case OR: s = "OR"; break;	case ORTOP: s = "ORTOP"; break;	case LPAREN: s = "LPAREN"; break;	case RPAREN: s = "RPAREN"; break;	case CRANGE: s = "CRANGE"; break;#ifdef MBS_SUPPORT	case ANYCHAR: s = "ANYCHAR"; break;	case MBCSET: s = "MBCSET"; break;#endif /* MBS_SUPPORT */	default: s = "CSET"; break;	}      fprintf(stderr, "%s", s);    }}#endif /* DEBUG *//* Stuff pertaining to charclasses. */static inttstbit (unsigned b, charclass c){  return c[b / INTBITS] & 1 << b % INTBITS;}static voidsetbit (unsigned b, charclass c){  c[b / INTBITS] |= 1 << b % INTBITS;}static voidclrbit (unsigned b, charclass c){  c[b / INTBITS] &= ~(1 << b % INTBITS);}static voidcopyset (charclass src, charclass dst){  memcpy (dst, src, sizeof (charclass));}static voidzeroset (charclass s){  memset (s, 0, sizeof (charclass));}static voidnotset (charclass s){  int i;  for (i = 0; i < CHARCLASS_INTS; ++i)    s[i] = ~s[i];}static intequal (charclass s1, charclass s2){  return memcmp (s1, s2, sizeof (charclass)) == 0;}/* A pointer to the current dfa is kept here during parsing. */static struct dfa *dfa;/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */static intcharclass_index (charclass s){  int i;  for (i = 0; i < dfa->cindex; ++i)    if (equal(s, dfa->charclasses[i]))      return i;  REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex);  ++dfa->cindex;  copyset(s, dfa->charclasses[i]);  return i;}/* Syntax bits controlling the behavior of the lexical analyzer. */static reg_syntax_t syntax_bits, syntax_bits_set;/* Flag for case-folding letters into sets. */static int case_fold;/* End-of-line byte in data.  */static unsigned char eolbyte;/* Entry point to set syntax options. */voiddfasyntax (reg_syntax_t bits, int fold, unsigned char eol){  syntax_bits_set = 1;  syntax_bits = bits;  case_fold = fold;  eolbyte = eol;}/* Like setbit, but if case is folded, set both cases of a letter.  */static voidsetbit_case_fold (unsigned b, charclass c){  setbit (b, c);  if (case_fold)    {      if (ISUPPER (b))	setbit (tolower (b), c);      else if (ISLOWER (b))	setbit (toupper (b), c);    }}/* Lexical analyzer.  All the dross that deals with the obnoxious   GNU Regex syntax bits is located here.  The poor, suffering   reader is referred to the GNU Regex documentation for the   meaning of the @#%!@#%^!@ syntax bits. */static char const *lexstart;	/* Pointer to beginning of input string. */static char const *lexptr;	/* Pointer to next input character. */static int lexleft;		/* Number of characters remaining. */static token lasttok;		/* Previous token returned; initially END. */static int laststart;		/* True if we're separated from beginning or (, |				   only by zero-width characters. */static int parens;		/* Count of outstanding left parens. */static int minrep, maxrep;	/* Repeat counts for {m,n}. */static int hard_LC_COLLATE;	/* Nonzero if LC_COLLATE is hard.  */#ifdef MBS_SUPPORT/* These variables are used only if (MB_CUR_MAX > 1).  */static mbstate_t mbs;		/* Mbstate for mbrlen().  */static int cur_mb_len;		/* Byte length of the current scanning				   multibyte character.  */static int cur_mb_index;        /* Byte index of the current scanning multibyte                                   character.				   singlebyte character : cur_mb_index = 0				   multibyte character				       1st byte : cur_mb_index = 1				       2nd byte : cur_mb_index = 2				         ...				       nth byte : cur_mb_index = n  */static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec().                                  Each element store the amount of remain                                  byte of corresponding multibyte character                                  in the input string.  A element's value                                  is 0 if corresponding character is a                                  singlebyte chracter.                                  e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)>                                   mblen_buf :   0,       3,       2,       1                               */static wchar_t *inputwcs;	/* Wide character representation of input				   string in dfaexec().				   The length of this array is same as				   the length of input string(char array).				   inputstring[i] is a single-byte char,				   or 1st byte of a multibyte char.				   And inputwcs[i] is the codepoint.  */static unsigned char const *buf_begin;/* refference to begin in dfaexec().  */static unsigned char const *buf_end;	/* refference to end in dfaexec().  */#endif /* MBS_SUPPORT  */#ifdef MBS_SUPPORT/* This function update cur_mb_len, and cur_mb_index.   p points current lexptr, len is the remaining buffer length.  */static voidupdate_mb_len_index (unsigned char const *p, int len){  /* If last character is a part of a multibyte character,     we update cur_mb_index.  */  if (cur_mb_index)    cur_mb_index = (cur_mb_index >= cur_mb_len)? 0			: cur_mb_index + 1;  /* If last character is a single byte character, or the     last portion of a multibyte character, we check whether     next character is a multibyte character or not.  */  if (! cur_mb_index)    {      cur_mb_len = mbrlen(p, len, &mbs);      if (cur_mb_len > 1)	/* It is a multibyte character.	   cur_mb_len was already set by mbrlen().  */	cur_mb_index = 1;      else if (cur_mb_len < 1)	/* Invalid sequence.  We treat it as a singlebyte character.	   cur_mb_index is aleady 0.  */	cur_mb_len = 1;      /* Otherwise, cur_mb_len == 1, it is a singlebyte character.	 cur_mb_index is aleady 0.  */    }}#endif /* MBS_SUPPORT */#ifdef MBS_SUPPORT/* Note that characters become unsigned here. */# define FETCH(c, eoferr)			\  {						\    if (! lexleft)				\     {						\	if (eoferr != 0)			\	  dfaerror (eoferr);			\	else					\	  return lasttok = END;			\      }						\    if (MB_CUR_MAX > 1)				\      update_mb_len_index(lexptr, lexleft);	\    (c) = (unsigned char) *lexptr++;		\    --lexleft;					\  }/* This function fetch a wide character, and update cur_mb_len,   used only if the current locale is a multibyte environment.  */static wchar_tfetch_wc (char const *eoferr){  wchar_t wc;  if (! lexleft)    {      if (eoferr != 0)	dfaerror (eoferr);      else	return -1;    }  cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs);  if (cur_mb_len <= 0)   {      cur_mb_len = 1;      wc = *lexptr;    }  lexptr += cur_mb_len;  lexleft -= cur_mb_len;  return wc;}#else/* Note that characters become unsigned here. */# define FETCH(c, eoferr)   	      \  {			   	      \    if (! lexleft)	   	      \      {				      \	if (eoferr != 0)	      \	  dfaerror (eoferr);	      \	else		   	      \	  return lasttok = END;	      \      }				      \    (c) = (unsigned char) *lexptr++;  \    --lexleft;		   	      \  }#endif /* MBS_SUPPORT */#ifdef MBS_SUPPORT/* Multibyte character handling sub-routin for lex.   This function  parse a bracket expression and build a struct   mb_char_classes.  */static voidparse_bracket_exp_mb (){  wchar_t wc, wc1, wc2;  /* Work area to build a mb_char_classes.  */  struct mb_char_classes *work_mbc;
12 3 4 5 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -