regex.cpp

来自「Shorthand是一个强大的脚本语言」· C++ 代码 · 共 2,106 行 · 第 1/5 页

CPP
2,106
字号
} fail_stack_type;#endif /* INT_IS_16BIT */#define FAIL_STACK_EMPTY()     (fail_stack.avail == 0)#define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)#define FAIL_STACK_FULL()      (fail_stack.avail == fail_stack.size)/* Define macros to initialize and free the failure stack.   Do `return -2' if the alloc fails.  */#ifdef MATCH_MAY_ALLOCATE# define INIT_FAIL_STACK()						\  do {									\    fail_stack.stack = (fail_stack_elt_t *)				\      REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * sizeof (fail_stack_elt_t)); \									\    if (fail_stack.stack == NULL)					\      return -2;							\									\    fail_stack.size = INIT_FAILURE_ALLOC;				\    fail_stack.avail = 0;						\  } while (0)# define RESET_FAIL_STACK()  REGEX_FREE_STACK (fail_stack.stack)#else# define INIT_FAIL_STACK()						\  do {									\    fail_stack.avail = 0;						\  } while (0)# define RESET_FAIL_STACK()#endif/* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.   Return 1 if succeeds, and 0 if either ran out of memory   allocating space for it or it was already too large.   REGEX_REALLOCATE_STACK requires `destination' be declared.   */#define DOUBLE_FAIL_STACK(fail_stack)					\  ((fail_stack).size > (unsigned) (re_max_failures * MAX_FAILURE_ITEMS)	\   ? 0									\   : ((fail_stack).stack = (fail_stack_elt_t *)				\        REGEX_REALLOCATE_STACK ((fail_stack).stack, 			\          (fail_stack).size * sizeof (fail_stack_elt_t),		\          ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)),	\									\      (fail_stack).stack == NULL					\      ? 0								\      : ((fail_stack).size <<= 1, 					\         1)))/* Push pointer POINTER on FAIL_STACK.   Return 1 if was able to do so and 0 if ran out of memory allocating   space to do so.  */#define PUSH_PATTERN_OP(POINTER, FAIL_STACK)				\  ((FAIL_STACK_FULL ()							\    && !DOUBLE_FAIL_STACK (FAIL_STACK))					\   ? 0									\   : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER,	\      1))/* Push a pointer value onto the failure stack.   Assumes the variable `fail_stack'.  Probably should only   be called from within `PUSH_FAILURE_POINT'.  */#define PUSH_FAILURE_POINTER(item)					\  fail_stack.stack[fail_stack.avail++].pointer = (unsigned char *) (item)/* This pushes an integer-valued item onto the failure stack.   Assumes the variable `fail_stack'.  Probably should only   be called from within `PUSH_FAILURE_POINT'.  */#define PUSH_FAILURE_INT(item)					\  fail_stack.stack[fail_stack.avail++].integer = (item)/* Push a fail_stack_elt_t value onto the failure stack.   Assumes the variable `fail_stack'.  Probably should only   be called from within `PUSH_FAILURE_POINT'.  */#define PUSH_FAILURE_ELT(item)					\  fail_stack.stack[fail_stack.avail++] =  (item)/* These three POP... operations complement the three PUSH... operations.   All assume that `fail_stack' is nonempty.  */#define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer#define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer#define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]/* Used to omit pushing failure point id's when we're not debugging.  */#ifdef DEBUG# define DEBUG_PUSH PUSH_FAILURE_INT# define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()#else# define DEBUG_PUSH(item)# define DEBUG_POP(item_addr)#endif/* Push the information about the state we will need   if we ever fail back to it.   Requires variables fail_stack, regstart, regend, reg_info, and   num_regs_pushed be declared.  DOUBLE_FAIL_STACK requires `destination'   be declared.   Does `return FAILURE_CODE' if runs out of memory.  */#define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code)	\  do {									\    char *destination;							\    /* Must be int, so when we don't save any registers, the arithmetic	\       of 0 + -1 isn't done as unsigned.  */				\    /* Can't be int, since there is not a shred of a guarantee that int	\       is wide enough to hold a value of something to which pointer can	\       be assigned */							\    active_reg_t this_reg;						\    									\    DEBUG_STATEMENT (failure_id++);					\    DEBUG_STATEMENT (nfailure_points_pushed++);				\    DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id);		\    DEBUG_PRINT2 ("  Before push, next avail: %d\n", (fail_stack).avail);\    DEBUG_PRINT2 ("                     size: %d\n", (fail_stack).size);\									\    DEBUG_PRINT2 ("  slots needed: %ld\n", NUM_FAILURE_ITEMS);		\    DEBUG_PRINT2 ("     available: %d\n", REMAINING_AVAIL_SLOTS);	\									\    /* Ensure we have enough space allocated for what we will push.  */	\    while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS)			\      {									\        if (!DOUBLE_FAIL_STACK (fail_stack))				\          return failure_code;						\									\        DEBUG_PRINT2 ("\n  Doubled stack; size now: %d\n",		\		       (fail_stack).size);				\        DEBUG_PRINT2 ("  slots available: %d\n", REMAINING_AVAIL_SLOTS);\      }									\									\    /* Push the info, starting with the registers.  */			\    DEBUG_PRINT1 ("\n");						\									\    if (1)								\      for (this_reg = lowest_active_reg; this_reg <= highest_active_reg; \	   this_reg++)							\	{								\	  DEBUG_PRINT2 ("  Pushing reg: %lu\n", this_reg);		\	  DEBUG_STATEMENT (num_regs_pushed++);				\									\	  DEBUG_PRINT2 ("    start: %p\n", regstart[this_reg]);		\	  PUSH_FAILURE_POINTER (regstart[this_reg]);			\									\	  DEBUG_PRINT2 ("    end: %p\n", regend[this_reg]);		\	  PUSH_FAILURE_POINTER (regend[this_reg]);			\									\	  DEBUG_PRINT2 ("    info: %p\n      ",				\			reg_info[this_reg].word.pointer);		\	  DEBUG_PRINT2 (" match_null=%d",				\			REG_MATCH_NULL_STRING_P (reg_info[this_reg]));	\	  DEBUG_PRINT2 (" active=%d", IS_ACTIVE (reg_info[this_reg]));	\	  DEBUG_PRINT2 (" matched_something=%d",			\			MATCHED_SOMETHING (reg_info[this_reg]));	\	  DEBUG_PRINT2 (" ever_matched=%d",				\			EVER_MATCHED_SOMETHING (reg_info[this_reg]));	\	  DEBUG_PRINT1 ("\n");						\	  PUSH_FAILURE_ELT (reg_info[this_reg].word);			\	}								\									\    DEBUG_PRINT2 ("  Pushing  low active reg: %ld\n", lowest_active_reg);\    PUSH_FAILURE_INT (lowest_active_reg);				\									\    DEBUG_PRINT2 ("  Pushing high active reg: %ld\n", highest_active_reg);\    PUSH_FAILURE_INT (highest_active_reg);				\									\    DEBUG_PRINT2 ("  Pushing pattern %p:\n", pattern_place);		\    DEBUG_PRINT_COMPILED_PATTERN (bufp, pattern_place, pend);		\    PUSH_FAILURE_POINTER (pattern_place);				\									\    DEBUG_PRINT2 ("  Pushing string %p: `", string_place);		\    DEBUG_PRINT_DOUBLE_STRING (string_place, string1, size1, string2,   \				 size2);				\    DEBUG_PRINT1 ("'\n");						\    PUSH_FAILURE_POINTER (string_place);				\									\    DEBUG_PRINT2 ("  Pushing failure id: %u\n", failure_id);		\    DEBUG_PUSH (failure_id);						\  } while (0)/* This is the number of items that are pushed and popped on the stack   for each register.  */#define NUM_REG_ITEMS  3/* Individual items aside from the registers.  */#ifdef DEBUG# define NUM_NONREG_ITEMS 5 /* Includes failure point id.  */#else# define NUM_NONREG_ITEMS 4#endif/* We push at most this many items on the stack.  *//* We used to use (num_regs - 1), which is the number of registers   this regexp will save; but that was changed to 5   to avoid stack overflow for a regexp with lots of parens.  */#define MAX_FAILURE_ITEMS (5 * NUM_REG_ITEMS + NUM_NONREG_ITEMS)/* We actually push this many items.  */#define NUM_FAILURE_ITEMS				\  (((0							\     ? 0 : highest_active_reg - lowest_active_reg + 1)	\    * NUM_REG_ITEMS)					\   + NUM_NONREG_ITEMS)/* How many items can still be added to the stack without overflowing it.  */#define REMAINING_AVAIL_SLOTS ((fail_stack).size - (fail_stack).avail)/* Pops what PUSH_FAIL_STACK pushes.   We restore into the parameters, all of which should be lvalues:     STR -- the saved data position.     PAT -- the saved pattern position.     LOW_REG, HIGH_REG -- the highest and lowest active registers.     REGSTART, REGEND -- arrays of string positions.     REG_INFO -- array of information about each subexpression.   Also assumes the variables `fail_stack' and (if debugging), `bufp',   `pend', `string1', `size1', `string2', and `size2'.  */#define POP_FAILURE_POINT(str, pat, low_reg, high_reg, regstart, regend, reg_info)\{									\  DEBUG_STATEMENT (unsigned failure_id;)				\  active_reg_t this_reg;						\  const unsigned char *string_temp;					\									\  assert (!FAIL_STACK_EMPTY ());					\									\  /* Remove failure points and point to how many regs pushed.  */	\  DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");				\  DEBUG_PRINT2 ("  Before pop, next avail: %d\n", fail_stack.avail);	\  DEBUG_PRINT2 ("                    size: %d\n", fail_stack.size);	\									\  assert (fail_stack.avail >= NUM_NONREG_ITEMS);			\									\  DEBUG_POP (&failure_id);						\  DEBUG_PRINT2 ("  Popping failure id: %u\n", failure_id);		\									\  /* If the saved string location is NULL, it came from an		\     on_failure_keep_string_jump opcode, and we want to throw away the	\     saved NULL, thus retaining our current position in the string.  */	\  string_temp = POP_FAILURE_POINTER ();					\  if (string_temp != NULL)						\    str = (const char *) string_temp;					\									\  DEBUG_PRINT2 ("  Popping string %p: `", str);				\  DEBUG_PRINT_DOUBLE_STRING (str, string1, size1, string2, size2);	\  DEBUG_PRINT1 ("'\n");							\									\  pat = (unsigned char *) POP_FAILURE_POINTER ();			\  DEBUG_PRINT2 ("  Popping pattern %p:\n", pat);			\  DEBUG_PRINT_COMPILED_PATTERN (bufp, pat, pend);			\									\  /* Restore register info.  */						\  high_reg = (active_reg_t) POP_FAILURE_INT ();				\  DEBUG_PRINT2 ("  Popping high active reg: %ld\n", high_reg);		\									\  low_reg = (active_reg_t) POP_FAILURE_INT ();				\  DEBUG_PRINT2 ("  Popping  low active reg: %ld\n", low_reg);		\									\  if (1)								\    for (this_reg = high_reg; this_reg >= low_reg; this_reg--)		\      {									\	DEBUG_PRINT2 ("    Popping reg: %ld\n", this_reg);		\									\	reg_info[this_reg].word = POP_FAILURE_ELT ();			\	DEBUG_PRINT2 ("      info: %p\n",				\		      reg_info[this_reg].word.pointer);			\									\	regend[this_reg] = (const char *) POP_FAILURE_POINTER ();	\	DEBUG_PRINT2 ("      end: %p\n", regend[this_reg]);		\									\	regstart[this_reg] = (const char *) POP_FAILURE_POINTER ();	\	DEBUG_PRINT2 ("      start: %p\n", regstart[this_reg]);		\      }									\  else									\    {									\      for (this_reg = highest_active_reg; this_reg > high_reg; this_reg--) \	{								\	  reg_info[this_reg].word.integer = 0;				\	  regend[this_reg] = 0;						\	  regstart[this_reg] = 0;					\	}								\      highest_active_reg = high_reg;					\    }									\									\  set_regs_matched_done = 0;						\  DEBUG_STATEMENT (nfailure_points_popped++);				\} /* POP_FAILURE_POINT *//* Structure for per-register (a.k.a. per-group) information.   Other register information, such as the   starting and ending positions (which are addresses), and the list of   inner groups (which is a bits list) are maintained in separate   variables.   We are making a (strictly speaking) nonportable assumption here: that   the compiler will pack our bit fields into something that fits into   the type of `word', i.e., is something that fits into one item on the   failure stack.  *//* Declarations and macros for re_match_2.  */typedef union{  fail_stack_elt_t word;  struct  {      /* This field is one if this group can match the empty string,         zero if not.  If not yet determined,  `MATCH_NULL_UNSET_VALUE'.  */#define MATCH_NULL_UNSET_VALUE 3    unsigned match_null_string_p : 2;    unsigned is_active : 1;    unsigned matched_something : 1;    unsigned ever_matched_something : 1;  } bits;} register_info_type;#define REG_MATCH_NULL_STRING_P(R)  ((R).bits.match_null_string_p)#define IS_ACTIVE(R)  ((R).bits.is_active)#define MATCHED_SOMETHING(R)  ((R).bits.matched_something)#define EVER_MATCHED_SOMETHING(R)  ((R).bits.ever_matched_something)/* Call this when have matched a real character; it sets `matched' flags   for the subexpressions which we are currently inside.  Also records   that those subexprs have matched.  */#define SET_REGS_MATCHED()						\  do									\    {									\      if (!set_regs_matched_done)					\	{								\	  active_reg_t r;						\	  set_regs_matched_done = 1;					\	  for (r = lowest_active_reg; r <= highest_active_reg; r++)	\	    {								\	      MATCHED_SOMETHING (reg_info[r])				\		= EVER_MATCHED_SOMETHING (reg_info[r])			\		= 1;							\	    }								\	}								\    }									\  while (0)/* Registers are set to a sentinel when they haven't yet matched.  */static char reg_unset_dummy;#define REG_UNSET_VALUE (&reg_unset_dummy)#define REG_UNSET(e) ((e) == REG_UNSET_VALUE)/* Subroutine declarations and macros for regex_compile.  */static reg_errcode_t regex_compile _RE_ARGS ((const char *pattern, size_t size,					      reg_syntax_t syntax,					      struct re_pattern_buffer *bufp));static void store_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc, int arg));static void store_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,				 int arg1, int arg2));static void insert_op1 _RE_ARGS ((re_opcode_t op, unsigned char *loc,				  int arg, unsigned char *end));static void insert_op2 _RE_ARGS ((re_opcode_t op, unsigned char *loc,				  int arg1, int arg2, unsigned char *end));static boolean at_begline_loc_p _RE_ARGS ((const char *pattern, const char *p,					   reg_syntax_t syntax));static boolean at_endline_loc_p _RE_ARGS ((const char *p, const char *pend,					   reg_syntax_t syntax));static reg_errcode_t compile_range _RE_ARGS ((unsigned int range_start,					      const char **p_ptr,					      const char *pend,					      char *translate,					      reg_syntax_t syntax,					      unsigned char *b));/* Fetch the next character in the uncompiled pattern---translating it   if necessary.  Also cast from a signed character in the constant   string passed to us by the user to an unsigned char that we can use   as an array index (in, e.g., `translate').  */#ifndef PATFETCH# define PATFETCH(c)							\  do {if (p == pend) return REG_EEND;					\    c = (unsigned char) *p++;						\    if (translate) c = (unsigned char) translate[c];			\  } while (0)#endif/* Fetch the next character in the uncompiled pattern, with no   translation.  */#define PATFETCH_RAW(c)							\  do {if (p == pend) return REG_EEND;					\    c = (unsigned char) *p++; 						\  } while (0)/* Go backwards one character in the pattern.  */#define PATUNFETCH p--/* If `translate' is non-null, return translate[D], else just D.  We   cast the subscript to translate because some data is declared as   `char *', to avoid warnings when a string constant is passed.  But   when we use a character as a subscript we must make it unsigned.  */#ifndef TRANSLATE# define TRANSLATE(d) \  (translate ? (char) translate[(unsigned char) (d)] : (d))#endif/* Macros for outputting the compiled pattern into `buffer'.  *//* If the buffer isn't allocated when it comes in, use this.  */#define INIT_BUF_SIZE  32

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?