⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regex.c

📁 这是一个同样来自贝尔实验室的和UNIX有着渊源的操作系统, 其简洁的设计和实现易于我们学习和理解
💻 C
📖 第 1 页 / 共 5 页
字号:
/* Structure to manage work area for range table.  */struct range_table_work_area{  int *table;			/* actual work area.  */  int allocated;		/* allocated size for work area in bytes.  */  int used;			/* actually used size in words.	 */};/* Make sure that WORK_AREA can hold more N multibyte characters.  */#define EXTEND_RANGE_TABLE_WORK_AREA(work_area, n)			  \  do {									  \    if (((work_area).used + (n)) * sizeof (int) > (work_area).allocated)  \      {									  \	(work_area).allocated += 16 * sizeof (int);			  \	if ((work_area).table)						  \	  (work_area).table						  \	    = (int *) realloc ((work_area).table, (work_area).allocated); \	else								  \	  (work_area).table						  \	    = (int *) malloc ((work_area).allocated);			  \	if ((work_area).table == 0)					  \	  FREE_STACK_RETURN (REG_ESPACE);				  \      }									  \  } while (0)/* Set a range (RANGE_START, RANGE_END) to WORK_AREA.  */#define SET_RANGE_TABLE_WORK_AREA(work_area, range_start, range_end)	\  do {									\    EXTEND_RANGE_TABLE_WORK_AREA ((work_area), 2);			\    (work_area).table[(work_area).used++] = (range_start);		\    (work_area).table[(work_area).used++] = (range_end);		\  } while (0)/* Free allocated memory for WORK_AREA.	 */#define FREE_RANGE_TABLE_WORK_AREA(work_area)	\  do {						\    if ((work_area).table)			\      free ((work_area).table);			\  } while (0)#define CLEAR_RANGE_TABLE_WORK_USED(work_area) ((work_area).used = 0)#define RANGE_TABLE_WORK_USED(work_area) ((work_area).used)#define RANGE_TABLE_WORK_ELT(work_area, i) ((work_area).table[i])/* Set the bit for character C in a list.  */#define SET_LIST_BIT(c)				      \  (b[((unsigned char) (c)) / BYTEWIDTH]		      \   |= 1 << (((unsigned char) c) % BYTEWIDTH))/* Get the next unsigned number in the uncompiled pattern.  */#define GET_UNSIGNED_NUMBER(num)					\  { if (p != pend)							\     {									\       PATFETCH (c);							\       while (ISDIGIT (c))						\	 {								\	   if (num < 0)							\	      num = 0;							\	   num = num * 10 + c - '0';					\	   if (p == pend)						\	      break;							\	   PATFETCH (c);						\	 }								\       }								\    }#define CHAR_CLASS_MAX_LENGTH  6 /* Namely, `xdigit'.  */#define IS_CHAR_CLASS(string)						\   (STREQ (string, "alpha") || STREQ (string, "upper")			\    || STREQ (string, "lower") || STREQ (string, "digit")		\    || STREQ (string, "alnum") || STREQ (string, "xdigit")		\    || STREQ (string, "space") || STREQ (string, "print")		\    || STREQ (string, "punct") || STREQ (string, "graph")		\    || STREQ (string, "cntrl") || STREQ (string, "blank"))#ifndef MATCH_MAY_ALLOCATE/* If we cannot allocate large objects within re_match_2_internal,   we make the fail stack and register vectors global.   The fail stack, we grow to the maximum size when a regexp   is compiled.   The register vectors, we adjust in size each time we   compile a regexp, according to the number of registers it needs.  */static fail_stack_type fail_stack;/* Size with which the following vectors are currently allocated.   That is so we can make them bigger as needed,   but never make them smaller.	 */static int regs_allocated_size;static const char **	 regstart, **	  regend;static const char ** old_regstart, ** old_regend;static const char **best_regstart, **best_regend;static register_info_type *reg_info;static const char **reg_dummy;static register_info_type *reg_info_dummy;/* Make the register vectors big enough for NUM_REGS registers,   but don't make them smaller.	 */staticregex_grow_registers (num_regs)     int num_regs;{  if (num_regs > regs_allocated_size)    {      RETALLOC_IF (regstart,	 num_regs, const char *);      RETALLOC_IF (regend,	 num_regs, const char *);      RETALLOC_IF (old_regstart, num_regs, const char *);      RETALLOC_IF (old_regend,	 num_regs, const char *);      RETALLOC_IF (best_regstart, num_regs, const char *);      RETALLOC_IF (best_regend,	 num_regs, const char *);      RETALLOC_IF (reg_info,	 num_regs, register_info_type);      RETALLOC_IF (reg_dummy,	 num_regs, const char *);      RETALLOC_IF (reg_info_dummy, num_regs, register_info_type);      regs_allocated_size = num_regs;    }}#endif /* not MATCH_MAY_ALLOCATE *//* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.   Returns one of error codes defined in `regex.h', or zero for success.   Assumes the `allocated' (and perhaps `buffer') and `translate'   fields are set in BUFP on entry.   If it succeeds, results are put in BUFP (if it returns an error, the   contents of BUFP are undefined):     `buffer' is the compiled pattern;     `syntax' is set to SYNTAX;     `used' is set to the length of the compiled pattern;     `fastmap_accurate' is zero;     `re_nsub' is the number of subexpressions in PATTERN;     `not_bol' and `not_eol' are zero;   The `fastmap' and `newline_anchor' fields are neither   examined nor set.  *//* Return, freeing storage we allocated.  */#define FREE_STACK_RETURN(value)		\  do {							\    FREE_RANGE_TABLE_WORK_AREA (range_table_work);	\    free (compile_stack.stack);				\    return value;					\  } while (0)static reg_errcode_tregex_compile (pattern, size, syntax, bufp)     const char *pattern;     int size;     reg_syntax_t syntax;     struct re_pattern_buffer *bufp;{  /* We fetch characters from PATTERN here.  Even though PATTERN is     `char *' (i.e., signed), we declare these variables as unsigned, so     they can be reliably used as array indices.  */  register unsigned int c, c1;  /* A random temporary spot in PATTERN.  */  const char *p1;  /* Points to the end of the buffer, where we should append.  */  register unsigned char *b;  /* Keeps track of unclosed groups.  */  compile_stack_type compile_stack;  /* Points to the current (ending) position in the pattern.  */#ifdef AIX  /* `const' makes AIX compiler fail.  */  char *p = pattern;#else  const char *p = pattern;#endif  const char *pend = pattern + size;  /* How to translate the characters in the pattern.  */  RE_TRANSLATE_TYPE translate = bufp->translate;  /* Address of the count-byte of the most recently inserted `exactn'     command.  This makes it possible to tell if a new exact-match     character can be added to that command or if the character requires     a new `exactn' command.  */  unsigned char *pending_exact = 0;  /* Address of start of the most recently finished expression.     This tells, e.g., postfix * where to find the start of its     operand.  Reset at the beginning of groups and alternatives.  */  unsigned char *laststart = 0;  /* Address of beginning of regexp, or inside of last group.  */  unsigned char *begalt;  /* Place in the uncompiled pattern (i.e., the {) to     which to go back if the interval is invalid.  */  const char *beg_interval;  /* Address of the place where a forward jump should go to the end of     the containing expression.	 Each alternative of an `or' -- except the     last -- ends with a forward jump of this sort.  */  unsigned char *fixup_alt_jump = 0;  /* Counts open-groups as they are encountered.  Remembered for the     matching close-group on the compile stack, so the same register     number is put in the stop_memory as the start_memory.  */  regnum_t regnum = 0;  /* Work area for range table of charset.  */  struct range_table_work_area range_table_work;#ifdef DEBUG  DEBUG_PRINT1 ("\nCompiling pattern: ");  if (debug)    {      unsigned debug_count;      for (debug_count = 0; debug_count < size; debug_count++)	putchar (pattern[debug_count]);      putchar ('\n');    }#endif /* DEBUG */  /* Initialize the compile stack.  */  compile_stack.stack = TALLOC (INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);  if (compile_stack.stack == NULL)    return REG_ESPACE;  compile_stack.size = INIT_COMPILE_STACK_SIZE;  compile_stack.avail = 0;  range_table_work.table = 0;  range_table_work.allocated = 0;  /* Initialize the pattern buffer.  */  bufp->syntax = syntax;  bufp->fastmap_accurate = 0;  bufp->not_bol = bufp->not_eol = 0;  /* Set `used' to zero, so that if we return an error, the pattern     printer (for debugging) will think there's no pattern.  We reset it     at the end.  */  bufp->used = 0;  /* Always count groups, whether or not bufp->no_sub is set.  */  bufp->re_nsub = 0;#ifdef emacs  /* bufp->multibyte is set before regex_compile is called, so don't alter     it. */#else  /* not emacs */  /* Nothing is recognized as a multibyte character.  */  bufp->multibyte = 0;#endif#if !defined (emacs) && !defined (SYNTAX_TABLE)  /* Initialize the syntax table.  */   init_syntax_once ();#endif  if (bufp->allocated == 0)    {      if (bufp->buffer)	{ /* If zero allocated, but buffer is non-null, try to realloc	     enough space.  This loses if buffer's address is bogus, but	     that is the user's responsibility.	 */	  RETALLOC (bufp->buffer, INIT_BUF_SIZE, unsigned char);	}      else	{ /* Caller did not allocate a buffer.	Do it for them.	 */	  bufp->buffer = TALLOC (INIT_BUF_SIZE, unsigned char);	}      if (!bufp->buffer) FREE_STACK_RETURN (REG_ESPACE);      bufp->allocated = INIT_BUF_SIZE;    }  begalt = b = bufp->buffer;  /* Loop through the uncompiled pattern until we're at the end.  */  while (p != pend)    {      PATFETCH (c);      switch (c)	{	case '^':	  {	    if (   /* If at start of pattern, it's an operator.	 */		   p == pattern + 1		   /* If context independent, it's an operator.	 */		|| syntax & RE_CONTEXT_INDEP_ANCHORS		   /* Otherwise, depends on what's come before.	 */		|| at_begline_loc_p (pattern, p, syntax))	      BUF_PUSH (begline);	    else	      goto normal_char;	  }	  break;	case '$':	  {	    if (   /* If at end of pattern, it's an operator.  */		   p == pend		   /* If context independent, it's an operator.	 */		|| syntax & RE_CONTEXT_INDEP_ANCHORS		   /* Otherwise, depends on what's next.  */		|| at_endline_loc_p (p, pend, syntax))	       BUF_PUSH (endline);	     else	       goto normal_char;	   }	   break;	case '+':	case '?':	  if ((syntax & RE_BK_PLUS_QM)	      || (syntax & RE_LIMITED_OPS))	    goto normal_char;	handle_plus:	case '*':	  /* If there is no previous pattern... */	  if (!laststart)	    {	      if (syntax & RE_CONTEXT_INVALID_OPS)		FREE_STACK_RETURN (REG_BADRPT);	      else if (!(syntax & RE_CONTEXT_INDEP_OPS))		goto normal_char;	    }	  {	    /* Are we optimizing this jump?  */	    boolean keep_string_p = false;	    /* 1 means zero (many) matches is allowed.	*/	    char zero_times_ok = 0, many_times_ok = 0;	    /* If there is a sequence of repetition chars, collapse it	       down to just one (the right one).  We can't combine	       interval operators with these because of, e.g., `a{2}*',	       which should only match an even number of `a's.	*/	    for (;;)	      {		zero_times_ok |= c != '+';		many_times_ok |= c != '?';		if (p == pend)		  break;		PATFETCH (c);		if (c == '*'		    || (!(syntax & RE_BK_PLUS_QM) && (c == '+' || c == '?')))		  ;		else if (syntax & RE_BK_PLUS_QM	 &&  c == '\\')		  {		    if (p == pend) FREE_STACK_RETURN (REG_EESCAPE);		    PATFETCH (c1);		    if (!(c1 == '+' || c1 == '?'))		      {			PATUNFETCH;			PATUNFETCH;			break;		      }		    c = c1;		  }		else		  {		    PATUNFETCH;		    break;		  }		/* If we get here, we found another repeat character.  */	       }	    /* Star, etc. applied to an empty pattern is equivalent	       to an empty pattern.  */	    if (!laststart)	      break;	    /* Now we know whether or not zero matches is allowed	       and also whether or not two or more matches is allowed.	*/	    if (many_times_ok)	      { /* More than one repetition is allowed, so put in at the		   end a backward relative jump from `b' to before the next		   jump we're going to put in below (which jumps from		   laststart to after this jump).		   But if we are at the `*' in the exact sequence `.*\n',		   insert an unconditional jump backwards to the .,		   instead of the beginning of the loop.  This way we only		   push a failure point once, instead of every time		   through the loop.  */		assert (p - 1 > pattern);		/* Allocate the space for the jump.  */		GET_BUFFER_SPACE (3);		/* We know we are not at the first character of the pattern,		   because laststart was nonzero.  And we've already		   incremented `p', by the way, to be the character after		   the `*'.  Do we have to do something analogous here		   for null bytes, because of RE_DOT_NOT_NULL?	*/		if (TRANSLATE ((unsigned char)*(p - 2)) == TRANSLATE ('.')		    && zero_times_ok		    && p < pend		    && TRANSLATE ((unsigned char)*p) == TRANSLATE ('\n')		    && !(syntax & RE_DOT_NEWLINE))		  { /* We have .*\n.  */		    STORE_JUMP (jump, b, laststart);		    keep_string_p = true;		  }		else		  /* Anything else.  */		  STORE_JUMP (maybe_pop_jump, b, lasts

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -