⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 regularexp.c

📁 nedit 是一款linux下的开发源码的功能强大的编辑器
💻 C
📖 第 1 页 / 共 5 页
字号:
      Implements back references into a previously matched but separate regular
      expression.  This is used by syntax highlighting patterns. This node will
      attempt to match whatever text was most recently captured by the index'th
      set of parentheses of the separate regex passed to ExecRE. X_REGEX_BR_CI
      is the case insensitive version.

   POS_AHEAD_OPEN, NEG_AHEAD_OPEN, LOOK_AHEAD_CLOSE

      Operand(s): None

      Implements positive and negative look ahead.  Look ahead is an assertion
      that something is either there or not there.   Once this is determined the
      regex engine backtracks to where it was just before the look ahead was
      encountered, i.e. look ahead is a zero width assertion.

   POS_BEHIND_OPEN, NEG_BEHIND_OPEN, LOOK_BEHIND_CLOSE

      Operand(s): 2x2 bytes for OPEN (match boundaries), None for CLOSE

      Implements positive and negative look behind.  Look behind is an assertion
      that something is either there or not there in front of the current
      position.  Look behind is a zero width assertion, with the additional
      constraint that it must have a bounded length (for complexity and
      efficiency reasons; note that most other implementations even impose
      fixed length).

   OPEN, CLOSE

      Operand(s): None

      OPEN  + n = Start of parenthesis 'n', CLOSE + n = Close of parenthesis
      'n', and are numbered at compile time.
*/

/* A node is one char of opcode followed by two chars of NEXT pointer plus
 * any operands.  NEXT pointers are stored as two 8-bit pieces, high order
 * first.  The value is a positive offset from the opcode of the node
 * containing it.  An operand, if any, simply follows the node.  (Note that
 * much of the code generation knows about this implicit relationship.)
 *
 * Using two bytes for NEXT_PTR_SIZE is vast overkill for most things,
 * but allows patterns to get big without disasters.
*/

/* Sizes, in bytes, of the pieces that make up a compiled node. */

#define OP_CODE_SIZE    1
#define NEXT_PTR_SIZE   2
#define INDEX_SIZE      1
#define LENGTH_SIZE	4
#define NODE_SIZE       (NEXT_PTR_SIZE + OP_CODE_SIZE)

/* Accessors for a node at address `p'.  The opcode is the first byte, the
   NEXT offset is read from bytes 1 and 2 (high order byte first) and the
   operand starts NODE_SIZE bytes in.  PUT_OFFSET_L/PUT_OFFSET_R yield the
   high and low byte of a value `v' for storing such an offset. */

#define GET_OP_CODE(p)  (*(unsigned char *)(p))
#define OPERAND(p)      ((p) + NODE_SIZE)
#define GET_OFFSET(p)   ((( *((p) + 1) & 0377) << 8) + (( *((p) + 2)) & 0377))
#define PUT_OFFSET_L(v) (unsigned char)(((v) >> 8) & 0377)
#define PUT_OFFSET_R(v) (unsigned char) ((v)       & 0377)

/* GET_LOWER reads a two byte value (high order byte first) from the first
   two operand bytes of node `p'; GET_UPPER reads one from the next two. */

#define GET_LOWER(p)    ((( *((p) + NODE_SIZE) & 0377) << 8) + \
                         (( *((p) + NODE_SIZE+1)) & 0377))
#define GET_UPPER(p)    ((( *((p) + NODE_SIZE+2) & 0377) << 8) + \
                         (( *((p) + NODE_SIZE+3)) & 0377))

/* Utility definitions. */

/* REG_FAIL stores message `m' through Error_Ptr and returns NULL from the
   function in which it is expanded. */

#define REG_FAIL(m)      {*Error_Ptr = (m); return (NULL);}
#define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || \
                          (c) == '?' || (c) == Brace_Char)

/* Bit flag helpers; `n' is 1-based, i.e. n == 1 addresses the lowest bit. */

#define SET_BIT(i,n)     ((i) |= (1 << ((n) - 1)))
#define TEST_BIT(i,n)    ((i) &  (1 << ((n) - 1)))
#define U_CHAR_AT(p)     ((unsigned int) *(unsigned char *)(p))

/* Flags to be passed up and down via function parameters during compile. */

#define WORST             0  /* Worst case. No assumptions can be made.*/
#define HAS_WIDTH         1  /* Known never to match null string. */
#define SIMPLE            2  /* Simple enough to be STAR/PLUS operand. */

#define NO_PAREN          0  /* Only set by initial call to "chunk". */
#define PAREN             1  /* Used for normal capturing parentheses. */
#define NO_CAPTURE        2  /* Non-capturing parentheses (grouping only). */
#define INSENSITIVE       3  /* Case insensitive parenthetical construct */
#define SENSITIVE         4  /* Case sensitive parenthetical construct */
#define NEWLINE           5  /* Construct to match newlines in most cases */
#define NO_NEWLINE        6  /* Construct to match newlines normally */

#define REG_INFINITY    0UL
#define REG_ZERO        0UL
#define REG_ONE         1UL

/* Flags for function shortcut_escape() */

#define CHECK_ESCAPE       0  /* Check an escape sequence for validity only. */
#define CHECK_CLASS_ESCAPE 1  /* Check the validity of an escape within a
                                 character class */
#define EMIT_CLASS_BYTES   2  /* Emit equivalent character class bytes,
                                 e.g \d=0123456789 */
#define EMIT_NODE          3  /* Emit the appropriate node. */

/* Array sizes for arrays used by function init_ansi_classes. */

#define WHITE_SPACE_SIZE   16
#define ALNUM_CHAR_SIZE   256

/* Number of bytes to offset from the beginning of the regex program to the start
   of the actual compiled regex code, i.e. skipping over the MAGIC number and
   the two counters at the front.  */

#define REGEX_START_OFFSET 3

#define MAX_COMPILED_SIZE  32767UL  /* Largest size a compiled regex can be.
                                       Probably could be 65535UL. */

/* Global work variables for `CompileRE'. */

static unsigned char *Reg_Parse;       /* Input scan ptr (scans user's regex) */
static int            Total_Paren;     /* Parentheses, (),  counter. */
static int            Num_Braces;      /* Number of general {m,n} constructs.
                                          {m,n} quantifiers of SIMPLE atoms are
                                          not included in this count. */
static int            Closed_Parens;   /* Bit flags indicating () closure. */
static int            Paren_Has_Width; /* Bit flags indicating ()'s that are
                                          known to not match the empty string */
static unsigned char  Compute_Size;    /* Address of this used as flag. */
static unsigned char *Code_Emit_Ptr;   /* When Code_Emit_Ptr is set to
                                          &Compute_Size no code is emitted.
                                          Instead, the size of code that WOULD
                                          have been generated is accumulated in
                                          Reg_Size.  Otherwise, Code_Emit_Ptr
                                          points to where compiled regex code is
                                          to be written. */
static unsigned long  Reg_Size;        /* Size of compiled regex code. */
static char         **Error_Ptr;       /* Place to store error messages so
                                          they can be returned by `CompileRE' */
static char           Error_Text [128];/* String to build error messages in. */

static unsigned char  White_Space [WHITE_SPACE_SIZE]; /* Arrays used by       */
static unsigned char  Word_Char   [ALNUM_CHAR_SIZE];  /* functions            */
static unsigned char  Letter_Char [ALNUM_CHAR_SIZE];  /* init_ansi_classes () */
                                                      /* and
                                                         shortcut_escape ().  */

static unsigned char  ASCII_Digits [] = "0123456789"; /* Same for all */
                                                      /* locales.     */

/* Compile-time defaults; `CompileRE' resets both at the start of each pass. */
static int            Is_Case_Insensitive;
static int            Match_Newline;

/* When Enable_Counting_Quantifier is non-zero `CompileRE' sets Brace_Char to
   '{' and points Meta_Char at the start of Default_Meta_Char; otherwise
   Brace_Char is '*' and Meta_Char skips the leading '{'. */
static int            Enable_Counting_Quantifier = 1;
static unsigned char  Brace_Char;
static unsigned char  Default_Meta_Char [] = "{.*+?[(|)^<>$";
static unsigned char *Meta_Char;

/* Lower/upper pair handed between the compile functions via `range_param'. */
typedef struct { long lower; long upper; } len_range;

/* Forward declarations for functions used by `CompileRE'.
*/static unsigned char * alternative     (int *flag_param, len_range *range_param);static unsigned char * back_ref        (unsigned char *c, int *flag_param,                                        int emit);static unsigned char * chunk           (int paren, int *flag_param, len_range *range_param);static void            emit_byte       (unsigned char c);static void            emit_class_byte (unsigned char c);static unsigned char * emit_node       (int op_code);static unsigned char * emit_special    (unsigned char op_code,                                        unsigned long test_val,                                        int index);static unsigned char   literal_escape  (unsigned char c);static unsigned char   numeric_escape  (unsigned char c, unsigned char **parse);static unsigned char * atom            (int *flag_param, len_range *range_param);static void            reg_error       (char *str);static unsigned char * insert          (unsigned char op, unsigned char *opnd,                                        long min, long max, int index);static unsigned char * next_ptr        (unsigned char *ptr);static void            offset_tail     (unsigned char *ptr, int offset,                                        unsigned char *val);static void            branch_tail     (unsigned char *ptr, int offset,                                        unsigned char *val);static unsigned char * piece           (int *flag_param, len_range *range_param);static void            tail            (unsigned char *search_from,                                        unsigned char *point_t);static unsigned char * shortcut_escape (unsigned char c, int *flag_param,                                        int emit);static int             init_ansi_classes  (void);/*----------------------------------------------------------------------* * CompileRE * * Compiles a regular expression into the internal format used by * `ExecRE'. * * The default behaviour wrt. 
case sensitivity and newline matching can * be controlled through the defaultFlags argument (Markus Schwarzenberg).  * Future extensions are possible by using other flag bits. * Note that currently only the case sensitivity flag is effectively used. * * Beware that the optimization and preparation code in here knows about * some of the structure of the compiled regexp. *----------------------------------------------------------------------*/regexp * CompileRE (const char *exp, char **errorText, int defaultFlags) {   register                regexp *comp_regex = NULL;   register unsigned char *scan;                     int   flags_local, pass;	 	     len_range range_local;   if (Enable_Counting_Quantifier) {      Brace_Char  = '{';      Meta_Char   = &Default_Meta_Char [0];   } else {      Brace_Char  = '*';                    /* Bypass the '{' in */      Meta_Char   = &Default_Meta_Char [1]; /* Default_Meta_Char */   }   /* Set up errorText to receive failure reports. */    Error_Ptr = errorText;   *Error_Ptr = "";   if (exp == NULL) REG_FAIL ("NULL argument, `CompileRE\'");   /* Initialize arrays used by function `shortcut_escape'. */   if (!init_ansi_classes ()) REG_FAIL ("internal error #1, `CompileRE\'");   Code_Emit_Ptr = &Compute_Size;   Reg_Size      = 0UL;   /* We can't allocate space until we know how big the compiled form will be,      but we can't compile it (and thus know how big it is) until we've got a      place to put the code.  So we cheat: we compile it twice, once with code      generation turned off and size counting turned on, and once "for real".      This also means that we don't allocate space until we are sure that the      thing really will compile successfully, and we never have to move the      code and thus invalidate pointers into it.  (Note that it has to be in      one piece because free() must be able to free it all.) 
*/   for (pass = 1; pass <= 2; pass++) {      /*-------------------------------------------*       * FIRST  PASS: Determine size and legality. *       * SECOND PASS: Emit code.                   *       *-------------------------------------------*/      /*  Schwarzenberg:       *  If defaultFlags = 0 use standard defaults:       *    Is_Case_Insensitive: Case sensitive is the default       *    Match_Newline:       Newlines are NOT matched by default        *                         in character classes         */      Is_Case_Insensitive = ((defaultFlags & REDFLT_CASE_INSENSITIVE) ? 1 : 0);      Match_Newline = 0;  /* ((defaultFlags & REDFLT_MATCH_NEWLINE)   ? 1 : 0);                              Currently not used. Uncomment if needed. */      Reg_Parse       = (unsigned char *) exp;      Total_Paren     = 1;      Num_Braces      = 0;      Closed_Parens   = 0;      Paren_Has_Width = 0;      emit_byte (MAGIC);      emit_byte ('%');  /* Placeholder for num of capturing parentheses.    */      emit_byte ('%');  /* Placeholder for num of general {m,n} constructs. */      if (chunk (NO_PAREN, &flags_local, &range_local) == NULL) 	  return (NULL); /* Something went wrong */      if (pass == 1) {         if (Reg_Size >= MAX_COMPILED_SIZE) {            /* Too big for NEXT pointers NEXT_PTR_SIZE bytes long to span.               This is a real issue since the first BRANCH node usually points               to the end of the compiled regex code. */            sprintf  (Error_Text, "regexp > %lu bytes", MAX_COMPILED_SIZE);            REG_FAIL (Error_Text);         }         /* Allocate memory. 
*/         comp_regex = (regexp *) malloc (sizeof (regexp) + Reg_Size);         if (comp_regex == NULL) REG_FAIL ("out of memory in `CompileRE\'");         Code_Emit_Ptr = (unsigned char *) comp_regex->program;      }   }   comp_regex->program [1] = (unsigned char) Total_Paren - 1;   comp_regex->program [2] = (unsigned char) Num_Braces;   /*----------------------------------------*    * Dig out information for optimizations. *    *----------------------------------------*/   comp_regex->match_start = '\0';   /* Worst-case defaults. */   comp_regex->anchor      =   0;   /* First BRANCH. */   scan = (unsigned char *) (comp_regex->program + REGEX_START_OFFSET);   if (GET_OP_CODE (next_ptr (scan)) == END) { /* Only one top-level choice. */      scan = OPERAND (scan);      /* Starting-point info. */      if (GET_OP_CODE (scan) == EXACTLY) {         comp_regex->match_start = *OPERAND (scan);      } else if (PLUS <= GET_OP_CODE (scan) &&                         GET_OP_CODE (scan) <= LAZY_PLUS) {         /* Allow x+ or x+? at the start of the regex to be            optimized. */         if (GET_OP_CODE (scan + NODE_SIZE) == EXACTLY) {            comp_regex->match_start = *OPERAND (scan + NODE_SIZE);         }      } else if (GET_OP_CODE (scan) == BOL) {         comp_regex->anchor++;      }   }   return (comp_regex);}/*----------------------------------------------------------------------* * chunk                                                                * *                                                                      *

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -