📄 regex.c
字号:
/* * $Id: regex.c,v 1.3 2000/05/07 12:26:11 fnevgeny Exp $ * * Copyright (c) 1991 HaL Computer Systems, Inc. All rights reserved. * * HAL COMPUTER SYSTEMS INTERNATIONAL, LTD. * 1315 Dell Avenue * Campbell, CA 95008 * * Author: Greg Hilton * Contributors: Tom Lang, Frank Bieser, and others * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * http://www.gnu.org/copyleft/gpl.html * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */#include <config.h>/*regex.cAuthor: Tatu Ylonen <ylo@ngs.fi>Copyright (c) 1991 Tatu Ylonen, Espoo, FinlandPermission to use, copy, modify, distribute, and sell this softwareand its documentation for any purpose is hereby granted without fee,provided that the above copyright notice appear in all copies. Thissoftware is provided "as is" without express or implied warranty.Created: Thu Sep 26 17:14:05 1991 yloLast modified: Mon Nov 4 17:06:48 1991 yloThis code draws many ideas from the regular expression packages byHenry Spencer of the University of Toronto and Richard Stallman of theFree Software Foundation.Emacs-specific code and syntax table code is almost directly borrowedfrom GNU regexp.*/#include <stdio.h>#include <string.h>#include <assert.h>#include "regex.h"char *malloc();void free();char *realloc();#define MACRO_BEGIN do {#define MACRO_END } while (0)enum regexp_compiled_ops /* opcodes for compiled regexp */{ Cend, /* end of pattern reached */ Cbol, /* beginning of line */ Ceol, /* end of line */ Cset, /* character set. Followed by 32 bytes of set. */ Cexact, /* followed by a byte to match */ Canychar, /* matches any character except newline */ Cstart_memory, /* set register start addr (followed by reg number) */ Cend_memory, /* set register end addr (followed by reg number) */ Cmatch_memory, /* match a duplicate of reg contents (regnum follows)*/ Cjump, /* followed by two bytes (lsb,msb) of displacement. */ Cstar_jump, /* will change to jump/update_failure_jump at runtime */ Cfailure_jump, /* jump to addr on failure */ Cupdate_failure_jump, /* update topmost failure point and jump */ Cdummy_failure_jump, /* push a dummy failure point and jump */ Cbegbuf, /* match at beginning of buffer */ Cendbuf, /* match at end of buffer */ Cwordbeg, /* match at beginning of word */ Cwordend, /* match at end of word */ Cwordbound, /* match if at word boundary */ Cnotwordbound, /* match if not at word boundary */#ifdef emacs Cemacs_at_dot, /* emacs only: matches at dot */#endif /* emacs */ Csyntaxspec, /* matches syntax code (1 byte follows) */ Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/};enum regexp_syntax_op /* syntax codes for plain and quoted characters */{ Rend, /* special code for end of regexp */ Rnormal, /* normal character */ Ranychar, /* any character except newline */ Rquote, /* the quote character */ Rbol, /* match beginning of line */ Reol, /* match end of line */ Roptional, /* match preceding expression optionally */ Rstar, /* match preceding expr zero or more times */ Rplus, /* match preceding expr one or more times */ Ror, /* match either of alternatives */ Ropenpar, /* opening parenthesis */ Rclosepar, /* closing parenthesis */ Rmemory, /* match memory register */ Rextended_memory, /* \vnn to match registers 10-99 */ Ropenset, /* open set. Internal syntax hard-coded below. */ /* the following are gnu extensions to "normal" regexp syntax */ Rbegbuf, /* beginning of buffer */ Rendbuf, /* end of buffer */ Rwordchar, /* word character */ Rnotwordchar, /* not word character */ Rwordbeg, /* beginning of word */ Rwordend, /* end of word */ Rwordbound, /* word bound */ Rnotwordbound, /* not word bound */#ifdef emacs Remacs_at_dot, /* emacs: at dot */ Remacs_syntaxspec, /* syntaxspec */ Remacs_notsyntaxspec, /* notsyntaxspec */#endif /* emacs */ Rnum_ops};static int hre_compile_initialized = 0;static int regexp_syntax = 0;static unsigned char regexp_plain_ops[256];static unsigned char regexp_quoted_ops[256];static unsigned char regexp_precedences[Rnum_ops];static int regexp_context_indep_ops;static int regexp_ansi_sequences;#define NUM_LEVELS 5 /* number of precedence levels in use */#define MAX_NESTING 100 /* max nesting level of operators */#ifdef emacs/* This code is for emacs compatibility only. */#include "config.h"#include "lisp.h"#include "buffer.h"#include "syntax.h"/* emacs defines NULL in some strange way? */#undef NULL#define NULL 0#else /* emacs */#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]#define Sword 1#ifdef SYNTAX_TABLEchar *re_syntax_table;#elsestatic char re_syntax_table[256];#endif /* SYNTAX_TABLE */#endif /* emacs */static void hre_compile_initialize(){ int a; #if !defined(emacs) && !defined(SYNTAX_TABLE) static int syntax_table_inited = 0; if (!syntax_table_inited) { syntax_table_inited = 1; memset(re_syntax_table, 0, 256); for (a = 'a'; a <= 'z'; a++) re_syntax_table[a] = Sword; for (a = 'A'; a <= 'Z'; a++) re_syntax_table[a] = Sword; for (a = '0'; a <= '9'; a++) re_syntax_table[a] = Sword; }#endif /* !emacs && !SYNTAX_TABLE */ hre_compile_initialized = 1; for (a = 0; a < 256; a++) { regexp_plain_ops[a] = Rnormal; regexp_quoted_ops[a] = Rnormal; } for (a = '0'; a <= '9'; a++) regexp_quoted_ops[a] = Rmemory; regexp_plain_ops['\134'] = Rquote; if (regexp_syntax & RE_NO_BK_PARENS) { regexp_plain_ops['('] = Ropenpar; regexp_plain_ops[')'] = Rclosepar; } else { regexp_quoted_ops['('] = Ropenpar; regexp_quoted_ops[')'] = Rclosepar; } if (regexp_syntax & RE_NO_BK_VBAR) regexp_plain_ops['\174'] = Ror; else regexp_quoted_ops['\174'] = Ror; regexp_plain_ops['*'] = Rstar; if (regexp_syntax & RE_BK_PLUS_QM) { regexp_quoted_ops['+'] = Rplus; regexp_quoted_ops['?'] = Roptional; } else { regexp_plain_ops['+'] = Rplus; regexp_plain_ops['?'] = Roptional; } if (regexp_syntax & RE_NEWLINE_OR) regexp_plain_ops['\n'] = Ror; regexp_plain_ops['\133'] = Ropenset; regexp_plain_ops['\136'] = Rbol; regexp_plain_ops['$'] = Reol; regexp_plain_ops['.'] = Ranychar; if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS)) {#ifdef emacs regexp_quoted_ops['='] = Remacs_at_dot; regexp_quoted_ops['s'] = Remacs_syntaxspec; regexp_quoted_ops['S'] = Remacs_notsyntaxspec;#endif /* emacs */ regexp_quoted_ops['w'] = Rwordchar; regexp_quoted_ops['W'] = Rnotwordchar; regexp_quoted_ops['<'] = Rwordbeg; regexp_quoted_ops['>'] = Rwordend; regexp_quoted_ops['b'] = Rwordbound; regexp_quoted_ops['B'] = Rnotwordbound; regexp_quoted_ops['`'] = Rbegbuf; regexp_quoted_ops['\''] = Rendbuf; } if (regexp_syntax & RE_ANSI_HEX) regexp_quoted_ops['v'] = Rextended_memory; for (a = 0; a < Rnum_ops; a++) regexp_precedences[a] = 4; if (regexp_syntax & RE_TIGHT_VBAR) { regexp_precedences[Ror] = 3; regexp_precedences[Rbol] = 2; regexp_precedences[Reol] = 2; } else { regexp_precedences[Ror] = 2; regexp_precedences[Rbol] = 3; regexp_precedences[Reol] = 3; } regexp_precedences[Rclosepar] = 1; regexp_precedences[Rend] = 0; regexp_context_indep_ops = (regexp_syntax & RE_CONTEXT_INDEP_OPS) != 0; regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;}int hre_set_syntax(syntax)int syntax;{ int ret; ret = regexp_syntax; regexp_syntax = syntax; hre_compile_initialize(); return ret;}static int hex_char_to_decimal(ch)int ch;{ if (ch >= '0' && ch <= '9') return ch - '0'; if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10; if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10; return 16;}char *hre_compile_pattern(regex, size, bufp)char *regex;int size;regexp_t bufp;{ int a, pos, op, current_level, level, opcode; int pattern_offset, alloc; int starts[NUM_LEVELS * MAX_NESTING], starts_base; int future_jumps[MAX_NESTING], num_jumps; unsigned char ch; char *pattern, *translate; int next_register, paren_depth, num_open_registers, open_registers[RE_NREGS]; int beginning_context;#define NEXTCHAR(var) \ MACRO_BEGIN \ if (pos >= size) \ goto ends_prematurely; \ (var) = regex[pos]; \ pos++; \ MACRO_END#define ALLOC(amount) \ MACRO_BEGIN \ if (pattern_offset+(amount) > alloc) \ { \ alloc += 256 + (amount); \ pattern = realloc(pattern, alloc); \ if (!pattern) \ goto out_of_memory; \ } \ MACRO_END#define STORE(ch) pattern[pattern_offset++] = (ch)#define CURRENT_LEVEL_START (starts[starts_base + current_level])#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset#define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \ starts_base += NUM_LEVELS; \ else \ goto too_complex#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS#define PUT_ADDR(offset,addr) \ MACRO_BEGIN \ int disp = (addr) - (offset) - 2; \ pattern[(offset)] = disp & 0xff; \ pattern[(offset)+1] = (disp>>8) & 0xff; \ MACRO_END#define INSERT_JUMP(pos,type,addr) \ MACRO_BEGIN \ int a, p = (pos), t = (type), ad = (addr); \ for (a = pattern_offset - 1; a >= p; a--) \ pattern[a + 3] = pattern[a]; \ pattern[p] = t; \ PUT_ADDR(p+1,ad); \ pattern_offset += 3; \ MACRO_END#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))#define SET_FIELDS \ MACRO_BEGIN \ bufp->allocated = alloc; \ bufp->buffer = pattern; \ bufp->used = pattern_offset; \ MACRO_END #define GETHEX(var) \ MACRO_BEGIN \ char gethex_ch, gethex_value; \ NEXTCHAR(gethex_ch); \ gethex_value = hex_char_to_decimal(gethex_ch); \ if (gethex_value == 16) \ goto hex_error; \ NEXTCHAR(gethex_ch); \ gethex_ch = hex_char_to_decimal(gethex_ch); \ if (gethex_ch == 16) \ goto hex_error; \ (var) = gethex_value * 16 + gethex_ch; \ MACRO_END#define ANSI_TRANSLATE(ch) \ MACRO_BEGIN \ switch (ch) \ { \ case 'a': \ case 'A': \ ch = 7; /* audible bell */ \ break; \ case 'b': \ case 'B': \ ch = 8; /* backspace */ \ break; \ case 'f': \ case 'F': \ ch = 12; /* form feed */ \ break; \ case 'n': \ case 'N': \ ch = 10; /* line feed */ \ break; \ case 'r': \ case 'R': \ ch = 13; /* carriage return */ \ break; \ case 't': \ case 'T': \ ch = 9; /* tab */ \ break; \ case 'v': \ case 'V': \ ch = 11; /* vertical tab */ \ break; \ case 'x': /* hex code */ \ case 'X': \ GETHEX(ch); \ break; \ default: \ /* other characters passed through */ \ if (translate) \ ch = translate[(unsigned char)ch]; \ break; \ } \ MACRO_END if (!hre_compile_initialized) hre_compile_initialize(); bufp->used = 0; bufp->fastmap_accurate = 0; bufp->uses_registers = 0; translate = bufp->translate; pattern = bufp->buffer; alloc = bufp->allocated; if (alloc == 0 || pattern == NULL) { alloc = 256; pattern = malloc(alloc); if (!pattern) goto out_of_memory; } pattern_offset = 0; starts_base = 0; num_jumps = 0; current_level = 0; SET_LEVEL_START; num_open_registers = 0; next_register = 1; paren_depth = 0; beginning_context = 1; op = -1; /* we use Rend dummy to ensure that pending jumps are updated (due to low priority of Rend) before exiting the loop. */ pos = 0; while (op != Rend) { if (pos >= size) op = Rend; else { NEXTCHAR(ch); if (translate) ch = translate[(unsigned char)ch]; op = regexp_plain_ops[(unsigned char)ch]; if (op == Rquote) { NEXTCHAR(ch); op = regexp_quoted_ops[(unsigned char)ch]; if (op == Rnormal && regexp_ansi_sequences) ANSI_TRANSLATE(ch); } } level = regexp_precedences[op]; /* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n", ch, op, level, current_level, CURRENT_LEVEL_START); */ if (level > current_level) { for (current_level++; current_level < level; current_level++) SET_LEVEL_START; SET_LEVEL_START; } else if (level < current_level) { current_level = level; for (;num_jumps > 0 && future_jumps[num_jumps-1] >= CURRENT_LEVEL_START; num_jumps--) PUT_ADDR(future_jumps[num_jumps-1], pattern_offset); } switch (op) { case Rend: break; case Rnormal: normal_char: opcode = Cexact; store_opcode_and_arg: /* opcode & ch must be set */ SET_LEVEL_START; ALLOC(2); STORE(opcode); STORE(ch); break; case Ranychar: opcode = Canychar; store_opcode: SET_LEVEL_START; ALLOC(1); STORE(opcode); break; case Rquote: abort(); /*NOTREACHED*/ case Rbol: if (!beginning_context) if (regexp_context_indep_ops) goto op_error; else goto normal_char; opcode = Cbol; goto store_opcode; case Reol: if (!((pos >= size) || ((regexp_syntax & RE_NO_BK_VBAR) ? (regex[pos] == '\174') : (pos+1 < size && regex[pos] == '\134' && regex[pos+1] == '\174')) || ((regexp_syntax & RE_NO_BK_PARENS)? (regex[pos] == ')'): (pos+1 < size && regex[pos] == '\134' && regex[pos+1] == ')')))) if (regexp_context_indep_ops) goto op_error; else goto normal_char; opcode = Ceol; goto store_opcode; break; case Roptional: if (beginning_context) if (regexp_context_indep_ops) goto op_error; else goto normal_char; if (CURRENT_LEVEL_START == pattern_offset) break; /* ignore empty patterns for ? */ ALLOC(3); INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 3); break; case Rstar: case Rplus: if (beginning_context) if (regexp_context_indep_ops) goto op_error; else goto normal_char; if (CURRENT_LEVEL_START == pattern_offset) break; /* ignore empty patterns for + and * */ ALLOC(9); INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6); INSERT_JUMP(pattern_offset, Cstar_jump, CURRENT_LEVEL_START); if (op == Rplus) /* jump over initial failure_jump */ INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump, CURRENT_LEVEL_START + 6); break; case Ror: ALLOC(6); INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump, pattern_offset + 6); if (num_jumps >= MAX_NESTING) goto too_complex; STORE(Cjump); future_jumps[num_jumps++] = pattern_offset; STORE(0); STORE(0); SET_LEVEL_START;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -