📄 mbregex.c
字号:
/* Extended regular expression matching and search library. Copyright (C) 1993, 94, 95, 96, 97, 98 Free Software Foundation, Inc. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with the GNU C Library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *//* Multi-byte extension added May, 1993 by t^2 (Takahiro Tanimoto) Last change: May 21, 1993 by t^2 *//* removed gapped buffer support, multiple syntax support by matz <matz@nts.co.jp> *//* Perl5 extension added by matz <matz@caelum.co.jp> *//* UTF-8 extension added Jan 16 1999 by Yoshida Masato <yoshidam@tau.bekkoame.ne.jp> */#include "php.h"#ifdef HAVE_CONFIG_H#include "config.h"#endif#if HAVE_MBREGEX#define re_compile_pattern mbre_compile_pattern#define re_free_pattern mbre_free_pattern#define re_adjust_startpos mbre_adjust_startpos#define re_compile_fastmap mbre_compile_fastmap#define re_search mbre_search#define re_match mbre_match#define re_set_casetable mbre_set_casetable#define re_copy_registers mbre_copy_registers#define re_free_registers mbre_free_registers#ifdef HAVE_STRING_H# include <string.h>#else# include <strings.h>#endif/* We write fatal error messages on standard error. */#include <stdio.h>/* isalpha(3) etc. are used for the character classes. 
*/#include <ctype.h>#include <sys/types.h>#ifndef PARAMS# if defined __GNUC__ || (defined __STDC__ && __STDC__)# define PARAMS(args) args# else# define PARAMS(args) ()# endif /* GCC. */#endif /* Not PARAMS. */#if defined(STDC_HEADERS)# include <stddef.h>#else/* We need this for `regex.h', and perhaps for the Emacs include files. */# include <sys/types.h>#endif#ifndef __STDC__# define volatile#endif#ifdef HAVE_PROTOTYPES# define _(args) args#else# define _(args) ()#endif#ifdef RUBY_PLATFORM#include "defines.h"# define RUBYextern int rb_prohibit_interrupt;extern int rb_trap_pending;void rb_trap_exec _((void));# define CHECK_INTS if (!rb_prohibit_interrupt) {\ if (rb_trap_pending) rb_trap_exec();\}#define xmalloc ruby_xmalloc#define xcalloc ruby_xcalloc#define xrealloc ruby_xrealloc#define xfree ruby_xfreevoid *xmalloc _((size_t));void *xcalloc _((size_t,size_t));void *xrealloc _((void*,size_t));void xfree _((void*));#endif#define xmalloc emalloc#define xcalloc ecalloc#define xrealloc erealloc#define xfree efree/* Make alloca work the best possible way. 
*/
/* Pick an alloca implementation: with GCC (except on atarist) map alloca to
   __builtin_alloca unless it is already a macro; otherwise include
   <alloca.h> when HAVE_ALLOCA_H is set, or fall back to declaring alloca()
   when no macro of that name exists.  */
#ifdef __GNUC__
# ifndef atarist
# ifndef alloca
# define alloca __builtin_alloca
# endif
# endif /* atarist */
#else
# if defined(HAVE_ALLOCA_H)
# include <alloca.h>
# elif !defined(alloca)
char *alloca();
# endif
#endif /* __GNUC__ */
#ifdef _AIX
#pragma alloca
#endif
#ifdef HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
/* FREE_VARIABLES() expands to alloca(0) when C_ALLOCA is defined and to
   nothing otherwise.
   NOTE(review): presumably alloca(0) lets an emulated alloca reclaim its
   storage -- confirm against the C_ALLOCA implementation in use.  */
#ifdef C_ALLOCA
#define FREE_VARIABLES() alloca(0)
#else
#define FREE_VARIABLES()
#endif
/* Leave the enclosing (void) function: run FREE_VARIABLES(), release
   `stackb' with xfree() unless it is still the `stacka' buffer, then
   return.  A variable named `stacka' must be in scope where this
   expands.  */
#define FREE_AND_RETURN_VOID(stackb) do { \
  FREE_VARIABLES(); \
  if (stackb != stacka) xfree(stackb); \
  return; \
} while(0)
/* Same as FREE_AND_RETURN_VOID, but returns `val' from the enclosing
   function.  */
#define FREE_AND_RETURN(stackb,val) do { \
  FREE_VARIABLES(); \
  if (stackb != stacka) xfree(stackb); \
  return(val); \
} while(0)
/* Double the capacity of the stack of `type' elements delimited by
   `stackb' (base) and `stacke' (end).  While the stack is still the
   `stacka' buffer its contents are copied into a fresh xmalloc() block;
   afterwards the existing block is grown with xrealloc().  `stackp',
   `stackb' and `stacke' are then rebased onto the new block.  All four
   names must be in scope where this expands.  */
#define DOUBLE_STACK(type) do { \
  type *stackx; \
  unsigned int xlen = stacke - stackb; \
  if (stackb == stacka) { \
    stackx = (type*)xmalloc(2 * xlen * sizeof(type)); \
    memcpy(stackx, stackb, xlen * sizeof (type)); \
  } \
  else { \
    stackx = (type*)xrealloc(stackb, 2 * xlen * sizeof(type)); \
  } \
  /* Rearrange the pointers. */ \
  stackp = stackx + (stackp - stackb); \
  stackb = stackx; \
  stacke = stackb + 2 * xlen; \
} while (0)
/* Typed allocation helpers: RE_TALLOC uses alloca(), TMALLOC uses
   xmalloc(), and TREALLOC resizes `s' with xrealloc() and assigns the
   result back to `s'.  Each takes an element count `n' and element
   type `t'.  */
#define RE_TALLOC(n,t) ((t*)alloca((n)*sizeof(t)))
#define TMALLOC(n,t) ((t*)xmalloc((n)*sizeof(t)))
#define TREALLOC(s,n,t) (s=((t*)xrealloc(s,(n)*sizeof(t))))
/* The failure stack holds `unsigned char*' entries.  */
#define EXPAND_FAIL_STACK() DOUBLE_STACK(unsigned char*)
/* Grow the failure stack when `stacke - stackp <= (n)'.  The upper-bound
   check on the stack size is commented out, so growth is not capped
   here.  */
#define ENSURE_FAIL_STACK(n) \
  do { \
    if (stacke - stackp <= (n)) { \
      /* if (len > re_max_failures * MAX_NUM_FAILURE_ITEMS) \
      { \
      FREE_AND_RETURN(stackb,(-2)); \
      }*/ \
      \
      /* Roughly double the size of the stack. */ \
      EXPAND_FAIL_STACK(); \
    } \
  } while (0)
/* Get the interface, including the syntax bits. */
#include "mbregex.h"
/* Subroutines for re_compile_pattern. 
*/
/* Forward declarations.  The parameter lists are wrapped in `_()', which
   keeps them only when HAVE_PROTOTYPES is defined.  */
static void store_jump _((char*, int, char*));
static void insert_jump _((int, char*, char*, char*));
static void store_jump_n _((char*, int, char*, unsigned));
static void insert_jump_n _((int, char*, char*, char*, unsigned));
#if 0
static void insert_op _((int, char*, char*));
#endif
static void insert_op_2 _((int, char*, char*, int, int));
static int memcmp_translate _((unsigned char*, unsigned char*, int, const unsigned char*));
static const unsigned char* re_mbctab_get _((int));
/* Define the syntax stuff, so we can do the \<, \>, etc. */
/* This must be nonzero for the wordchar and notwordchar pattern commands in re_match. */
/* init_syntax_once() stores Sword for bytes 0x00-0x7f that isalnum()
   accepts and for '_', and Sword2 for bytes 0x80-0xff that isalnum()
   accepts; every other entry of re_syntax_table stays 0.  */
#define Sword 1
#define Sword2 2
/* Syntax class of byte `c'; `c' is used directly as the array index.  */
#define SYNTAX(c) re_syntax_table[c]
/* Default translation table, indexed by byte value.  Every byte maps to
   itself except 'A'..'Z' (0101-0132), which map to 'a'..'z' (0141-0172).
   It is the initial value of `translate'.  */
static const char casetable[] = {
  '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007',
  '\010', '\011', '\012', '\013', '\014', '\015', '\016', '\017',
  '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027',
  '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037',
  /* ' ' '!' '"' '#' '$' '%' '&' ''' */
  '\040', '\041', '\042', '\043', '\044', '\045', '\046', '\047',
  /* '(' ')' '*' '+' ',' '-' '.' '/' */
  '\050', '\051', '\052', '\053', '\054', '\055', '\056', '\057',
  /* '0' '1' '2' '3' '4' '5' '6' '7' */
  '\060', '\061', '\062', '\063', '\064', '\065', '\066', '\067',
  /* '8' '9' ':' ';' '<' '=' '>' '?' 
  */
  '\070', '\071', '\072', '\073', '\074', '\075', '\076', '\077',
  /* '@' 'A' 'B' 'C' 'D' 'E' 'F' 'G' */
  '\100', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  /* 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' */
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  /* 'P' 'Q' 'R' 'S' 'T' 'U' 'V' 'W' */
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  /* 'X' 'Y' 'Z' '[' '\' ']' '^' '_' */
  '\170', '\171', '\172', '\133', '\134', '\135', '\136', '\137',
  /* '`' 'a' 'b' 'c' 'd' 'e' 'f' 'g' */
  '\140', '\141', '\142', '\143', '\144', '\145', '\146', '\147',
  /* 'h' 'i' 'j' 'k' 'l' 'm' 'n' 'o' */
  '\150', '\151', '\152', '\153', '\154', '\155', '\156', '\157',
  /* 'p' 'q' 'r' 's' 't' 'u' 'v' 'w' */
  '\160', '\161', '\162', '\163', '\164', '\165', '\166', '\167',
  /* 'x' 'y' 'z' '{' '|' '}' '~' */
  '\170', '\171', '\172', '\173', '\174', '\175', '\176', '\177',
  '\200', '\201', '\202', '\203', '\204', '\205', '\206', '\207',
  '\210', '\211', '\212', '\213', '\214', '\215', '\216', '\217',
  '\220', '\221', '\222', '\223', '\224', '\225', '\226', '\227',
  '\230', '\231', '\232', '\233', '\234', '\235', '\236', '\237',
  '\240', '\241', '\242', '\243', '\244', '\245', '\246', '\247',
  '\250', '\251', '\252', '\253', '\254', '\255', '\256', '\257',
  '\260', '\261', '\262', '\263', '\264', '\265', '\266', '\267',
  '\270', '\271', '\272', '\273', '\274', '\275', '\276', '\277',
  '\300', '\301', '\302', '\303', '\304', '\305', '\306', '\307',
  '\310', '\311', '\312', '\313', '\314', '\315', '\316', '\317',
  '\320', '\321', '\322', '\323', '\324', '\325', '\326', '\327',
  '\330', '\331', '\332', '\333', '\334', '\335', '\336', '\337',
  '\340', '\341', '\342', '\343', '\344', '\345', '\346', '\347',
  '\350', '\351', '\352', '\353', '\354', '\355', '\356', '\357',
  '\360', '\361', '\362', '\363', '\364', '\365', '\366', '\367',
  '\370', '\371', '\372', '\373', '\374', '\375', '\376', '\377',
};
/* Per-byte syntax class table; filled in by init_syntax_once().  */
static char re_syntax_table[256];
static void init_syntax_once _((void));
/* Active translation table; the declaration is completed on the next
   line.  */
static const unsigned 
char *translate = (const unsigned char*)casetable;static void init_regs _((struct mbre_registers*, unsigned int));static void bm_init_skip _((int *, unsigned char*, int, const unsigned char*));#if 0static int current_mbctype = MBCTYPE_ASCII;#endif#undef Pstatic unsigned longscan_oct(start, len, retlen)const char *start;int len;int *retlen;{ register const char *s = start; register unsigned long retval = 0; while (len-- && *s >= '0' && *s <= '7') { retval <<= 3; retval |= *s++ - '0'; } *retlen = s - start; return retval;}static unsigned longscan_hex(start, len, retlen)const char *start;int len;int *retlen;{ static char hexdigit[] = "0123456789abcdef0123456789ABCDEFx"; register const char *s = start; register unsigned long retval = 0; char *tmp; while (len-- && *s && (tmp = strchr(hexdigit, *s))) { retval <<= 4; retval |= (tmp - hexdigit) & 15; s++; } *retlen = s - start; return retval;}#define rt re_syntax_tablestatic voidinit_syntax_once(){ register int c; static int done = 0;#ifdef ZTS extern MUTEX_T mbregex_locale_mutex;#endif if (done) { return; }#ifdef ZTS tsrm_mutex_lock( mbregex_locale_mutex );#endif memset(re_syntax_table, 0, sizeof(re_syntax_table)); for (c=0; c<=0x7f; c++) { if (isalnum(c)) { re_syntax_table[c] = Sword; } } re_syntax_table['_'] = Sword; for (c=0x80; c<=0xff; c++) { if (isalnum(c)) { re_syntax_table[c] = Sword2; } }#ifdef ZTS tsrm_mutex_unlock( mbregex_locale_mutex );#endif done = 1;}voidre_set_casetable(table) const char *table;{ translate = (const unsigned char*)table;}/* Jim Meyering writes: "... Some ctype macros are valid only for character codes that isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when using /bin/cc or gcc but without giving an ansi option). So, all ctype uses should be through macros like ISPRINT... If STDC_HEADERS is defined, then autoconf has verified that the ctype macros don't need to be guarded with references to isascii. ... 
Defining isascii to 1 should let any compiler worth its salt eliminate the
&& through constant folding." Solaris defines some of these symbols so we
must undefine them first. */
/* ISASCII(c) is the constant 1 when STDC_HEADERS is defined, or when
   neither an `isascii' macro nor HAVE_ISASCII is available; otherwise it
   calls isascii().  */
#undef ISASCII
#if defined STDC_HEADERS || (!defined isascii && !defined HAVE_ISASCII)
# define ISASCII(c) 1
#else
# define ISASCII(c) isascii(c)
#endif
/* When `isblank' is not a macro, fall back to an explicit space/tab
   test.  */
#ifdef isblank
# define ISBLANK(c) (ISASCII(c) && isblank(c))
#else
# define ISBLANK(c) ((c) == ' ' || (c) == '\t')
#endif
/* When `isgraph' is not a macro, approximate it as printable and not
   whitespace.  */
#ifdef isgraph
# define ISGRAPH(c) (ISASCII(c) && isgraph(c))
#else
# define ISGRAPH(c) (ISASCII(c) && isprint(c) && !isspace(c))
#endif
/* The remaining classifiers guard the matching <ctype.h> call with
   ISASCII(c).  Each may evaluate `c' more than once.  */
#undef ISPRINT
#define ISPRINT(c) (ISASCII(c) && isprint(c))
#define ISDIGIT(c) (ISASCII(c) && isdigit(c))
#define ISALNUM(c) (ISASCII(c) && isalnum(c))
#define ISALPHA(c) (ISASCII(c) && isalpha(c))
#define ISCNTRL(c) (ISASCII(c) && iscntrl(c))
#define ISLOWER(c) (ISASCII(c) && islower(c))
#define ISPUNCT(c) (ISASCII(c) && ispunct(c))
#define ISSPACE(c) (ISASCII(c) && isspace(c))
#define ISUPPER(c) (ISASCII(c) && isupper(c))
#define ISXDIGIT(c) (ISASCII(c) && isxdigit(c))
#ifndef NULL
# define NULL (void *)0
#endif
/* We remove any previous definition of `SIGN_EXTEND_CHAR', since ours (we
   hope) works properly with all combinations of machines, compilers, `char'
   and `unsigned char' argument types. (Per Bothner suggested the basic
   approach.) */
/* With a true __STDC__ the value is cast to signed char; otherwise the low
   eight bits are sign-extended arithmetically.  */
#undef SIGN_EXTEND_CHAR
#if __STDC__
# define SIGN_EXTEND_CHAR(c) ((signed char)(c))
#else /* not __STDC__ */
/* As in Harbison and Steele. */
# define SIGN_EXTEND_CHAR(c) ((((unsigned char)(c)) ^ 128) - 128)
#endif
/* These are the command codes that appear in compiled regular expressions,
   one per byte. Some command codes are followed by argument bytes. A command
   code can specify any interpretation whatsoever for its arguments.
   Zero-bytes may appear in the compiled regular expression. The value of
   `exactn' is needed in search.c (search_buffer) in emacs. So regex.h defines
   a symbol `RE_EXACTN_VALUE' to be 1; the value of `exactn' we use here must
   also be 1. 
*/enum regexpcode { unused=0, exactn=1, /* Followed by one byte giving n, then by n literal bytes. */ begline, /* Fail unless at beginning of line. */ endline, /* Fail unless at end of line. */ begbuf, /* Succeeds if at beginning of buffer (if emacs) or at beginning of string to be matched (if not). */ endbuf, /* Analogously, for end of buffer/string. */ endbuf2, /* End of buffer/string, or newline just before it. */ begpos, /* Matches where last scan//gsub left off. */ jump, /* Followed by two bytes giving relative address to jump to. */ jump_past_alt,/* Same as jump, but marks the end of an alternative. */ on_failure_jump, /* Followed by two bytes giving relative address of place to resume at in case of failure. */ finalize_jump, /* Throw away latest failure point and then jump to address. */ maybe_finalize_jump, /* Like jump but finalize if safe to do so. This is used to jump back to the beginning of a repeat. If the command that follows this jump is clearly incompatible with the one at the beginning of the repeat, such that we can be sure that there is no use backtracking out of repetitions already completed, then we finalize. */ dummy_failure_jump, /* Jump, and push a dummy failure point. This failure point will be thrown away if an attempt is made to use it for a failure. A + construct makes this before the first repeat. Also use it as an intermediary kind of jump when compiling an or construct. */ push_dummy_failure, /* Push a dummy failure point and continue. Used at the end of alternatives. */ succeed_n, /* Used like on_failure_jump except has to succeed n times; then gets turned into an on_failure_jump. The relative address following it is useless until then. The address is followed by two bytes containing n. */ jump_n, /* Similar to jump, but jump n times only; also the relative address following is in turn followed by yet two more bytes containing n. */ try_next, /* Jump to next pattern for the first time, leaving this pattern on the failure stack. 
*/ finalize_push, /* Finalize stack and push the beginning of the pattern on the stack to retry (used for non-greedy match) */ finalize_push_n, /* Similar to finalize_push, buf finalize n time only */ set_number_at, /* Set the following relative location to the subsequent number. */ anychar, /* Matches any (more or less) one character excluding newlines. */ anychar_repeat, /* Matches sequence of characters excluding newlines. */ charset, /* Matches any one char belonging to specified set. First following byte is number of bitmap bytes. Then come bytes for a bitmap saying which chars are in. Bits in each byte are ordered low-bit-first. A character is in the set if its bit is 1. A character too large to have a bit in the map is automatically not in the set. */ charset_not, /* Same parameters as charset, but match any character that is not one of those specified. */ start_memory, /* Start remembering the text that is matched, for storing in a memory register. Followed by one byte containing the register number. Register numbers must be in the range 0 through MBRE_NREGS. */ stop_memory, /* Stop remembering the text that is matched
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -