📄 regex_internal.h
字号:
/* Extended regular expression matching and search library. Copyright (C) 2002, 2003 Free Software Foundation, Inc. This file is part of the GNU C Library. Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. The GNU C Library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. The GNU C Library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with the GNU C Library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. */#ifndef _REGEX_INTERNAL_H#define _REGEX_INTERNAL_H 1#ifdef HAVE_CONFIG_H#include "config.h"#endif#include <assert.h>#include <ctype.h>#include <limits.h>#include <stdio.h>#include <stdlib.h>#include <string.h>#if defined HAVE_LOCALE_H || defined _LIBC# include <locale.h>#endif#if defined HAVE_WCHAR_H || defined _LIBC# include <wchar.h>#endif /* HAVE_WCHAR_H || _LIBC */#if defined HAVE_WCTYPE_H || defined _LIBC# include <wctype.h>#endif /* HAVE_WCTYPE_H || _LIBC *//* In case that the system doesn't have isblank(). */#if !defined _LIBC && !defined HAVE_ISBLANK && !defined isblank# define isblank(ch) ((ch) == ' ' || (ch) == '\t')#endif#ifdef _LIBC# ifndef _RE_DEFINE_LOCALE_FUNCTIONS# define _RE_DEFINE_LOCALE_FUNCTIONS 1# include <locale/localeinfo.h># include <locale/elem-hash.h># include <locale/coll-lookup.h># endif#endif/* This is for other GNU distributions with internationalized messages. */#if HAVE_LIBINTL_H || defined _LIBC# include <libintl.h># ifdef _LIBC# undef gettext# define gettext(msgid) \ INTUSE(__dcgettext) (INTUSE(_libc_intl_domainname), msgid, LC_MESSAGES)# endif#else# define gettext(msgid) (msgid)#endif#ifndef gettext_noop/* This define is so xgettext can find the internationalizable strings. */# define gettext_noop(String) String#endif#if (defined MB_CUR_MAX && HAVE_LOCALE_H && HAVE_WCTYPE_H && HAVE_WCHAR_H && HAVE_WCRTOMB && HAVE_MBRTOWC && HAVE_WCSCOLL) || _LIBC# define RE_ENABLE_I18N#endif#if __GNUC__ >= 3# define BE(expr, val) __builtin_expect (expr, val)#else# define BE(expr, val) (expr)# define inline#endif/* Number of bits in a byte. */#define BYTE_BITS 8/* Number of single byte character. */#define SBC_MAX 256#define COLL_ELEM_LEN_MAX 8/* The character which represents newline. */#define NEWLINE_CHAR '\n'#define WIDE_NEWLINE_CHAR L'\n'/* Rename to standard API for using out of glibc. */#ifndef _LIBC# define __wctype wctype# define __iswctype iswctype# define __btowc btowc# define __mempcpy mempcpy# define __wcrtomb wcrtomb# define attribute_hidden#endif /* not _LIBC */extern const char __re_error_msgid[] attribute_hidden;extern const size_t __re_error_msgid_idx[] attribute_hidden;/* Number of bits in an unsinged int. */#define UINT_BITS (sizeof (unsigned int) * BYTE_BITS)/* Number of unsigned int in an bit_set. */#define BITSET_UINTS ((SBC_MAX + UINT_BITS - 1) / UINT_BITS)typedef unsigned int bitset[BITSET_UINTS];typedef unsigned int *re_bitset_ptr_t;#define bitset_set(set,i) (set[i / UINT_BITS] |= 1 << i % UINT_BITS)#define bitset_clear(set,i) (set[i / UINT_BITS] &= ~(1 << i % UINT_BITS))#define bitset_contain(set,i) (set[i / UINT_BITS] & (1 << i % UINT_BITS))#define bitset_empty(set) memset (set, 0, sizeof (unsigned int) * BITSET_UINTS)#define bitset_set_all(set) \ memset (set, 255, sizeof (unsigned int) * BITSET_UINTS)#define bitset_copy(dest,src) \ memcpy (dest, src, sizeof (unsigned int) * BITSET_UINTS)static inline void bitset_not (bitset set);static inline void bitset_merge (bitset dest, const bitset src);static inline void bitset_not_merge (bitset dest, const bitset src);#define PREV_WORD_CONSTRAINT 0x0001#define PREV_NOTWORD_CONSTRAINT 0x0002#define NEXT_WORD_CONSTRAINT 0x0004#define NEXT_NOTWORD_CONSTRAINT 0x0008#define PREV_NEWLINE_CONSTRAINT 0x0010#define NEXT_NEWLINE_CONSTRAINT 0x0020#define PREV_BEGBUF_CONSTRAINT 0x0040#define NEXT_ENDBUF_CONSTRAINT 0x0080#define DUMMY_CONSTRAINT 0x0100typedef enum{ INSIDE_WORD = PREV_WORD_CONSTRAINT | NEXT_WORD_CONSTRAINT, WORD_FIRST = PREV_NOTWORD_CONSTRAINT | NEXT_WORD_CONSTRAINT, WORD_LAST = PREV_WORD_CONSTRAINT | NEXT_NOTWORD_CONSTRAINT, LINE_FIRST = PREV_NEWLINE_CONSTRAINT, LINE_LAST = NEXT_NEWLINE_CONSTRAINT, BUF_FIRST = PREV_BEGBUF_CONSTRAINT, BUF_LAST = NEXT_ENDBUF_CONSTRAINT, WORD_DELIM = DUMMY_CONSTRAINT} re_context_type;typedef struct{ int alloc; int nelem; int *elems;} re_node_set;typedef enum{ NON_TYPE = 0, /* Token type, these are used only by token. */ OP_OPEN_BRACKET, OP_CLOSE_BRACKET, OP_CHARSET_RANGE, OP_OPEN_DUP_NUM, OP_CLOSE_DUP_NUM, OP_NON_MATCH_LIST, OP_OPEN_COLL_ELEM, OP_CLOSE_COLL_ELEM, OP_OPEN_EQUIV_CLASS, OP_CLOSE_EQUIV_CLASS, OP_OPEN_CHAR_CLASS, OP_CLOSE_CHAR_CLASS, OP_WORD, OP_NOTWORD, BACK_SLASH, /* Tree type, these are used only by tree. */ CONCAT, ALT, SUBEXP, SIMPLE_BRACKET,#ifdef RE_ENABLE_I18N COMPLEX_BRACKET,#endif /* RE_ENABLE_I18N */ /* Node type, These are used by token, node, tree. */ OP_OPEN_SUBEXP, OP_CLOSE_SUBEXP, OP_PERIOD, CHARACTER, END_OF_RE, OP_ALT, OP_DUP_ASTERISK, OP_DUP_PLUS, OP_DUP_QUESTION, OP_BACK_REF, ANCHOR, /* Dummy marker. */ END_OF_RE_TOKEN_T} re_token_type_t;#ifdef RE_ENABLE_I18Ntypedef struct{ /* Multibyte characters. */ wchar_t *mbchars; /* Collating symbols. */# ifdef _LIBC int32_t *coll_syms;# endif /* Equivalence classes. */# ifdef _LIBC int32_t *equiv_classes;# endif /* Range expressions. */# ifdef _LIBC uint32_t *range_starts; uint32_t *range_ends;# else /* not _LIBC */ wchar_t *range_starts; wchar_t *range_ends;# endif /* not _LIBC */ /* Character classes. */ wctype_t *char_classes; /* If this character set is the non-matching list. */ unsigned int non_match : 1; /* # of multibyte characters. */ int nmbchars; /* # of collating symbols. */ int ncoll_syms; /* # of equivalence classes. */ int nequiv_classes; /* # of range expressions. */ int nranges; /* # of character classes. */ int nchar_classes;} re_charset_t;#endif /* RE_ENABLE_I18N */typedef struct{ union { unsigned char c; /* for CHARACTER */ re_bitset_ptr_t sbcset; /* for SIMPLE_BRACKET */#ifdef RE_ENABLE_I18N re_charset_t *mbcset; /* for COMPLEX_BRACKET */#endif /* RE_ENABLE_I18N */ int idx; /* for BACK_REF */ re_context_type ctx_type; /* for ANCHOR */ } opr;#if __GNUC__ >= 2 re_token_type_t type : 8;#else re_token_type_t type;#endif unsigned int constraint : 10; /* context constraint */ unsigned int duplicated : 1;#ifdef RE_ENABLE_I18N unsigned int mb_partial : 1;#endif} re_token_t;#define IS_EPSILON_NODE(type) \ ((type) == OP_ALT || (type) == OP_DUP_ASTERISK || (type) == OP_DUP_PLUS \ || (type) == OP_DUP_QUESTION || (type) == ANCHOR \ || (type) == OP_OPEN_SUBEXP || (type) == OP_CLOSE_SUBEXP)#define ACCEPT_MB_NODE(type) \ ((type) == COMPLEX_BRACKET || (type) == OP_PERIOD)struct re_string_t{ /* Indicate the raw buffer which is the original string passed as an argument of regexec(), re_search(), etc.. */ const unsigned char *raw_mbs; /* Store the multibyte string. In case of "case insensitive mode" like REG_ICASE, upper cases of the string are stored, otherwise MBS points the same address that RAW_MBS points. */ unsigned char *mbs; /* Store the case sensitive multibyte string. In case of "case insensitive mode", the original string are stored, otherwise MBS_CASE points the same address that MBS points. */ unsigned char *mbs_case;#ifdef RE_ENABLE_I18N /* Store the wide character string which is corresponding to MBS. */ wint_t *wcs; mbstate_t cur_state;#endif /* Index in RAW_MBS. Each character mbs[i] corresponds to raw_mbs[raw_mbs_idx + i]. */ int raw_mbs_idx; /* The length of the valid characters in the buffers. */ int valid_len; /* The length of the buffers MBS, MBS_CASE, and WCS. */ int bufs_len; /* The index in MBS, which is updated by re_string_fetch_byte. */ int cur_idx; /* This is length_of_RAW_MBS - RAW_MBS_IDX. */ int len; /* End of the buffer may be shorter than its length in the cases such as re_match_2, re_search_2. Then, we use STOP for end of the buffer instead of LEN. */ int stop; /* The context of mbs[0]. We store the context independently, since the context of mbs[0] may be different from raw_mbs[0], which is the beginning of the input string. */ unsigned int tip_context; /* The translation passed as a part of an argument of re_compile_pattern. */ RE_TRANSLATE_TYPE trans; /* 1 if REG_ICASE. */ unsigned int icase : 1;};typedef struct re_string_t re_string_t;/* In case of REG_ICASE, we allocate the buffer dynamically for mbs. */#define MBS_ALLOCATED(pstr) (pstr->icase)/* In case that we need translation, we allocate the buffer dynamically for mbs_case. Note that mbs == mbs_case if not REG_ICASE. */#define MBS_CASE_ALLOCATED(pstr) (pstr->trans != NULL)static reg_errcode_t re_string_allocate (re_string_t *pstr, const char *str, int len, int init_len, RE_TRANSLATE_TYPE trans, int icase);static reg_errcode_t re_string_construct (re_string_t *pstr, const char *str, int len, RE_TRANSLATE_TYPE trans, int icase);static reg_errcode_t re_string_reconstruct (re_string_t *pstr, int idx, int eflags, int newline);static reg_errcode_t re_string_realloc_buffers (re_string_t *pstr, int new_buf_len);#ifdef RE_ENABLE_I18Nstatic void build_wcs_buffer (re_string_t *pstr);static void build_wcs_upper_buffer (re_string_t *pstr);#endif /* RE_ENABLE_I18N */static void build_upper_buffer (re_string_t *pstr);static void re_string_translate_buffer (re_string_t *pstr);static void re_string_destruct (re_string_t *pstr);#ifdef RE_ENABLE_I18Nstatic int re_string_elem_size_at (const re_string_t *pstr, int idx);static inline int re_string_char_size_at (const re_string_t *pstr, int idx);static inline wint_t re_string_wchar_at (const re_string_t *pstr, int idx);#endif /* RE_ENABLE_I18N */static unsigned int re_string_context_at (const re_string_t *input, int idx, int eflags, int newline_anchor);#define re_string_peek_byte(pstr, offset) \ ((pstr)->mbs[(pstr)->cur_idx + offset])#define re_string_peek_byte_case(pstr, offset) \ ((pstr)->mbs_case[(pstr)->cur_idx + offset])#define re_string_fetch_byte(pstr) \ ((pstr)->mbs[(pstr)->cur_idx++])#define re_string_fetch_byte_case(pstr) \ ((pstr)->mbs_case[(pstr)->cur_idx++])#define re_string_first_byte(pstr, idx) \ ((idx) == (pstr)->len || (pstr)->wcs[idx] != WEOF)#define re_string_is_single_byte_char(pstr, idx) \
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -