📄 ure.c
字号:
* Return the number of characters consumed. */ return sp - pp;}/* * Collect a hex number with 1 to 4 digits and return the number * of characters used. */static unsigned long_ure_hex(ucs2_t *np, unsigned long limit, ucs4_t *n){ ucs2_t i; ucs2_t *sp, *ep; ucs4_t nn; sp = np; ep = sp + limit; for (nn = 0, i = 0; i < 4 && sp < ep; i++, sp++) { if (*sp >= '0' && *sp <= '9') nn = (nn << 4) + (*sp - '0'); else if (*sp >= 'A' && *sp <= 'F') nn = (nn << 4) + ((*sp - 'A') + 10); else if (*sp >= 'a' && *sp <= 'f') nn = (nn << 4) + ((*sp - 'a') + 10); else /* * Encountered something that is not a hex digit. */ break; } /* * Assign the character code collected and return the number of * characters used. */ *n = nn; return sp - np;}/* * Insert a range into a character class, removing duplicates and ordering * them in increasing range-start order. */static void_ure_add_range(_ure_ccl_t *ccl, _ure_range_t *r, _ure_buffer_t *b){ ucs2_t i; ucs4_t tmp; _ure_range_t *rp; /* * If the `casefold' flag is set, then make sure both endpoints of the * range are converted to lower case. */ if (b->flags & _URE_DFA_CASEFOLD) { r->min_code = _ure_tolower(r->min_code); r->max_code = _ure_tolower(r->max_code); } /* * Swap the range endpoints if they are not in increasing order. */ if (r->min_code > r->max_code) { tmp = r->min_code; r->min_code = r->max_code; r->max_code = tmp; } for (i = 0, rp = ccl->ranges; i < ccl->ranges_used && r->min_code < rp->min_code; i++, rp++) ; /* * Check for a duplicate. */ if (i < ccl->ranges_used && r->min_code == rp->min_code && r->max_code == rp->max_code) return; if (ccl->ranges_used == ccl->ranges_size) { if (ccl->ranges_size == 0) ccl->ranges = (_ure_range_t *) malloc(sizeof(_ure_range_t) << 3); else ccl->ranges = (_ure_range_t *) realloc((char *) ccl->ranges, sizeof(_ure_range_t) * (ccl->ranges_size + 8)); ccl->ranges_size += 8; } rp = ccl->ranges + ccl->ranges_used; if (i < ccl->ranges_used) _ure_memmove((char *) (rp + 1), (char *) rp, sizeof(_ure_range_t) * (ccl->ranges_used - i)); ccl->ranges_used++; rp->min_code = r->min_code; rp->max_code = r->max_code;}#define _URE_ALPHA_MASK (_URE_UPPER|_URE_LOWER|_URE_OTHERLETTER|\_URE_MODIFIER|_URE_TITLE|_URE_NONSPACING|_URE_COMBINING)#define _URE_ALNUM_MASK (_URE_ALPHA_MASK|_URE_NUMDIGIT)#define _URE_PUNCT_MASK (_URE_DASHPUNCT|_URE_OPENPUNCT|_URE_CLOSEPUNCT|\_URE_OTHERPUNCT)#define _URE_GRAPH_MASK (_URE_NUMDIGIT|_URE_NUMOTHER|_URE_ALPHA_MASK|\_URE_MATHSYM|_URE_CURRENCYSYM|_URE_OTHERSYM)#define _URE_PRINT_MASK (_URE_GRAPH_MASK|_URE_SPACESEP)#define _URE_SPACE_MASK (_URE_SPACESEP|_URE_LINESEP|_URE_PARASEP)typedef void (*_ure_cclsetup_t)( _ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b);typedef struct { ucs2_t key; unsigned long len; unsigned long next; _ure_cclsetup_t func; unsigned long mask;} _ure_trie_t;static void_ure_ccl_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b){ sym->props |= mask;}static void_ure_space_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b){ _ure_range_t range; sym->props |= mask; /* * Add the additional characters needed for handling isspace(). */ range.min_code = range.max_code = '\t'; _ure_add_range(&sym->sym.ccl, &range, b); range.min_code = range.max_code = '\r'; _ure_add_range(&sym->sym.ccl, &range, b); range.min_code = range.max_code = '\n'; _ure_add_range(&sym->sym.ccl, &range, b); range.min_code = range.max_code = '\f'; _ure_add_range(&sym->sym.ccl, &range, b); range.min_code = range.max_code = 0xfeff; _ure_add_range(&sym->sym.ccl, &range, b);}static void_ure_xdigit_setup(_ure_symtab_t *sym, unsigned long mask, _ure_buffer_t *b){ _ure_range_t range; /* * Add the additional characters needed for handling isxdigit(). */ range.min_code = '0'; range.max_code = '9'; _ure_add_range(&sym->sym.ccl, &range, b); range.min_code = 'A'; range.max_code = 'F'; _ure_add_range(&sym->sym.ccl, &range, b); range.min_code = 'a'; range.max_code = 'f'; _ure_add_range(&sym->sym.ccl, &range, b);}static _ure_trie_t cclass_trie[] = { {0x003a, 1, 1, 0, 0}, {0x0061, 9, 10, 0, 0}, {0x0063, 8, 19, 0, 0}, {0x0064, 7, 24, 0, 0}, {0x0067, 6, 29, 0, 0}, {0x006c, 5, 34, 0, 0}, {0x0070, 4, 39, 0, 0}, {0x0073, 3, 49, 0, 0}, {0x0075, 2, 54, 0, 0}, {0x0078, 1, 59, 0, 0}, {0x006c, 1, 11, 0, 0}, {0x006e, 2, 13, 0, 0}, {0x0070, 1, 16, 0, 0}, {0x0075, 1, 14, 0, 0}, {0x006d, 1, 15, 0, 0}, {0x003a, 1, 16, _ure_ccl_setup, _URE_ALNUM_MASK}, {0x0068, 1, 17, 0, 0}, {0x0061, 1, 18, 0, 0}, {0x003a, 1, 19, _ure_ccl_setup, _URE_ALPHA_MASK}, {0x006e, 1, 20, 0, 0}, {0x0074, 1, 21, 0, 0}, {0x0072, 1, 22, 0, 0}, {0x006c, 1, 23, 0, 0}, {0x003a, 1, 24, _ure_ccl_setup, _URE_CNTRL}, {0x0069, 1, 25, 0, 0}, {0x0067, 1, 26, 0, 0}, {0x0069, 1, 27, 0, 0}, {0x0074, 1, 28, 0, 0}, {0x003a, 1, 29, _ure_ccl_setup, _URE_NUMDIGIT}, {0x0072, 1, 30, 0, 0}, {0x0061, 1, 31, 0, 0}, {0x0070, 1, 32, 0, 0}, {0x0068, 1, 33, 0, 0}, {0x003a, 1, 34, _ure_ccl_setup, _URE_GRAPH_MASK}, {0x006f, 1, 35, 0, 0}, {0x0077, 1, 36, 0, 0}, {0x0065, 1, 37, 0, 0}, {0x0072, 1, 38, 0, 0}, {0x003a, 1, 39, _ure_ccl_setup, _URE_LOWER}, {0x0072, 2, 41, 0, 0}, {0x0075, 1, 45, 0, 0}, {0x0069, 1, 42, 0, 0}, {0x006e, 1, 43, 0, 0}, {0x0074, 1, 44, 0, 0}, {0x003a, 1, 45, _ure_ccl_setup, _URE_PRINT_MASK}, {0x006e, 1, 46, 0, 0}, {0x0063, 1, 47, 0, 0}, {0x0074, 1, 48, 0, 0}, {0x003a, 1, 49, _ure_ccl_setup, _URE_PUNCT_MASK}, {0x0070, 1, 50, 0, 0}, {0x0061, 1, 51, 0, 0}, {0x0063, 1, 52, 0, 0}, {0x0065, 1, 53, 0, 0}, {0x003a, 1, 54, _ure_space_setup, _URE_SPACE_MASK}, {0x0070, 1, 55, 0, 0}, {0x0070, 1, 56, 0, 0}, {0x0065, 1, 57, 0, 0}, {0x0072, 1, 58, 0, 0}, {0x003a, 1, 59, _ure_ccl_setup, _URE_UPPER}, {0x0064, 1, 60, 0, 0}, {0x0069, 1, 61, 0, 0}, {0x0067, 1, 62, 0, 0}, {0x0069, 1, 63, 0, 0}, {0x0074, 1, 64, 0, 0}, {0x003a, 1, 65, _ure_xdigit_setup, 0},};/* * Probe for one of the POSIX colon delimited character classes in the static * trie. */static unsigned long_ure_posix_ccl(ucs2_t *cp, unsigned long limit, _ure_symtab_t *sym, _ure_buffer_t *b){ int i; unsigned long n; _ure_trie_t *tp; ucs2_t *sp, *ep; /* * If the number of characters left is less than 7, then this cannot be * interpreted as one of the colon delimited classes. */ if (limit < 7) return 0; sp = cp; ep = sp + limit; tp = cclass_trie; for (i = 0; sp < ep && i < 8; i++, sp++) { n = tp->len; for (; n > 0 && tp->key != *sp; tp++, n--) ; if (n == 0) return 0; if (*sp == ':' && (i == 6 || i == 7)) { sp++; break; } if (sp + 1 < ep) tp = cclass_trie + tp->next; } if (tp->func == 0) return 0; (*tp->func)(sym, tp->mask, b); return sp - cp;}/* * Construct a list of ranges and return the number of characters consumed. */static unsigned long_ure_cclass(ucs2_t *cp, unsigned long limit, _ure_symtab_t *symp, _ure_buffer_t *b){ int range_end; unsigned long n; ucs2_t *sp, *ep; ucs4_t c, last; _ure_ccl_t *cclp; _ure_range_t range; sp = cp; ep = sp + limit; if (*sp == '^') { symp->type = _URE_NCCLASS; sp++; } else symp->type = _URE_CCLASS; for (last = 0, range_end = 0; b->error == _URE_OK && sp < ep && *sp != ']'; ) { c = *sp++; if (c == '\\') { if (sp == ep) { /* * The EOS was encountered when expecting the reverse solidus * to be followed by the character it is escaping. Set an * error code and return the number of characters consumed up * to this point. */ b->error = _URE_UNEXPECTED_EOS; return sp - cp; } c = *sp++; switch (c) { case 'a': c = 0x07; break; case 'b': c = 0x08; break; case 'f': c = 0x0c; break; case 'n': c = 0x0a; break; case 'r': c = 0x0d; break; case 't': c = 0x09; break; case 'v': c = 0x0b; break; case 'p': case 'P': sp += _ure_prop_list(sp, ep - sp, &symp->props, b); /* * Invert the bit mask of the properties if this is a negated * character class or if 'P' is used to specify a list of * character properties that should *not* match in a * character class. */ if (c == 'P') symp->props = ~symp->props; continue; break; case 'x': case 'X': case 'u': case 'U': if (sp < ep && ((*sp >= '0' && *sp <= '9') || (*sp >= 'A' && *sp <= 'F') || (*sp >= 'a' && *sp <= 'f'))) sp += _ure_hex(sp, ep - sp, &c); } } else if (c == ':') { /* * Probe for a POSIX colon delimited character class. */ sp--; if ((n = _ure_posix_ccl(sp, ep - sp, symp, b)) == 0) sp++; else { sp += n; continue; } } cclp = &symp->sym.ccl; /* * Check to see if the current character is a low surrogate that needs * to be combined with a preceding high surrogate. */ if (last != 0) { if (c >= 0xdc00 && c <= 0xdfff) /* * Construct the UTF16 character code. */ c = 0x10000 + (((last & 0x03ff) << 10) | (c & 0x03ff)); else { /* * Add the isolated high surrogate to the range. */ if (range_end == 1) range.max_code = last & 0xffff; else range.min_code = range.max_code = last & 0xffff; _ure_add_range(cclp, &range, b); range_end = 0; } } /* * Clear the last character code. */ last = 0; /* * This slightly awkward code handles the different cases needed to * construct a range. */ if (c >= 0xd800 && c <= 0xdbff) { /* * If the high surrogate is followed by a range indicator, simply * add it as the range start. Otherwise, save it in case the next * character is a low surrogate. */ if (*sp == '-') { sp++;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -