📄 ure.c
字号:
range.min_code = c; range_end = 1; } else last = c; } else if (range_end == 1) { range.max_code = c; _ure_add_range(cclp, &range, b); range_end = 0; } else { range.min_code = range.max_code = c; if (*sp == '-') { sp++; range_end = 1; } else _ure_add_range(cclp, &range, b); } } if (sp < ep && *sp == ']') sp++; else /* * The parse was not terminated by the character class close symbol * (']'), so set an error code. */ b->error = _URE_CCLASS_OPEN; return sp - cp;}/* * Probe for a low surrogate hex code. */static unsigned long_ure_probe_ls(ucs2_t *ls, unsigned long limit, ucs4_t *c){ ucs4_t i, code; ucs2_t *sp, *ep; for (i = code = 0, sp = ls, ep = sp + limit; i < 4 && sp < ep; sp++) { if (*sp >= '0' && *sp <= '9') code = (code << 4) + (*sp - '0'); else if (*sp >= 'A' && *sp <= 'F') code = (code << 4) + ((*sp - 'A') + 10); else if (*sp >= 'a' && *sp <= 'f') code = (code << 4) + ((*sp - 'a') + 10); else break; } *c = code; return (0xdc00 <= code && code <= 0xdfff) ? sp - ls : 0;}static unsigned long_ure_compile_symbol(ucs2_t *sym, unsigned long limit, _ure_symtab_t *symp, _ure_buffer_t *b){ ucs4_t c; ucs2_t *sp, *ep; sp = sym; ep = sym + limit; if ((c = *sp++) == '\\') { if (sp == ep) { /* * The EOS was encountered when expecting the reverse solidus to * be followed by the character it is escaping. Set an error code * and return the number of characters consumed up to this point. */ b->error = _URE_UNEXPECTED_EOS; return sp - sym; } c = *sp++; switch (c) { case 'p': case 'P': symp->type = (c == 'p') ? _URE_CCLASS : _URE_NCCLASS; sp += _ure_prop_list(sp, ep - sp, &symp->props, b); break; case 'a': symp->type = _URE_CHAR; symp->sym.chr = 0x07; break; case 'b': symp->type = _URE_CHAR; symp->sym.chr = 0x08; break; case 'f': symp->type = _URE_CHAR; symp->sym.chr = 0x0c; break; case 'n': symp->type = _URE_CHAR; symp->sym.chr = 0x0a; break; case 'r': symp->type = _URE_CHAR; symp->sym.chr = 0x0d; break; case 't': symp->type = _URE_CHAR; symp->sym.chr = 0x09; break; case 'v': symp->type = _URE_CHAR; symp->sym.chr = 0x0b; break; case 'x': case 'X': case 'u': case 'U': /* * Collect between 1 and 4 digits representing a UCS2 code. Fall * through to the next case. */ if (sp < ep && ((*sp >= '0' && *sp <= '9') || (*sp >= 'A' && *sp <= 'F') || (*sp >= 'a' && *sp <= 'f'))) sp += _ure_hex(sp, ep - sp, &c); /* FALLTHROUGH */ default: /* * Simply add an escaped character here. */ symp->type = _URE_CHAR; symp->sym.chr = c; } } else if (c == '^' || c == '$') /* * Handle the BOL and EOL anchors. This actually consists simply of * setting a flag that indicates that the user supplied anchor match * function should be called. This needs to be done instead of simply * matching line/paragraph separators because beginning-of-text and * end-of-text tests are needed as well. */ symp->type = (c == '^') ? _URE_BOL_ANCHOR : _URE_EOL_ANCHOR; else if (c == '[') /* * Construct a character class. */ sp += _ure_cclass(sp, ep - sp, symp, b); else if (c == '.') symp->type = _URE_ANY_CHAR; else { symp->type = _URE_CHAR; symp->sym.chr = c; } /* * If the symbol type happens to be a character and is a high surrogate, * then probe forward to see if it is followed by a low surrogate that * needs to be added. */ if (sp < ep && symp->type == _URE_CHAR && 0xd800 <= symp->sym.chr && symp->sym.chr <= 0xdbff) { if (0xdc00 <= *sp && *sp <= 0xdfff) { symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | (*sp & 0x03ff)); sp++; } else if (*sp == '\\' && (*(sp + 1) == 'x' || *(sp + 1) == 'X' || *(sp + 1) == 'u' || *(sp + 1) == 'U')) { sp += _ure_probe_ls(sp + 2, ep - (sp + 2), &c); if (0xdc00 <= c && c <= 0xdfff) { /* * Take into account the \[xu] in front of the hex code. */ sp += 2; symp->sym.chr = 0x10000 + (((symp->sym.chr & 0x03ff) << 10) | (c & 0x03ff)); } } } /* * Last, make sure any _URE_CHAR type symbols are changed to lower case if * the `casefold' flag is set. */ if ((b->flags & _URE_DFA_CASEFOLD) && symp->type == _URE_CHAR) symp->sym.chr = _ure_tolower(symp->sym.chr); /* * If the symbol constructed is anything other than one of the anchors, * make sure the _URE_DFA_BLANKLINE flag is removed. */ if (symp->type != _URE_BOL_ANCHOR && symp->type != _URE_EOL_ANCHOR) b->flags &= ~_URE_DFA_BLANKLINE; /* * Return the number of characters consumed. */ return sp - sym;}static int_ure_sym_neq(_ure_symtab_t *a, _ure_symtab_t *b){ if (a->type != b->type || a->mods != b->mods || a->props != b->props) return 1; if (a->type == _URE_CCLASS || a->type == _URE_NCCLASS) { if (a->sym.ccl.ranges_used != b->sym.ccl.ranges_used) return 1; if (a->sym.ccl.ranges_used > 0 && memcmp((char *) a->sym.ccl.ranges, (char *) b->sym.ccl.ranges, sizeof(_ure_range_t) * a->sym.ccl.ranges_used) != 0) return 1; } else if (a->type == _URE_CHAR && a->sym.chr != b->sym.chr) return 1; return 0;}/* * Construct a symbol, but only keep unique symbols. */static ucs2_t_ure_make_symbol(ucs2_t *sym, unsigned long limit, unsigned long *consumed, _ure_buffer_t *b){ ucs2_t i; _ure_symtab_t *sp, symbol; /* * Build the next symbol so we can test to see if it is already in the * symbol table. */ (void) memset((char *) &symbol, '\0', sizeof(_ure_symtab_t)); *consumed = _ure_compile_symbol(sym, limit, &symbol, b); /* * Check to see if the symbol exists. */ for (i = 0, sp = b->symtab; i < b->symtab_used && _ure_sym_neq(&symbol, sp); i++, sp++) ; if (i < b->symtab_used) { /* * Free up any ranges used for the symbol. */ if ((symbol.type == _URE_CCLASS || symbol.type == _URE_NCCLASS) && symbol.sym.ccl.ranges_size > 0) free((char *) symbol.sym.ccl.ranges); return b->symtab[i].id; } /* * Need to add the new symbol. */ if (b->symtab_used == b->symtab_size) { if (b->symtab_size == 0) b->symtab = (_ure_symtab_t *) malloc(sizeof(_ure_symtab_t) << 3); else b->symtab = (_ure_symtab_t *) realloc((char *) b->symtab, sizeof(_ure_symtab_t) * (b->symtab_size + 8)); sp = b->symtab + b->symtab_size; (void) memset((char *) sp, '\0', sizeof(_ure_symtab_t) << 3); b->symtab_size += 8; } symbol.id = b->symtab_used++; (void) AC_MEMCPY((char *) &b->symtab[symbol.id], (char *) &symbol, sizeof(_ure_symtab_t)); return symbol.id;}/************************************************************************* * * End symbol parse functions. * *************************************************************************/static ucs2_t_ure_make_expr(ucs2_t type, ucs2_t lhs, ucs2_t rhs, _ure_buffer_t *b){ ucs2_t i; if (b == 0) return _URE_NOOP; /* * Determine if the expression already exists or not. */ for (i = 0; i < b->expr_used; i++) { if (b->expr[i].type == type && b->expr[i].lhs == lhs && b->expr[i].rhs == rhs) break; } if (i < b->expr_used) return i; /* * Need to add a new expression. */ if (b->expr_used == b->expr_size) { if (b->expr_size == 0) b->expr = (_ure_elt_t *) malloc(sizeof(_ure_elt_t) << 3); else b->expr = (_ure_elt_t *) realloc((char *) b->expr, sizeof(_ure_elt_t) * (b->expr_size + 8)); b->expr_size += 8; } b->expr[b->expr_used].onstack = 0; b->expr[b->expr_used].type = type; b->expr[b->expr_used].lhs = lhs; b->expr[b->expr_used].rhs = rhs; return b->expr_used++;}static unsigned char spmap[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,};#define _ure_isspecial(cc) ((cc) > 0x20 && (cc) < 0x7f && \ (spmap[(cc) >> 3] & (1 << ((cc) & 7))))/* * Convert the regular expression into an NFA in a form that will be easy to * reduce to a DFA. The starting state for the reduction will be returned. */static ucs2_t_ure_re2nfa(ucs2_t *re, unsigned long relen, _ure_buffer_t *b){ ucs2_t c, state, top, sym, *sp, *ep; unsigned long used; state = _URE_NOOP; sp = re; ep = sp + relen; while (b->error == _URE_OK && sp < ep) { c = *sp++; switch (c) { case '(': _ure_push(_URE_PAREN, b); break; case ')': /* * Check for the case of too many close parentheses. */ if (_ure_peek(b) == _URE_NOOP) { b->error = _URE_UNBALANCED_GROUP; break; } while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) /* * Make an expression with the AND or OR operator and its right * hand side. */ state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); /* * Remove the _URE_PAREN off the stack. */ (void) _ure_pop(b); break; case '*': state = _ure_make_expr(_URE_STAR, state, _URE_NOOP, b); break; case '+': state = _ure_make_expr(_URE_PLUS, state, _URE_NOOP, b); break; case '?': state = _ure_make_expr(_URE_QUEST, state, _URE_NOOP, b); break; case '|': while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) /* * Make an expression with the AND or OR operator and its right * hand side. */ state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); _ure_push(state, b); _ure_push(_URE_OR, b); break; default: sp--; sym = _ure_make_symbol(sp, ep - sp, &used, b); sp += used; state = _ure_make_expr(_URE_SYMBOL, sym, _URE_NOOP, b); break; } if (c != '(' && c != '|' && sp < ep && (!_ure_isspecial(*sp) || *sp == '(')) { _ure_push(state, b); _ure_push(_URE_AND, b); } } while ((top = _ure_peek(b)) == _URE_AND || top == _URE_OR) /* * Make an expression with the AND or OR operator and its right * hand side. */ state = _ure_make_expr(_ure_pop(b), _ure_pop(b), state, b); if (b->stack.slist_used > 0) b->error = _URE_UNBALANCED_GROUP; return (b->error == _URE_OK) ? state : _URE_NOOP;}static void_ure_add_symstate(ucs2_t sym, ucs2_t state, _ure_buffer_t *b){ ucs2_t i, *stp; _ure_symtab_t *sp; /* * Locate the symbol in the symbol table so the state can be added. * If the symbol doesn't exist, then a real problem exists. */ for (i = 0, sp = b->symtab; i < b->symtab_used && sym != sp->id;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -