📄 ure.c
字号:
buf->stack.slist_used = 0; buf->expr_used = 0; for (i = 0; i < buf->symtab_used; i++) buf->symtab[i].states.slist_used = 0; buf->symtab_used = 0; for (i = 0; i < buf->states.states_used; i++) { buf->states.states[i].st.slist_used = 0; buf->states.states[i].trans_used = 0; } buf->states.states_used = 0; /* * Construct the NFA. If this stage returns a 0, then an error occured or * an empty expression was passed. */ if ((state = _ure_re2nfa(re, relen, buf)) == _URE_NOOP) return 0; /* * Do the expression reduction to get the initial DFA. */ _ure_reduce(state, buf); /* * Merge all the equivalent DFA states. */ _ure_merge_equiv(buf); /* * Construct the minimal DFA. */ dfa = (ure_dfa_t) malloc(sizeof(_ure_dfa_t)); (void) memset((char *) dfa, '\0', sizeof(_ure_dfa_t)); dfa->flags = buf->flags & (_URE_DFA_CASEFOLD|_URE_DFA_BLANKLINE); /* * Free up the NFA state groups and transfer the symbols from the buffer * to the DFA. */ for (i = 0; i < buf->symtab_size; i++) { if (buf->symtab[i].states.slist_size > 0) free((char *) buf->symtab[i].states.slist); } dfa->syms = buf->symtab; dfa->nsyms = buf->symtab_used; buf->symtab_used = buf->symtab_size = 0; /* * Collect the total number of states and transitions needed for the DFA. */ for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; i++, sp++) { if (sp->id == state) { dfa->nstates++; dfa->ntrans += sp->trans_used; state++; } } /* * Allocate enough space for the states and transitions. */ dfa->states = (_ure_dstate_t *) malloc(sizeof(_ure_dstate_t) * dfa->nstates); dfa->trans = (_ure_trans_t *) malloc(sizeof(_ure_trans_t) * dfa->ntrans); /* * Actually transfer the DFA states from the buffer. */ dsp = dfa->states; tp = dfa->trans; for (i = state = 0, sp = buf->states.states; i < buf->states.states_used; i++, sp++) { if (sp->id == state) { dsp->trans = tp; dsp->ntrans = sp->trans_used; dsp->accepting = sp->accepting; /* * Add the transitions for the state. */ for (j = 0; j < dsp->ntrans; j++, tp++) { tp->symbol = sp->trans[j].lhs; tp->next_state = buf->states.states[sp->trans[j].rhs].id; } dsp++; state++; } } return dfa;}voidure_dfa_free(ure_dfa_t dfa){ ucs2_t i; if (dfa == 0) return; for (i = 0; i < dfa->nsyms; i++) { if ((dfa->syms[i].type == _URE_CCLASS || dfa->syms[i].type == _URE_NCCLASS) && dfa->syms[i].sym.ccl.ranges_size > 0) free((char *) dfa->syms[i].sym.ccl.ranges); } if (dfa->nsyms > 0) free((char *) dfa->syms); if (dfa->nstates > 0) free((char *) dfa->states); if (dfa->ntrans > 0) free((char *) dfa->trans); free((char *) dfa);}voidure_write_dfa(ure_dfa_t dfa, FILE *out){ ucs2_t i, j, k, h, l; _ure_dstate_t *sp; _ure_symtab_t *sym; _ure_range_t *rp; if (dfa == 0 || out == 0) return; /* * Write all the different character classes. */ for (i = 0, sym = dfa->syms; i < dfa->nsyms; i++, sym++) { if (sym->type == _URE_CCLASS || sym->type == _URE_NCCLASS) { fprintf(out, "C%hd = ", sym->id); if (sym->sym.ccl.ranges_used > 0) { putc('[', out); if (sym->type == _URE_NCCLASS) putc('^', out); } if (sym->props != 0) { if (sym->type == _URE_NCCLASS) fprintf(out, "\\P"); else fprintf(out, "\\p"); for (k = h = 0; k < 32; k++) { if (sym->props & (1 << k)) { if (h != 0) putc(',', out); fprintf(out, "%hd", k + 1); h = 1; } } } /* * Dump the ranges. */ for (k = 0, rp = sym->sym.ccl.ranges; k < sym->sym.ccl.ranges_used; k++, rp++) { /* * Check for UTF16 characters. */ if (0x10000 <= rp->min_code && rp->min_code <= 0x10ffff) { h = (ucs2_t) (((rp->min_code - 0x10000) >> 10) + 0xd800); l = (ucs2_t) (((rp->min_code - 0x10000) & 1023) + 0xdc00); fprintf(out, "\\x%04hX\\x%04hX", h, l); } else fprintf(out, "\\x%04lX", rp->min_code & 0xffff); if (rp->max_code != rp->min_code) { putc('-', out); if (rp->max_code >= 0x10000 && rp->max_code <= 0x10ffff) { h = (ucs2_t) (((rp->max_code - 0x10000) >> 10) + 0xd800); l = (ucs2_t) (((rp->max_code - 0x10000) & 1023) + 0xdc00); fprintf(out, "\\x%04hX\\x%04hX", h, l); } else fprintf(out, "\\x%04lX", rp->max_code & 0xffff); } } if (sym->sym.ccl.ranges_used > 0) putc(']', out); putc('\n', out); } } for (i = 0, sp = dfa->states; i < dfa->nstates; i++, sp++) { fprintf(out, "S%hd = ", i); if (sp->accepting) { fprintf(out, "1 "); if (sp->ntrans) fprintf(out, "| "); } for (j = 0; j < sp->ntrans; j++) { if (j > 0) fprintf(out, "| "); sym = dfa->syms + sp->trans[j].symbol; switch (sym->type) { case _URE_CHAR: if (0x10000 <= sym->sym.chr && sym->sym.chr <= 0x10ffff) { /* * Take care of UTF16 characters. */ h = (ucs2_t) (((sym->sym.chr - 0x10000) >> 10) + 0xd800); l = (ucs2_t) (((sym->sym.chr - 0x10000) & 1023) + 0xdc00); fprintf(out, "\\x%04hX\\x%04hX ", h, l); } else fprintf(out, "\\x%04lX ", sym->sym.chr & 0xffff); break; case _URE_ANY_CHAR: fprintf(out, "<any> "); break; case _URE_BOL_ANCHOR: fprintf(out, "<bol-anchor> "); break; case _URE_EOL_ANCHOR: fprintf(out, "<eol-anchor> "); break; case _URE_CCLASS: case _URE_NCCLASS: fprintf(out, "[C%hd] ", sym->id); break; } fprintf(out, "S%hd", sp->trans[j].next_state); if (j + 1 < sp->ntrans) putc(' ', out); } putc('\n', out); }}#define _ure_issep(cc) ((cc) == '\n' || (cc) == '\r' || (cc) == 0x2028 ||\ (cc) == 0x2029)inture_exec(ure_dfa_t dfa, int flags, ucs2_t *text, unsigned long textlen, unsigned long *match_start, unsigned long *match_end){ int i, j, matched, found, skip; unsigned long ms, me; ucs4_t c; ucs2_t *sp, *ep, *lp; _ure_dstate_t *stp; _ure_symtab_t *sym; _ure_range_t *rp; if (dfa == 0 || text == 0) return 0; /* * Handle the special case of an empty string matching the "^$" pattern. */ if (textlen == 0 && (dfa->flags & _URE_DFA_BLANKLINE)) { *match_start = *match_end = 0; return 1; } sp = text; ep = sp + textlen; ms = me = ~0; stp = dfa->states; for (found = skip = 0; found == 0 && sp < ep; ) { lp = sp; c = *sp++; /* * Check to see if this is a high surrogate that should be * combined with a following low surrogate. */ if (sp < ep && 0xd800 <= c && c <= 0xdbff && 0xdc00 <= *sp && *sp <= 0xdfff) c = 0x10000 + (((c & 0x03ff) << 10) | (*sp++ & 0x03ff)); /* * Determine if the character is non-spacing and should be skipped. */ if (_ure_matches_properties(_URE_NONSPACING, c) && (flags & URE_IGNORE_NONSPACING)) { sp++; continue; } if (dfa->flags & _URE_DFA_CASEFOLD) c = _ure_tolower(c); /* * See if one of the transitions matches. */ for (i = 0, matched = 0; matched == 0 && i < stp->ntrans; i++) { sym = dfa->syms + stp->trans[i].symbol; switch (sym->type) { case _URE_ANY_CHAR: if ((flags & URE_DOT_MATCHES_SEPARATORS) || !_ure_issep(c)) matched = 1; break; case _URE_CHAR: if (c == sym->sym.chr) matched = 1; break; case _URE_BOL_ANCHOR: if (lp == text) { sp = lp; matched = 1; } else if (_ure_issep(c)) { if (c == '\r' && sp < ep && *sp == '\n') sp++; lp = sp; matched = 1; } break; case _URE_EOL_ANCHOR: if (_ure_issep(c)) { /* * Put the pointer back before the separator so the match * end position will be correct. This case will also * cause the `sp' pointer to be advanced over the current * separator once the match end point has been recorded. */ sp = lp; matched = 1; } break; case _URE_CCLASS: case _URE_NCCLASS: if (sym->props != 0) matched = _ure_matches_properties(sym->props, c); for (j = 0, rp = sym->sym.ccl.ranges; j < sym->sym.ccl.ranges_used; j++, rp++) { if (rp->min_code <= c && c <= rp->max_code) matched = 1; } if (sym->type == _URE_NCCLASS) matched = !matched; break; } if (matched) { if (ms == ~0UL) ms = lp - text; else me = sp - text; stp = dfa->states + stp->trans[i].next_state; /* * If the match was an EOL anchor, adjust the pointer past the * separator that caused the match. The correct match * position has been recorded already. */ if (sym->type == _URE_EOL_ANCHOR) { /* * Skip the character that caused the match. */ sp++; /* * Handle the infamous CRLF situation. */ if (sp < ep && c == '\r' && *sp == '\n') sp++; } } } if (matched == 0) { if (stp->accepting == 0) { /* * If the last state was not accepting, then reset * and start over. */ stp = dfa->states; ms = me = ~0; } else /* * The last state was accepting, so terminate the matching * loop to avoid more work. */ found = 1; } else if (sp == ep) { if (!stp->accepting) { /* * This ugly hack is to make sure the end-of-line anchors * match when the source text hits the end. This is only done * if the last subexpression matches. */ for (i = 0; found == 0 && i < stp->ntrans; i++) { sym = dfa->syms + stp->trans[i].symbol; if (sym->type ==_URE_EOL_ANCHOR) { stp = dfa->states + stp->trans[i].next_state; if (stp->accepting) { me = sp - text; found = 1; } else break; } } } else { /* * Make sure any conditions that match all the way to the end * of the string match. */ found = 1; me = sp - text; } } } if (found == 0) ms = me = ~0; *match_start = ms; *match_end = me; return (ms != ~0UL) ? 1 : 0;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -