📄 xmlregexp.c
字号:
transnr, statenr);#endif } else if (state->trans[transnr].count < 0) { int newto = state->trans[transnr].to;#ifdef DEBUG_REGEXP_GRAPH printf("Found epsilon trans %d from %d to %d\n", transnr, statenr, newto);#endif has_epsilon = 1; state->trans[transnr].to = -2; state->mark = XML_REGEXP_MARK_START; xmlFAReduceEpsilonTransitions(ctxt, statenr, newto, state->trans[transnr].counter); state->mark = XML_REGEXP_MARK_NORMAL;#ifdef DEBUG_REGEXP_GRAPH } else { printf("Found counted transition %d on %d\n", transnr, statenr);#endif } } } } /* * Eliminate the epsilon transitions */ if (has_epsilon) { for (statenr = 0;statenr < ctxt->nbStates;statenr++) { state = ctxt->states[statenr]; if (state == NULL) continue; for (transnr = 0;transnr < state->nbTrans;transnr++) { xmlRegTransPtr trans = &(state->trans[transnr]); if ((trans->atom == NULL) && (trans->count < 0) && (trans->to >= 0)) { trans->to = -1; } } } } /* * Use this pass to detect unreachable states too */ for (statenr = 0;statenr < ctxt->nbStates;statenr++) { state = ctxt->states[statenr]; if (state != NULL) state->reached = XML_REGEXP_MARK_NORMAL; } state = ctxt->states[0]; if (state != NULL) state->reached = XML_REGEXP_MARK_START; while (state != NULL) { xmlRegStatePtr target = NULL; state->reached = XML_REGEXP_MARK_VISITED; /* * Mark all states reachable from the current reachable state */ for (transnr = 0;transnr < state->nbTrans;transnr++) { if ((state->trans[transnr].to >= 0) && ((state->trans[transnr].atom != NULL) || (state->trans[transnr].count >= 0))) { int newto = state->trans[transnr].to; if (ctxt->states[newto] == NULL) continue; if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) { ctxt->states[newto]->reached = XML_REGEXP_MARK_START; target = ctxt->states[newto]; } } } /* * find the next accessible state not explored */ if (target == NULL) { for (statenr = 1;statenr < ctxt->nbStates;statenr++) { state = ctxt->states[statenr]; if ((state != NULL) && (state->reached == XML_REGEXP_MARK_START)) { target = state; break; } } } state = target; } for (statenr = 0;statenr < ctxt->nbStates;statenr++) { state = ctxt->states[statenr]; if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {#ifdef DEBUG_REGEXP_GRAPH printf("Removed unreachable state %d\n", statenr);#endif xmlRegFreeState(state); ctxt->states[statenr] = NULL; } }}static intxmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { int ret = 0; if ((range1->type == XML_REGEXP_RANGES) || (range2->type == XML_REGEXP_RANGES) || (range2->type == XML_REGEXP_SUBREG) || (range1->type == XML_REGEXP_SUBREG) || (range1->type == XML_REGEXP_STRING) || (range2->type == XML_REGEXP_STRING)) return(-1); /* put them in order */ if (range1->type > range2->type) { xmlRegRangePtr tmp; tmp = range1; range1 = range2; range2 = tmp; } if ((range1->type == XML_REGEXP_ANYCHAR) || (range2->type == XML_REGEXP_ANYCHAR)) { ret = 1; } else if ((range1->type == XML_REGEXP_EPSILON) || (range2->type == XML_REGEXP_EPSILON)) { return(0); } else if (range1->type == range2->type) { if ((range1->type != XML_REGEXP_CHARVAL) || (range1->end < range2->start) || (range2->end < range1->start)) ret = 1; else ret = 0; } else if (range1->type == XML_REGEXP_CHARVAL) { int codepoint; int neg = 0; /* * just check all codepoints in the range for acceptance, * this is usually way cheaper since done only once at * compilation than testing over and over at runtime or * pushing too many states when evaluating. */ if (((range1->neg == 0) && (range2->neg != 0)) || ((range1->neg != 0) && (range2->neg == 0))) neg = 1; for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) { ret = xmlRegCheckCharacterRange(range2->type, codepoint, 0, range2->start, range2->end, range2->blockName); if (ret < 0) return(-1); if (((neg == 1) && (ret == 0)) || ((neg == 0) && (ret == 1))) return(1); } return(0); } else if ((range1->type == XML_REGEXP_BLOCK_NAME) || (range2->type == XML_REGEXP_BLOCK_NAME)) { if (range1->type == range2->type) { ret = xmlStrEqual(range1->blockName, range2->blockName); } else { /* * comparing a block range with anything else is way * too costly, and maintining the table is like too much * memory too, so let's force the automata to save state * here. */ return(1); } } else if ((range1->type < XML_REGEXP_LETTER) || (range2->type < XML_REGEXP_LETTER)) { if ((range1->type == XML_REGEXP_ANYSPACE) && (range2->type == XML_REGEXP_NOTSPACE)) ret = 0; else if ((range1->type == XML_REGEXP_INITNAME) && (range2->type == XML_REGEXP_NOTINITNAME)) ret = 0; else if ((range1->type == XML_REGEXP_NAMECHAR) && (range2->type == XML_REGEXP_NOTNAMECHAR)) ret = 0; else if ((range1->type == XML_REGEXP_DECIMAL) && (range2->type == XML_REGEXP_NOTDECIMAL)) ret = 0; else if ((range1->type == XML_REGEXP_REALCHAR) && (range2->type == XML_REGEXP_NOTREALCHAR)) ret = 0; else { /* same thing to limit complexity */ return(1); } } else { ret = 0; /* range1->type < range2->type here */ switch (range1->type) { case XML_REGEXP_LETTER: /* all disjoint except in the subgroups */ if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) || (range2->type == XML_REGEXP_LETTER_LOWERCASE) || (range2->type == XML_REGEXP_LETTER_TITLECASE) || (range2->type == XML_REGEXP_LETTER_MODIFIER) || (range2->type == XML_REGEXP_LETTER_OTHERS)) ret = 1; break; case XML_REGEXP_MARK: if ((range2->type == XML_REGEXP_MARK_NONSPACING) || (range2->type == XML_REGEXP_MARK_SPACECOMBINING) || (range2->type == XML_REGEXP_MARK_ENCLOSING)) ret = 1; break; case XML_REGEXP_NUMBER: if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) || (range2->type == XML_REGEXP_NUMBER_LETTER) || (range2->type == XML_REGEXP_NUMBER_OTHERS)) ret = 1; break; case XML_REGEXP_PUNCT: if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) || (range2->type == XML_REGEXP_PUNCT_DASH) || (range2->type == XML_REGEXP_PUNCT_OPEN) || (range2->type == XML_REGEXP_PUNCT_CLOSE) || (range2->type == XML_REGEXP_PUNCT_INITQUOTE) || (range2->type == XML_REGEXP_PUNCT_FINQUOTE) || (range2->type == XML_REGEXP_PUNCT_OTHERS)) ret = 1; break; case XML_REGEXP_SEPAR: if ((range2->type == XML_REGEXP_SEPAR_SPACE) || (range2->type == XML_REGEXP_SEPAR_LINE) || (range2->type == XML_REGEXP_SEPAR_PARA)) ret = 1; break; case XML_REGEXP_SYMBOL: if ((range2->type == XML_REGEXP_SYMBOL_MATH) || (range2->type == XML_REGEXP_SYMBOL_CURRENCY) || (range2->type == XML_REGEXP_SYMBOL_MODIFIER) || (range2->type == XML_REGEXP_SYMBOL_OTHERS)) ret = 1; break; case XML_REGEXP_OTHER: if ((range2->type == XML_REGEXP_OTHER_CONTROL) || (range2->type == XML_REGEXP_OTHER_FORMAT) || (range2->type == XML_REGEXP_OTHER_PRIVATE)) ret = 1; break; default: if ((range2->type >= XML_REGEXP_LETTER) && (range2->type < XML_REGEXP_BLOCK_NAME)) ret = 0; else { /* safety net ! */ return(1); } } } if (((range1->neg == 0) && (range2->neg != 0)) || ((range1->neg != 0) && (range2->neg == 0))) ret = !ret; return(1);}/** * xmlFACompareAtomTypes: * @type1: an atom type * @type2: an atom type * * Compares two atoms type to check whether they intersect in some ways, * this is used by xmlFACompareAtoms only * * Returns 1 if they may intersect and 0 otherwise */static intxmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { if ((type1 == XML_REGEXP_EPSILON) || (type1 == XML_REGEXP_CHARVAL) || (type1 == XML_REGEXP_RANGES) || (type1 == XML_REGEXP_SUBREG) || (type1 == XML_REGEXP_STRING) || (type1 == XML_REGEXP_ANYCHAR)) return(1); if ((type2 == XML_REGEXP_EPSILON) || (type2 == XML_REGEXP_CHARVAL) || (type2 == XML_REGEXP_RANGES) || (type2 == XML_REGEXP_SUBREG) || (type2 == XML_REGEXP_STRING) || (type2 == XML_REGEXP_ANYCHAR)) return(1); if (type1 == type2) return(1); /* simplify subsequent compares by making sure type1 < type2 */ if (type1 > type2) { xmlRegAtomType tmp = type1; type1 = type2; type2 = tmp; } switch (type1) { case XML_REGEXP_ANYSPACE: /* \s */ /* can't be a letter, number, mark, pontuation, symbol */ if ((type2 == XML_REGEXP_NOTSPACE) || ((type2 >= XML_REGEXP_LETTER) && (type2 <= XML_REGEXP_LETTER_OTHERS)) || ((type2 >= XML_REGEXP_NUMBER) && (type2 <= XML_REGEXP_NUMBER_OTHERS)) || ((type2 >= XML_REGEXP_MARK) && (type2 <= XML_REGEXP_MARK_ENCLOSING)) || ((type2 >= XML_REGEXP_PUNCT) && (type2 <= XML_REGEXP_PUNCT_OTHERS)) || ((type2 >= XML_REGEXP_SYMBOL) && (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ) return(0); break; case XML_REGEXP_NOTSPACE: /* \S */ break; case XML_REGEXP_INITNAME: /* \l */ /* can't be a number, mark, separator, pontuation, symbol or other */ if ((type2 == XML_REGEXP_NOTINITNAME) || ((type2 >= XML_REGEXP_NUMBER) && (type2 <= XML_REGEXP_NUMBER_OTHERS)) || ((type2 >= XML_REGEXP_MARK) && (type2 <= XML_REGEXP_MARK_ENCLOSING)) || ((type2 >= XML_REGEXP_SEPAR) && (type2 <= XML_REGEXP_SEPAR_PARA)) || ((type2 >= XML_REGEXP_PUNCT) && (type2 <= XML_REGEXP_PUNCT_OTHERS)) || ((type2 >= XML_REGEXP_SYMBOL) && (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || ((type2 >= XML_REGEXP_OTHER) && (type2 <= XML_REGEXP_OTHER_NA)) ) return(0); break; case XML_REGEXP_NOTINITNAME: /* \L */ break; case XML_REGEXP_NAMECHAR: /* \c */ /* can't be a mark, separator, pontuation, symbol or other */ if ((type2 == XML_REGEXP_NOTNAMECHAR) || ((type2 >= XML_REGEXP_MARK) && (type2 <= XML_REGEXP_MARK_ENCLOSING)) || ((type2 >= XML_REGEXP_PUNCT) && (type2 <= XML_REGEXP_PUNCT_OTHERS)) || ((type2 >= XML_REGEXP_SEPAR) && (type2 <= XML_REGEXP_SEPAR_PARA)) || ((type2 >= XML_REGEXP_SYMBOL) && (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || ((type2 >= XML_REGEXP_OTHER) && (type2 <= XML_REGEXP_OTHER_NA)) ) return(0); break; case XML_REGEXP_NOTNAMECHAR: /* \C */ break; case XML_REGEXP_DECIMAL: /* \d */ /* can't be a letter, mark, separator, pontuation, symbol or other */ if ((type2 == XML_REGEXP_NOTDECIMAL) || (type2 == XML_REGEXP_REALCHAR) || ((type2 >= XML_REGEXP_LETTER) && (type2 <= XML_REGEXP_LETTER_OTHERS)) || ((type2 >= XML_REGEXP_MARK) && (type2 <= XML_REGEXP_MARK_ENCLOSING)) || ((type2 >= XML_REGEXP_PUNCT) && (type2 <= XML_REGEXP_PUNCT_OTHERS)) || ((type2 >= XML_REGEXP_SEPAR) && (type2 <= XML_REGEXP_SEPAR_PARA)) || ((type2 >= XML_REGEXP_SYMBOL) && (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || ((type2 >= XML_REGEXP_OTHER) && (type2 <= XML_REGEXP_OTHER_NA)) )return(0); break; case XML_REGEXP_NOTDECIMAL: /* \D */ break; case XML_REGEXP_REALCHAR: /* \w */ /* can't be a mark, separator, pontuation, symbol or other */ if ((type2 == XML_REGEXP_NOTDECIMAL) || ((type2 >= XML_REGEXP_MARK) && (type2 <= XML_REGEXP_MARK_ENCLOSING)) || ((type2 >= XML_REGEXP_PUNCT) && (type2 <= XML_REGEXP_PUNCT_OTHERS)) || ((type2 >= XML_REGEXP_SEPAR) && (type2 <= XML_REGEXP_SEPAR_PARA)) || ((type2 >= XML_REGEXP_SYMBOL) && (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || ((type2 >= XML_REGEXP_OTHER) && (type2 <= XML_REGEXP_OTHER_NA)) )return(0); break; case XML_REGEXP_NOTREALCHAR: /* \W */ break; /* * at that point we know both type 1 and type2 are from * character categories are ordered and are different, * it becomes simple because this is a partition */ case XML_REGEXP_LETTER: if (type2 <= XML_REGEXP_LETTER_OTHERS) return(1); return(0); case XML_REGEXP_LETTER_UPPERCASE: case XML_REGEXP_LETTER_LOWERCASE: case XML_REGEXP_LETTER_TITLECASE: case XML_REGEXP_LETTER_MODIFIER: case XML_REGEXP_LETTER_OTHERS: return(0); case XML_REGEXP_MARK: if (type2 <= XML_REGEXP_MARK_ENCLOSING) return(1); return(0); case XML_REGEXP_MARK_NONSPACING: case XML_REGEXP_MARK_SPACECOMBINING: case XML_REGEXP_MARK_ENCLOSING: return(0); case XML_REGEXP_NUMBER: if (type2 <= XML_REGEXP_NUMBER_OTHERS) return(1); return(0); case XML_REGEXP_NUMBER_DECIMAL: case XML_REGEXP_NUMBER_LETTER: case XML_REGEXP_NUMBER_OTHERS: return(0); case XML_REGEXP_PUNCT: if (type2 <= XML_REGEXP_PUNCT_OTHERS) return(1); return(0); case XML_REGEXP_PUNCT_CONNECTOR: case XML_REGEXP_PUNCT_DASH: case XML_REGEXP_PUNCT_OPEN: case XML_REGEXP_PUNCT_CLOSE: case XML_REGEXP_PUNCT_INITQUOTE: case XML_REGEXP_PUNCT_FINQUOTE: case XML_REGEXP_PUNCT_OTHERS: return(0); case XML_REGEXP_SEPAR: if (type2 <= XML_REGEXP_SEPAR_PARA) return(1); return(0); case XML_REGEXP_SEPAR_SPACE: case XML_REGEXP_SEPAR_LINE: case XML_REGEXP_SEPAR_PARA: return(0); case XML_REGEXP_SYMBOL: if (type2 <= XML_REGEXP_SYMBOL_OTHERS) return(1); return(0); case XML_REGEXP_SYMBOL_MATH: case XML_REGEXP_SYMBOL_CURRENCY: case XML_REGEXP_SYMBOL_MODIFIER: case XML_REGEXP_SYMBOL_OTHERS: return(0); case XML_REGEXP_OTHER: if (type2 <= XML_REGEXP_OTHER_NA) return(1); return(0);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -