📄 xmlregexp.c

📁 xml开源解析代码.版本为libxml2-2.6.29,可支持GB2312.网络消息发送XML时很有用.
💻 C
📖 第 1 页 / 共 5 页
字号:
上一页 1 2 3 45
			   transnr, statenr);#endif		} else if (state->trans[transnr].count < 0) {		    int newto = state->trans[transnr].to;#ifdef DEBUG_REGEXP_GRAPH		    printf("Found epsilon trans %d from %d to %d\n",			   transnr, statenr, newto);#endif		    has_epsilon = 1;		    state->trans[transnr].to = -2;		    state->mark = XML_REGEXP_MARK_START;		    xmlFAReduceEpsilonTransitions(ctxt, statenr,				      newto, state->trans[transnr].counter);		    state->mark = XML_REGEXP_MARK_NORMAL;#ifdef DEBUG_REGEXP_GRAPH		} else {		    printf("Found counted transition %d on %d\n",			   transnr, statenr);#endif	        }	    }	}    }    /*     * Eliminate the epsilon transitions     */    if (has_epsilon) {	for (statenr = 0;statenr < ctxt->nbStates;statenr++) {	    state = ctxt->states[statenr];	    if (state == NULL)		continue;	    for (transnr = 0;transnr < state->nbTrans;transnr++) {		xmlRegTransPtr trans = &(state->trans[transnr]);		if ((trans->atom == NULL) &&		    (trans->count < 0) &&		    (trans->to >= 0)) {		    trans->to = -1;		}	    }	}    }    /*     * Use this pass to detect unreachable states too     */    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {	state = ctxt->states[statenr];	if (state != NULL)	    state->reached = XML_REGEXP_MARK_NORMAL;    }    state = ctxt->states[0];    if (state != NULL)	state->reached = XML_REGEXP_MARK_START;    while (state != NULL) {	xmlRegStatePtr target = NULL;	state->reached = XML_REGEXP_MARK_VISITED;	/*	 * Mark all states reachable from the current reachable state	 */	for (transnr = 0;transnr < state->nbTrans;transnr++) {	    if ((state->trans[transnr].to >= 0) &&		((state->trans[transnr].atom != NULL) ||		 (state->trans[transnr].count >= 0))) {		int newto = state->trans[transnr].to;		if (ctxt->states[newto] == NULL)		    continue;		if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {		    ctxt->states[newto]->reached = XML_REGEXP_MARK_START;		    target = ctxt->states[newto];		}	    }	}	/*	 * find the next accessible state 
not explored	 */	if (target == NULL) {	    for (statenr = 1;statenr < ctxt->nbStates;statenr++) {		state = ctxt->states[statenr];		if ((state != NULL) && (state->reached ==			XML_REGEXP_MARK_START)) {		    target = state;		    break;		}	    }	}	state = target;    }    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {	state = ctxt->states[statenr];	if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {#ifdef DEBUG_REGEXP_GRAPH	    printf("Removed unreachable state %d\n", statenr);#endif	    xmlRegFreeState(state);	    ctxt->states[statenr] = NULL;	}    }}static intxmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {    int ret = 0;    if ((range1->type == XML_REGEXP_RANGES) ||        (range2->type == XML_REGEXP_RANGES) ||        (range2->type == XML_REGEXP_SUBREG) ||        (range1->type == XML_REGEXP_SUBREG) ||        (range1->type == XML_REGEXP_STRING) ||        (range2->type == XML_REGEXP_STRING))	return(-1);    /* put them in order */    if (range1->type > range2->type) {        xmlRegRangePtr tmp;	tmp = range1;	range1 = range2;	range2 = tmp;    }    if ((range1->type == XML_REGEXP_ANYCHAR) ||        (range2->type == XML_REGEXP_ANYCHAR)) {	ret = 1;    } else if ((range1->type == XML_REGEXP_EPSILON) ||               (range2->type == XML_REGEXP_EPSILON)) {	return(0);    } else if (range1->type == range2->type) {        if ((range1->type != XML_REGEXP_CHARVAL) ||	    (range1->end < range2->start) ||	    (range2->end < range1->start))	    ret = 1;	else	    ret = 0;    } else if (range1->type == XML_REGEXP_CHARVAL) {        int codepoint;	int neg = 0;	/*	 * just check all codepoints in the range for acceptance,	 * this is usually way cheaper since done only once at	 * compilation than testing over and over at runtime or 	 * pushing too many states when evaluating.	 
*/	if (((range1->neg == 0) && (range2->neg != 0)) ||	    ((range1->neg != 0) && (range2->neg == 0)))	    neg = 1;	for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {	    ret = xmlRegCheckCharacterRange(range2->type, codepoint,					    0, range2->start, range2->end,					    range2->blockName);	    if (ret < 0)	        return(-1);	    if (((neg == 1) && (ret == 0)) ||	        ((neg == 0) && (ret == 1)))		return(1);	}	return(0);    } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||               (range2->type == XML_REGEXP_BLOCK_NAME)) {	if (range1->type == range2->type) {	    ret = xmlStrEqual(range1->blockName, range2->blockName);	} else {	    /*	     * comparing a block range with anything else is way	     * too costly, and maintining the table is like too much	     * memory too, so let's force the automata to save state	     * here.	     */	    return(1);	}    } else if ((range1->type < XML_REGEXP_LETTER) ||               (range2->type < XML_REGEXP_LETTER)) {	if ((range1->type == XML_REGEXP_ANYSPACE) &&	    (range2->type == XML_REGEXP_NOTSPACE))	    ret = 0;	else if ((range1->type == XML_REGEXP_INITNAME) &&	         (range2->type == XML_REGEXP_NOTINITNAME))	    ret = 0;	else if ((range1->type == XML_REGEXP_NAMECHAR) &&	         (range2->type == XML_REGEXP_NOTNAMECHAR))	    ret = 0;	else if ((range1->type == XML_REGEXP_DECIMAL) &&	         (range2->type == XML_REGEXP_NOTDECIMAL))	    ret = 0;	else if ((range1->type == XML_REGEXP_REALCHAR) &&	         (range2->type == XML_REGEXP_NOTREALCHAR))	    ret = 0;	else {	    /* same thing to limit complexity */	    return(1);	}    } else {        ret = 0;        /* range1->type < range2->type here */        switch (range1->type) {	    case XML_REGEXP_LETTER:	         /* all disjoint except in the subgroups */	         if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||		     (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||		     (range2->type == XML_REGEXP_LETTER_TITLECASE) ||		     (range2->type == 
XML_REGEXP_LETTER_MODIFIER) ||		     (range2->type == XML_REGEXP_LETTER_OTHERS))		     ret = 1;		 break;	    case XML_REGEXP_MARK:	         if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||		     (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||		     (range2->type == XML_REGEXP_MARK_ENCLOSING))		     ret = 1;		 break;	    case XML_REGEXP_NUMBER:	         if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||		     (range2->type == XML_REGEXP_NUMBER_LETTER) ||		     (range2->type == XML_REGEXP_NUMBER_OTHERS))		     ret = 1;		 break;	    case XML_REGEXP_PUNCT:	         if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||		     (range2->type == XML_REGEXP_PUNCT_DASH) ||		     (range2->type == XML_REGEXP_PUNCT_OPEN) ||		     (range2->type == XML_REGEXP_PUNCT_CLOSE) ||		     (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||		     (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||		     (range2->type == XML_REGEXP_PUNCT_OTHERS))		     ret = 1;		 break;	    case XML_REGEXP_SEPAR:	         if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||		     (range2->type == XML_REGEXP_SEPAR_LINE) ||		     (range2->type == XML_REGEXP_SEPAR_PARA))		     ret = 1;		 break;	    case XML_REGEXP_SYMBOL:	         if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||		     (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||		     (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||		     (range2->type == XML_REGEXP_SYMBOL_OTHERS))		     ret = 1;		 break;	    case XML_REGEXP_OTHER:	         if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||		     (range2->type == XML_REGEXP_OTHER_FORMAT) ||		     (range2->type == XML_REGEXP_OTHER_PRIVATE))		     ret = 1;		 break;            default:	         if ((range2->type >= XML_REGEXP_LETTER) &&		     (range2->type < XML_REGEXP_BLOCK_NAME))		     ret = 0;		 else {		     /* safety net ! 
*/		     return(1);		 }	}    }    if (((range1->neg == 0) && (range2->neg != 0)) ||        ((range1->neg != 0) && (range2->neg == 0)))	ret = !ret;    return(1);}/** * xmlFACompareAtomTypes: * @type1:  an atom type * @type2:  an atom type * * Compares two atoms type to check whether they intersect in some ways, * this is used by xmlFACompareAtoms only * * Returns 1 if they may intersect and 0 otherwise */static intxmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {    if ((type1 == XML_REGEXP_EPSILON) ||        (type1 == XML_REGEXP_CHARVAL) ||	(type1 == XML_REGEXP_RANGES) ||	(type1 == XML_REGEXP_SUBREG) ||	(type1 == XML_REGEXP_STRING) ||	(type1 == XML_REGEXP_ANYCHAR))	return(1);    if ((type2 == XML_REGEXP_EPSILON) ||        (type2 == XML_REGEXP_CHARVAL) ||	(type2 == XML_REGEXP_RANGES) ||	(type2 == XML_REGEXP_SUBREG) ||	(type2 == XML_REGEXP_STRING) ||	(type2 == XML_REGEXP_ANYCHAR))	return(1);    if (type1 == type2) return(1);    /* simplify subsequent compares by making sure type1 < type2 */    if (type1 > type2) {        xmlRegAtomType tmp = type1;	type1 = type2;	type2 = tmp;    }    switch (type1) {        case XML_REGEXP_ANYSPACE: /* \s */	    /* can't be a letter, number, mark, pontuation, symbol */	    if ((type2 == XML_REGEXP_NOTSPACE) ||		((type2 >= XML_REGEXP_LETTER) &&		 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||	        ((type2 >= XML_REGEXP_NUMBER) &&		 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||	        ((type2 >= XML_REGEXP_MARK) &&		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||	        ((type2 >= XML_REGEXP_PUNCT) &&		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||	        ((type2 >= XML_REGEXP_SYMBOL) &&		 (type2 <= XML_REGEXP_SYMBOL_OTHERS))	        ) return(0);	    break;        case XML_REGEXP_NOTSPACE: /* \S */	    break;        case XML_REGEXP_INITNAME: /* \l */	    /* can't be a number, mark, separator, pontuation, symbol or other */	    if ((type2 == XML_REGEXP_NOTINITNAME) ||	        ((type2 >= XML_REGEXP_NUMBER) &&		 (type2 <= 
XML_REGEXP_NUMBER_OTHERS)) ||	        ((type2 >= XML_REGEXP_MARK) &&		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||	        ((type2 >= XML_REGEXP_SEPAR) &&		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||	        ((type2 >= XML_REGEXP_PUNCT) &&		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||	        ((type2 >= XML_REGEXP_SYMBOL) &&		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||	        ((type2 >= XML_REGEXP_OTHER) &&		 (type2 <= XML_REGEXP_OTHER_NA))		) return(0);	    break;        case XML_REGEXP_NOTINITNAME: /* \L */	    break;        case XML_REGEXP_NAMECHAR: /* \c */	    /* can't be a mark, separator, pontuation, symbol or other */	    if ((type2 == XML_REGEXP_NOTNAMECHAR) ||	        ((type2 >= XML_REGEXP_MARK) &&		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||	        ((type2 >= XML_REGEXP_PUNCT) &&		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||	        ((type2 >= XML_REGEXP_SEPAR) &&		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||	        ((type2 >= XML_REGEXP_SYMBOL) &&		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||	        ((type2 >= XML_REGEXP_OTHER) &&		 (type2 <= XML_REGEXP_OTHER_NA))		) return(0);	    break;        case XML_REGEXP_NOTNAMECHAR: /* \C */	    break;        case XML_REGEXP_DECIMAL: /* \d */	    /* can't be a letter, mark, separator, pontuation, symbol or other */	    if ((type2 == XML_REGEXP_NOTDECIMAL) ||	        (type2 == XML_REGEXP_REALCHAR) ||		((type2 >= XML_REGEXP_LETTER) &&		 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||	        ((type2 >= XML_REGEXP_MARK) &&		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||	        ((type2 >= XML_REGEXP_PUNCT) &&		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||	        ((type2 >= XML_REGEXP_SEPAR) &&		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||	        ((type2 >= XML_REGEXP_SYMBOL) &&		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||	        ((type2 >= XML_REGEXP_OTHER) &&		 (type2 <= XML_REGEXP_OTHER_NA))		)return(0);	    break;        case XML_REGEXP_NOTDECIMAL: /* \D */	    break;        case XML_REGEXP_REALCHAR: /* \w */	    /* can't be a mark, separator, pontuation, symbol or 
other */	    if ((type2 == XML_REGEXP_NOTDECIMAL) ||	        ((type2 >= XML_REGEXP_MARK) &&		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||	        ((type2 >= XML_REGEXP_PUNCT) &&		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||	        ((type2 >= XML_REGEXP_SEPAR) &&		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||	        ((type2 >= XML_REGEXP_SYMBOL) &&		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||	        ((type2 >= XML_REGEXP_OTHER) &&		 (type2 <= XML_REGEXP_OTHER_NA))		)return(0);	    break;        case XML_REGEXP_NOTREALCHAR: /* \W */	    break;	/*	 * at that point we know both type 1 and type2 are from	 * character categories are ordered and are different,	 * it becomes simple because this is a partition	 */        case XML_REGEXP_LETTER:	    if (type2 <= XML_REGEXP_LETTER_OTHERS)	        return(1);	    return(0);        case XML_REGEXP_LETTER_UPPERCASE:        case XML_REGEXP_LETTER_LOWERCASE:        case XML_REGEXP_LETTER_TITLECASE:        case XML_REGEXP_LETTER_MODIFIER:        case XML_REGEXP_LETTER_OTHERS:	    return(0);        case XML_REGEXP_MARK:	    if (type2 <= XML_REGEXP_MARK_ENCLOSING)	        return(1);	    return(0);        case XML_REGEXP_MARK_NONSPACING:        case XML_REGEXP_MARK_SPACECOMBINING:        case XML_REGEXP_MARK_ENCLOSING:	    return(0);        case XML_REGEXP_NUMBER:	    if (type2 <= XML_REGEXP_NUMBER_OTHERS)	        return(1);	    return(0);        case XML_REGEXP_NUMBER_DECIMAL:        case XML_REGEXP_NUMBER_LETTER:        case XML_REGEXP_NUMBER_OTHERS:	    return(0);        case XML_REGEXP_PUNCT:	    if (type2 <= XML_REGEXP_PUNCT_OTHERS)	        return(1);	    return(0);        case XML_REGEXP_PUNCT_CONNECTOR:        case XML_REGEXP_PUNCT_DASH:        case XML_REGEXP_PUNCT_OPEN:        case XML_REGEXP_PUNCT_CLOSE:        case XML_REGEXP_PUNCT_INITQUOTE:        case XML_REGEXP_PUNCT_FINQUOTE:        case XML_REGEXP_PUNCT_OTHERS:	    return(0);        case XML_REGEXP_SEPAR:	    if (type2 <= XML_REGEXP_SEPAR_PARA)	        return(1);	    return(0);        
case XML_REGEXP_SEPAR_SPACE:        case XML_REGEXP_SEPAR_LINE:        case XML_REGEXP_SEPAR_PARA:	    return(0);        case XML_REGEXP_SYMBOL:	    if (type2 <= XML_REGEXP_SYMBOL_OTHERS)	        return(1);	    return(0);        case XML_REGEXP_SYMBOL_MATH:        case XML_REGEXP_SYMBOL_CURRENCY:        case XML_REGEXP_SYMBOL_MODIFIER:        case XML_REGEXP_SYMBOL_OTHERS:	    return(0);        case XML_REGEXP_OTHER:	    if (type2 <= XML_REGEXP_OTHER_NA)	        return(1);	    return(0);
上一页 1 2 3 45
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -