📄 xmlregexp.c
字号:
if (t1->to == -1) /* eliminated */
continue;
for (i = 0;i < transnr;i++) {
t2 = &(state->trans[i]);
if (t2->to == -1) /* eliminated */
continue;
if (t2->atom != NULL) {
if (t1->to == t2->to) {
if (xmlFACompareAtoms(t1->atom, t2->atom))
t2->to = -1; /* eliminated */
} else {
/* not determinist ! */
if (xmlFACompareAtoms(t1->atom, t2->atom))
ret = 0;
}
} else if (t1->to != -1) {
/*
* do the closure in case of remaining specific
* epsilon transitions like choices or all
*/
ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
t2->to, t2->atom);
if (ret == 0)
return(0);
}
}
if (ret == 0)
break;
}
if (ret == 0)
break;
}
ctxt->determinist = ret;
return(ret);
}
/************************************************************************
* *
* Routines to check input against transition atoms *
* *
************************************************************************/
/*
 * xmlRegCheckCharacterRange:
 *
 * Tests a single codepoint against one character class.
 *
 *   type:       the character class to test against
 *   codepoint:  the character being tested
 *   neg:        when non-zero the outcome of the class test is inverted
 *   start, end: inclusive bounds, only read for XML_REGEXP_CHARVAL
 *   blockName:  block name, only read for XML_REGEXP_BLOCK_NAME
 *
 * Returns -1 when type is not a character class (STRING, SUBREG, RANGES,
 * EPSILON). Otherwise returns the class test result, logically inverted
 * when neg is set: zero means "no match", non-zero means "match".
 */
static int
xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
                          int start, int end, const xmlChar *blockName) {
    int matched = 0;

    /*
     * Step 1: fold every complemented class onto the class that carries
     * the actual test, flipping the negation flag on the way.  Note that
     * for the REALCHAR pair the test is written for NOTREALCHAR.
     */
    switch (type) {
        case XML_REGEXP_NOTSPACE:
            type = XML_REGEXP_ANYSPACE;
            neg = !neg;
            break;
        case XML_REGEXP_NOTINITNAME:
            type = XML_REGEXP_INITNAME;
            neg = !neg;
            break;
        case XML_REGEXP_NOTNAMECHAR:
            type = XML_REGEXP_NAMECHAR;
            neg = !neg;
            break;
        case XML_REGEXP_NOTDECIMAL:
            type = XML_REGEXP_DECIMAL;
            neg = !neg;
            break;
        case XML_REGEXP_REALCHAR:
            type = XML_REGEXP_NOTREALCHAR;
            neg = !neg;
            break;
        default:
            break;
    }

    /*
     * Step 2: evaluate the (now positive) class.
     */
    switch (type) {
        /* Not character classes: nothing to test against. */
        case XML_REGEXP_STRING:
        case XML_REGEXP_SUBREG:
        case XML_REGEXP_RANGES:
        case XML_REGEXP_EPSILON:
            return(-1);

        case XML_REGEXP_ANYCHAR:
            /* anything except line feed and carriage return */
            matched = !((codepoint == '\n') || (codepoint == '\r'));
            break;
        case XML_REGEXP_CHARVAL:
            matched = ((start <= codepoint) && (codepoint <= end));
            break;
        case XML_REGEXP_ANYSPACE:
            switch (codepoint) {
                case '\n':
                case '\r':
                case '\t':
                case ' ':
                    matched = 1;
                    break;
                default:
                    matched = 0;
                    break;
            }
            break;
        case XML_REGEXP_INITNAME:
            matched = (IS_LETTER(codepoint) ||
                       (codepoint == '_') || (codepoint == ':'));
            break;
        case XML_REGEXP_NAMECHAR:
            matched = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
                       (codepoint == '.') || (codepoint == '-') ||
                       (codepoint == '_') || (codepoint == ':') ||
                       IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
            break;
        case XML_REGEXP_DECIMAL:
            matched = xmlUCSIsCatNd(codepoint);
            break;
        case XML_REGEXP_NOTREALCHAR:
            /* first non-zero answer among the P, Z and C categories */
            matched = xmlUCSIsCatP(codepoint);
            if (matched == 0) {
                matched = xmlUCSIsCatZ(codepoint);
                if (matched == 0)
                    matched = xmlUCSIsCatC(codepoint);
            }
            break;

        /* Letters */
        case XML_REGEXP_LETTER:
            matched = xmlUCSIsCatL(codepoint);
            break;
        case XML_REGEXP_LETTER_UPPERCASE:
            matched = xmlUCSIsCatLu(codepoint);
            break;
        case XML_REGEXP_LETTER_LOWERCASE:
            matched = xmlUCSIsCatLl(codepoint);
            break;
        case XML_REGEXP_LETTER_TITLECASE:
            matched = xmlUCSIsCatLt(codepoint);
            break;
        case XML_REGEXP_LETTER_MODIFIER:
            matched = xmlUCSIsCatLm(codepoint);
            break;
        case XML_REGEXP_LETTER_OTHERS:
            matched = xmlUCSIsCatLo(codepoint);
            break;

        /* Marks */
        case XML_REGEXP_MARK:
            matched = xmlUCSIsCatM(codepoint);
            break;
        case XML_REGEXP_MARK_NONSPACING:
            matched = xmlUCSIsCatMn(codepoint);
            break;
        case XML_REGEXP_MARK_SPACECOMBINING:
            matched = xmlUCSIsCatMc(codepoint);
            break;
        case XML_REGEXP_MARK_ENCLOSING:
            matched = xmlUCSIsCatMe(codepoint);
            break;

        /* Numbers */
        case XML_REGEXP_NUMBER:
            matched = xmlUCSIsCatN(codepoint);
            break;
        case XML_REGEXP_NUMBER_DECIMAL:
            matched = xmlUCSIsCatNd(codepoint);
            break;
        case XML_REGEXP_NUMBER_LETTER:
            matched = xmlUCSIsCatNl(codepoint);
            break;
        case XML_REGEXP_NUMBER_OTHERS:
            matched = xmlUCSIsCatNo(codepoint);
            break;

        /* Punctuation */
        case XML_REGEXP_PUNCT:
            matched = xmlUCSIsCatP(codepoint);
            break;
        case XML_REGEXP_PUNCT_CONNECTOR:
            matched = xmlUCSIsCatPc(codepoint);
            break;
        case XML_REGEXP_PUNCT_DASH:
            matched = xmlUCSIsCatPd(codepoint);
            break;
        case XML_REGEXP_PUNCT_OPEN:
            matched = xmlUCSIsCatPs(codepoint);
            break;
        case XML_REGEXP_PUNCT_CLOSE:
            matched = xmlUCSIsCatPe(codepoint);
            break;
        case XML_REGEXP_PUNCT_INITQUOTE:
            matched = xmlUCSIsCatPi(codepoint);
            break;
        case XML_REGEXP_PUNCT_FINQUOTE:
            matched = xmlUCSIsCatPf(codepoint);
            break;
        case XML_REGEXP_PUNCT_OTHERS:
            matched = xmlUCSIsCatPo(codepoint);
            break;

        /* Separators */
        case XML_REGEXP_SEPAR:
            matched = xmlUCSIsCatZ(codepoint);
            break;
        case XML_REGEXP_SEPAR_SPACE:
            matched = xmlUCSIsCatZs(codepoint);
            break;
        case XML_REGEXP_SEPAR_LINE:
            matched = xmlUCSIsCatZl(codepoint);
            break;
        case XML_REGEXP_SEPAR_PARA:
            matched = xmlUCSIsCatZp(codepoint);
            break;

        /* Symbols */
        case XML_REGEXP_SYMBOL:
            matched = xmlUCSIsCatS(codepoint);
            break;
        case XML_REGEXP_SYMBOL_MATH:
            matched = xmlUCSIsCatSm(codepoint);
            break;
        case XML_REGEXP_SYMBOL_CURRENCY:
            matched = xmlUCSIsCatSc(codepoint);
            break;
        case XML_REGEXP_SYMBOL_MODIFIER:
            matched = xmlUCSIsCatSk(codepoint);
            break;
        case XML_REGEXP_SYMBOL_OTHERS:
            matched = xmlUCSIsCatSo(codepoint);
            break;

        /* Others */
        case XML_REGEXP_OTHER:
            matched = xmlUCSIsCatC(codepoint);
            break;
        case XML_REGEXP_OTHER_CONTROL:
            matched = xmlUCSIsCatCc(codepoint);
            break;
        case XML_REGEXP_OTHER_FORMAT:
            matched = xmlUCSIsCatCf(codepoint);
            break;
        case XML_REGEXP_OTHER_PRIVATE:
            matched = xmlUCSIsCatCo(codepoint);
            break;
        case XML_REGEXP_OTHER_NA:
            /* matched = xmlUCSIsCatCn(codepoint); */
            /* Seems it doesn't exist anymore in recent Unicode releases */
            matched = 0;
            break;

        case XML_REGEXP_BLOCK_NAME:
            matched = xmlUCSIsBlock(codepoint, (const char *) blockName);
            break;

        default:
            /*
             * Complemented classes were rewritten in step 1 and cannot
             * show up here; any other value leaves matched at 0.
             */
            break;
    }

    return(neg ? !matched : matched);
}
/*
 * xmlRegCheckCharacter:
 *
 * Checks whether a codepoint is accepted by an atom.
 *
 *   atom:      the atom to test against
 *   codepoint: the character being tested
 *
 * Returns -1 when atom is NULL, when the codepoint fails IS_CHAR(), or
 * when the atom type cannot be matched against a character (SUBREG,
 * EPSILON, and STRING which is not implemented).  Otherwise returns
 * zero for "rejected" and non-zero for "accepted".
 */
static int
xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
    int idx, verdict = 0;
    xmlRegRangePtr cur;

    if (atom == NULL)
        return(-1);
    if (!IS_CHAR(codepoint))
        return(-1);

    switch (atom->type) {
        case XML_REGEXP_SUBREG:
        case XML_REGEXP_EPSILON:
            return(-1);

        case XML_REGEXP_CHARVAL:
            return(atom->codepoint == codepoint);

        case XML_REGEXP_RANGES: {
            /*
             * Walk every range of the atom.  How a hit is interpreted
             * depends on the range's neg field:
             *   neg == 2       : a hit excludes the character outright
             *   other non-zero : a hit rejects, a miss accepts
             *   neg == 0       : a hit accepts, but a later range may
             *                    still exclude the character
             * Any non-zero result from the range check counts as a hit.
             */
            int accept = 0;

            for (idx = 0; idx < atom->nbRanges; idx++) {
                cur = atom->ranges[idx];
                verdict = xmlRegCheckCharacterRange(cur->type, codepoint,
                                                    0, cur->start, cur->end,
                                                    cur->blockName);
                if (cur->neg == 2) {
                    if (verdict != 0)
                        return(0); /* excluded char */
                } else if (cur->neg) {
                    if (verdict != 0)
                        return(0);
                    accept = 1;
                } else if (verdict != 0) {
                    accept = 1; /* might still be excluded */
                }
            }
            return(accept);
        }

        case XML_REGEXP_STRING:
            printf("TODO: XML_REGEXP_STRING\n");
            return(-1);

        /*
         * Plain character classes: delegate to the class checker and
         * apply the atom's own negation afterwards.
         */
        case XML_REGEXP_ANYCHAR:
        case XML_REGEXP_ANYSPACE:
        case XML_REGEXP_NOTSPACE:
        case XML_REGEXP_INITNAME:
        case XML_REGEXP_NOTINITNAME:
        case XML_REGEXP_NAMECHAR:
        case XML_REGEXP_NOTNAMECHAR:
        case XML_REGEXP_DECIMAL:
        case XML_REGEXP_NOTDECIMAL:
        case XML_REGEXP_REALCHAR:
        case XML_REGEXP_NOTREALCHAR:
        case XML_REGEXP_LETTER:
        case XML_REGEXP_LETTER_UPPERCASE:
        case XML_REGEXP_LETTER_LOWERCASE:
        case XML_REGEXP_LETTER_TITLECASE:
        case XML_REGEXP_LETTER_MODIFIER:
        case XML_REGEXP_LETTER_OTHERS:
        case XML_REGEXP_MARK:
        case XML_REGEXP_MARK_NONSPACING:
        case XML_REGEXP_MARK_SPACECOMBINING:
        case XML_REGEXP_MARK_ENCLOSING:
        case XML_REGEXP_NUMBER:
        case XML_REGEXP_NUMBER_DECIMAL:
        case XML_REGEXP_NUMBER_LETTER:
        case XML_REGEXP_NUMBER_OTHERS:
        case XML_REGEXP_PUNCT:
        case XML_REGEXP_PUNCT_CONNECTOR:
        case XML_REGEXP_PUNCT_DASH:
        case XML_REGEXP_PUNCT_OPEN:
        case XML_REGEXP_PUNCT_CLOSE:
        case XML_REGEXP_PUNCT_INITQUOTE:
        case XML_REGEXP_PUNCT_FINQUOTE:
        case XML_REGEXP_PUNCT_OTHERS:
        case XML_REGEXP_SEPAR:
        case XML_REGEXP_SEPAR_SPACE:
        case XML_REGEXP_SEPAR_LINE:
        case XML_REGEXP_SEPAR_PARA:
        case XML_REGEXP_SYMBOL:
        case XML_REGEXP_SYMBOL_MATH:
        case XML_REGEXP_SYMBOL_CURRENCY:
        case XML_REGEXP_SYMBOL_MODIFIER:
        case XML_REGEXP_SYMBOL_OTHERS:
        case XML_REGEXP_OTHER:
        case XML_REGEXP_OTHER_CONTROL:
        case XML_REGEXP_OTHER_FORMAT:
        case XML_REGEXP_OTHER_PRIVATE:
        case XML_REGEXP_OTHER_NA:
        case XML_REGEXP_BLOCK_NAME:
            verdict = xmlRegCheckCharacterRange(atom->type, codepoint,
                                                0, 0, 0,
                                                (const xmlChar *)atom->valuep);
            if (atom->neg)
                verdict = !verdict;
            break;
    }

    return(verdict);
}
/************************************************************************
* *
* Saving and restoring state of an execution context *
* *
************************************************************************/
#ifdef DEBUG_REGEXP_EXEC
/*
 * xmlFARegDebugExec:
 *
 * Debug helper, only built when DEBUG_REGEXP_EXEC is defined.  Prints one
 * line on stdout with the current state number, transition number and
 * input index, followed by either up to three entries taken from the top
 * of the input stack (most recent first) or, when there is no input
 * stack, the input string from the current index onwards.
 */
static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
    printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);

    if (exec->inputStack == NULL) {
        printf(": %s", &(exec->inputString[exec->index]));
    } else {
        int depth = 0;

        printf(": ");
        /* depth counts entries printed so far, walking down from the top */
        while ((depth < 3) && (depth < exec->inputStackNr)) {
            depth++;
            printf("%s ", exec->inputStack[exec->inputStackNr - depth]);
        }
    }

    printf("\n");
}
#endif
/*
 * xmlFARegExecSave:
 *
 * Pushes the current execution position onto the rollback stack so that
 * xmlFARegExecRollBack() can later resume from it.  The saved record holds
 * the current state, the input index, the number of the next transition to
 * try (transno + 1) and, when the regexp has counters, a copy of them.
 *
 * The stack starts with room for 4 records and doubles when it is full;
 * newly obtained records are zeroed so that their counts pointer is NULL
 * until a counter buffer is allocated for them.  Counter buffers are kept
 * in the record and reused by later saves into the same slot.
 *
 * On any allocation failure the error is reported, exec->status is set to
 * -5 and nothing is pushed.  Flagging the stack allocation failures too
 * (and not only the counter buffer failure) keeps a lost backtrack point
 * from going unnoticed by code that inspects exec->status.
 */
static void
xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
    xmlRegExecRollback *slot;

#ifdef DEBUG_REGEXP_EXEC
    printf("saving ");
    exec->transno++;
    xmlFARegDebugExec(exec);
    exec->transno--;
#endif

    if (exec->maxRollbacks == 0) {
        /* first save: create the stack */
        exec->maxRollbacks = 4;
        exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
                                             sizeof(xmlRegExecRollback));
        if (exec->rollbacks == NULL) {
            xmlRegexpErrMemory(NULL, "saving regexp");
            exec->maxRollbacks = 0;
            exec->status = -5;
            return;
        }
        memset(exec->rollbacks, 0,
               exec->maxRollbacks * sizeof(xmlRegExecRollback));
    } else if (exec->nbRollbacks >= exec->maxRollbacks) {
        /* stack full: double it, keeping the old array if that fails */
        xmlRegExecRollback *tmp;
        int len = exec->maxRollbacks;

        exec->maxRollbacks *= 2;
        tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
                        exec->maxRollbacks * sizeof(xmlRegExecRollback));
        if (tmp == NULL) {
            xmlRegexpErrMemory(NULL, "saving regexp");
            exec->maxRollbacks /= 2;
            exec->status = -5;
            return;
        }
        exec->rollbacks = tmp;
        /* zero only the freshly added upper half */
        tmp = &exec->rollbacks[len];
        memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
    }

    slot = &exec->rollbacks[exec->nbRollbacks];
    slot->state = exec->state;
    slot->index = exec->index;
    slot->nextbranch = exec->transno + 1;
    if (exec->comp->nbCounters > 0) {
        if (slot->counts == NULL) {
            slot->counts = (int *)
                xmlMalloc(exec->comp->nbCounters * sizeof(int));
            if (slot->counts == NULL) {
                xmlRegexpErrMemory(NULL, "saving regexp");
                exec->status = -5;
                return;
            }
        }
        memcpy(slot->counts, exec->counts,
               exec->comp->nbCounters * sizeof(int));
    }
    exec->nbRollbacks++;
}
/*
 * xmlFARegExecRollBack:
 *
 * Pops the most recent record from the rollback stack and restores the
 * execution context from it: state, input index, the transition number to
 * resume at and, when the regexp has counters, the saved counter values.
 *
 * If the stack is empty exec->status is set to -1.  If counters are in use
 * but the popped record has no counter buffer, a message is written to
 * stderr and exec->status is set to -6; the record stays popped and the
 * state, index and transition number have already been restored.
 */
static void
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
    int slot;

    if (exec->nbRollbacks <= 0) {
        exec->status = -1;
#ifdef DEBUG_REGEXP_EXEC
        printf("rollback failed on empty stack\n");
#endif
        return;
    }

    /* pop: the top record lives at the decremented count */
    slot = --exec->nbRollbacks;

    exec->state = exec->rollbacks[slot].state;
    exec->index = exec->rollbacks[slot].index;
    exec->transno = exec->rollbacks[slot].nextbranch;

    if (exec->comp->nbCounters > 0) {
        if (exec->rollbacks[slot].counts == NULL) {
            fprintf(stderr, "exec save: allocation failed");
            exec->status = -6;
            return;
        }
        memcpy(exec->counts, exec->rollbacks[slot].counts,
               exec->comp->nbCounters * sizeof(int));
    }

#ifdef DEBUG_REGEXP_EXEC
    printf("restored ");
    xmlFARegDebugExec(exec);
#endif
}
/************************************************************************
* *
* Verifier, running an input against a compiled regexp *
* *
************************************************************************/
static int
xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
xmlRegExecCtxt execval;
xmlRegExecCtxtPtr exec = &execval;
int ret, codepoint = 0, len;
exec->inputString = content;
exec->index = 0;
exec->determinist = 1;
exec->maxRollbacks = 0;
exec->nbRollbacks = 0;
exec->rollbacks = NULL;
exec->status = 0;
exec->comp = comp;
exec->state = comp->states[0];
exec->transno = 0;
exec->transcount = 0;
exec->inputStack = NULL;
exec->inputStackMax = 0;
if (comp->nbCounters > 0) {
exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
if (exec->counts == NULL) {
xmlRegexpErrMemory(NULL, "running regexp");
return(-1);
}
memset(exec->counts, 0, comp->nbCo
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -