📄 xmlregexp.c
字号:
/*
* regexp.c: generic and extensible Regular Expression engine
*
* Basically designed with the purpose of compiling regexps for
* the variety of validation/shemas mechanisms now available in
* XML related specifications these include:
* - XML-1.0 DTD validation
* - XML Schemas structure part 1
* - XML Schemas Datatypes part 2 especially Appendix F
* - RELAX-NG/TREX i.e. the counter proposal
*
* See Copyright for the status of this software.
*
* Daniel Veillard <veillard@redhat.com>
*/
#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_REGEXP_ENABLED
/* #define DEBUG_ERR */
#include <stdio.h>
#include <string.h>
#ifdef HAVE_LIMITS_H
#include <limits.h>
#endif
#include <libxml/tree.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlregexp.h>
#include <libxml/xmlautomata.h>
#include <libxml/xmlunicode.h>
#ifndef INT_MAX
#define INT_MAX 123456789 /* easy to flag and big enough for our needs */
#endif
/* #define DEBUG_REGEXP_GRAPH */
/* #define DEBUG_REGEXP_EXEC */
/* #define DEBUG_PUSH */
/* #define DEBUG_COMPACTION */
#define ERROR(str) \
ctxt->error = XML_REGEXP_COMPILE_ERROR; \
xmlRegexpErrCompile(ctxt, str);
#define NEXT ctxt->cur++
#define CUR (*(ctxt->cur))
#define NXT(index) (ctxt->cur[index])
#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
#define NEXTL(l) ctxt->cur += l;
#define XML_REG_STRING_SEPARATOR '|'
/**
* TODO:
*
* macro to flag unimplemented blocks
*/
#define TODO \
xmlGenericError(xmlGenericErrorContext, \
"Unimplemented block at %s:%d\n", \
__FILE__, __LINE__);
/************************************************************************
* *
* Datatypes and structures *
* *
************************************************************************/
typedef enum {
XML_REGEXP_EPSILON = 1,
XML_REGEXP_CHARVAL,
XML_REGEXP_RANGES,
XML_REGEXP_SUBREG,
XML_REGEXP_STRING,
XML_REGEXP_ANYCHAR, /* . */
XML_REGEXP_ANYSPACE, /* \s */
XML_REGEXP_NOTSPACE, /* \S */
XML_REGEXP_INITNAME, /* \l */
XML_REGEXP_NOTINITNAME, /* \l */
XML_REGEXP_NAMECHAR, /* \c */
XML_REGEXP_NOTNAMECHAR, /* \C */
XML_REGEXP_DECIMAL, /* \d */
XML_REGEXP_NOTDECIMAL, /* \d */
XML_REGEXP_REALCHAR, /* \w */
XML_REGEXP_NOTREALCHAR, /* \w */
XML_REGEXP_LETTER,
XML_REGEXP_LETTER_UPPERCASE,
XML_REGEXP_LETTER_LOWERCASE,
XML_REGEXP_LETTER_TITLECASE,
XML_REGEXP_LETTER_MODIFIER,
XML_REGEXP_LETTER_OTHERS,
XML_REGEXP_MARK,
XML_REGEXP_MARK_NONSPACING,
XML_REGEXP_MARK_SPACECOMBINING,
XML_REGEXP_MARK_ENCLOSING,
XML_REGEXP_NUMBER,
XML_REGEXP_NUMBER_DECIMAL,
XML_REGEXP_NUMBER_LETTER,
XML_REGEXP_NUMBER_OTHERS,
XML_REGEXP_PUNCT,
XML_REGEXP_PUNCT_CONNECTOR,
XML_REGEXP_PUNCT_DASH,
XML_REGEXP_PUNCT_OPEN,
XML_REGEXP_PUNCT_CLOSE,
XML_REGEXP_PUNCT_INITQUOTE,
XML_REGEXP_PUNCT_FINQUOTE,
XML_REGEXP_PUNCT_OTHERS,
XML_REGEXP_SEPAR,
XML_REGEXP_SEPAR_SPACE,
XML_REGEXP_SEPAR_LINE,
XML_REGEXP_SEPAR_PARA,
XML_REGEXP_SYMBOL,
XML_REGEXP_SYMBOL_MATH,
XML_REGEXP_SYMBOL_CURRENCY,
XML_REGEXP_SYMBOL_MODIFIER,
XML_REGEXP_SYMBOL_OTHERS,
XML_REGEXP_OTHER,
XML_REGEXP_OTHER_CONTROL,
XML_REGEXP_OTHER_FORMAT,
XML_REGEXP_OTHER_PRIVATE,
XML_REGEXP_OTHER_NA,
XML_REGEXP_BLOCK_NAME
} xmlRegAtomType;
typedef enum {
XML_REGEXP_QUANT_EPSILON = 1,
XML_REGEXP_QUANT_ONCE,
XML_REGEXP_QUANT_OPT,
XML_REGEXP_QUANT_MULT,
XML_REGEXP_QUANT_PLUS,
XML_REGEXP_QUANT_ONCEONLY,
XML_REGEXP_QUANT_ALL,
XML_REGEXP_QUANT_RANGE
} xmlRegQuantType;
typedef enum {
XML_REGEXP_START_STATE = 1,
XML_REGEXP_FINAL_STATE,
XML_REGEXP_TRANS_STATE,
XML_REGEXP_SINK_STATE
} xmlRegStateType;
typedef enum {
XML_REGEXP_MARK_NORMAL = 0,
XML_REGEXP_MARK_START,
XML_REGEXP_MARK_VISITED
} xmlRegMarkedType;
typedef struct _xmlRegRange xmlRegRange;
typedef xmlRegRange *xmlRegRangePtr;
struct _xmlRegRange {
int neg; /* 0 normal, 1 not, 2 exclude */
xmlRegAtomType type;
int start;
int end;
xmlChar *blockName;
};
typedef struct _xmlRegAtom xmlRegAtom;
typedef xmlRegAtom *xmlRegAtomPtr;
typedef struct _xmlAutomataState xmlRegState;
typedef xmlRegState *xmlRegStatePtr;
struct _xmlRegAtom {
int no;
xmlRegAtomType type;
xmlRegQuantType quant;
int min;
int max;
void *valuep;
void *valuep2;
int neg;
int codepoint;
xmlRegStatePtr start;
xmlRegStatePtr stop;
int maxRanges;
int nbRanges;
xmlRegRangePtr *ranges;
void *data;
};
typedef struct _xmlRegCounter xmlRegCounter;
typedef xmlRegCounter *xmlRegCounterPtr;
struct _xmlRegCounter {
int min;
int max;
};
typedef struct _xmlRegTrans xmlRegTrans;
typedef xmlRegTrans *xmlRegTransPtr;
struct _xmlRegTrans {
xmlRegAtomPtr atom;
int to;
int counter;
int count;
};
struct _xmlAutomataState {
xmlRegStateType type;
xmlRegMarkedType mark;
xmlRegMarkedType reached;
int no;
int maxTrans;
int nbTrans;
xmlRegTrans *trans;
};
typedef struct _xmlAutomata xmlRegParserCtxt;
typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
struct _xmlAutomata {
xmlChar *string;
xmlChar *cur;
int error;
int neg;
xmlRegStatePtr start;
xmlRegStatePtr end;
xmlRegStatePtr state;
xmlRegAtomPtr atom;
int maxAtoms;
int nbAtoms;
xmlRegAtomPtr *atoms;
int maxStates;
int nbStates;
xmlRegStatePtr *states;
int maxCounters;
int nbCounters;
xmlRegCounter *counters;
int determinist;
};
struct _xmlRegexp {
xmlChar *string;
int nbStates;
xmlRegStatePtr *states;
int nbAtoms;
xmlRegAtomPtr *atoms;
int nbCounters;
xmlRegCounter *counters;
int determinist;
/*
* That's the compact form for determinists automatas
*/
int nbstates;
int *compact;
void **transdata;
int nbstrings;
xmlChar **stringMap;
};
typedef struct _xmlRegExecRollback xmlRegExecRollback;
typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
struct _xmlRegExecRollback {
xmlRegStatePtr state;/* the current state */
int index; /* the index in the input stack */
int nextbranch; /* the next transition to explore in that state */
int *counts; /* save the automata state if it has some */
};
typedef struct _xmlRegInputToken xmlRegInputToken;
typedef xmlRegInputToken *xmlRegInputTokenPtr;
struct _xmlRegInputToken {
xmlChar *value;
void *data;
};
struct _xmlRegExecCtxt {
int status; /* execution status != 0 indicate an error */
int determinist; /* did we find an indeterministic behaviour */
xmlRegexpPtr comp; /* the compiled regexp */
xmlRegExecCallbacks callback;
void *data;
xmlRegStatePtr state;/* the current state */
int transno; /* the current transition on that state */
int transcount; /* the number of chars in char counted transitions */
/*
* A stack of rollback states
*/
int maxRollbacks;
int nbRollbacks;
xmlRegExecRollback *rollbacks;
/*
* The state of the automata if any
*/
int *counts;
/*
* The input stack
*/
int inputStackMax;
int inputStackNr;
int index;
int *charStack;
const xmlChar *inputString; /* when operating on characters */
xmlRegInputTokenPtr inputStack;/* when operating on strings */
/*
* error handling
*/
int errStateNo; /* the error state number */
xmlRegStatePtr errState; /* the error state */
xmlChar *errString; /* the string raising the error */
int *errCounts; /* counters at the error state */
};
#define REGEXP_ALL_COUNTER 0x123456
#define REGEXP_ALL_LAX_COUNTER 0x123457
static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
static void xmlRegFreeState(xmlRegStatePtr state);
static void xmlRegFreeAtom(xmlRegAtomPtr atom);
/************************************************************************
* *
* Regexp memory error handler *
* *
************************************************************************/
/**
* xmlRegexpErrMemory:
* @extra: extra information
*
* Handle an out of memory condition
*/
static void
xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
{
const char *regexp = NULL;
if (ctxt != NULL) {
regexp = (const char *) ctxt->string;
ctxt->error = XML_ERR_NO_MEMORY;
}
__xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
regexp, NULL, 0, 0,
"Memory allocation failed : %s\n", extra);
}
/**
* xmlRegexpErrCompile:
* @extra: extra information
*
* Handle a compilation failure
*/
static void
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
{
const char *regexp = NULL;
int idx = 0;
if (ctxt != NULL) {
regexp = (const char *) ctxt->string;
idx = ctxt->cur - ctxt->string;
ctxt->error = XML_REGEXP_COMPILE_ERROR;
}
__xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
regexp, NULL, idx, 0,
"failed to compile: %s\n", extra);
}
/************************************************************************
* *
* Allocation/Deallocation *
* *
************************************************************************/
static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
/**
* xmlRegEpxFromParse:
* @ctxt: the parser context used to build it
*
* Allocate a new regexp and fill it with the result from the parser
*
* Returns the new regexp or NULL in case of error
*/
static xmlRegexpPtr
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
xmlRegexpPtr ret;
ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
if (ret == NULL) {
xmlRegexpErrMemory(ctxt, "compiling regexp");
return(NULL);
}
memset(ret, 0, sizeof(xmlRegexp));
ret->string = ctxt->string;
ret->nbStates = ctxt->nbStates;
ret->states = ctxt->states;
ret->nbAtoms = ctxt->nbAtoms;
ret->atoms = ctxt->atoms;
ret->nbCounters = ctxt->nbCounters;
ret->counters = ctxt->counters;
ret->determinist = ctxt->determinist;
if ((ret->determinist != 0) &&
(ret->nbCounters == 0) &&
(ret->atoms != NULL) &&
(ret->atoms[0] != NULL) &&
(ret->atoms[0]->type == XML_REGEXP_STRING)) {
int i, j, nbstates = 0, nbatoms = 0;
int *stateRemap;
int *stringRemap;
int *transitions;
void **transdata;
xmlChar **stringMap;
xmlChar *value;
/*
* Switch to a compact representation
* 1/ counting the effective number of states left
* 2/ counting the unique number of atoms, and check that
* they are all of the string type
* 3/ build a table state x atom for the transitions
*/
stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
if (stateRemap == NULL) {
xmlRegexpErrMemory(ctxt, "compiling regexp");
xmlFree(ret);
return(NULL);
}
for (i = 0;i < ret->nbStates;i++) {
if (ret->states[i] != NULL) {
stateRemap[i] = nbstates;
nbstates++;
} else {
stateRemap[i] = -1;
}
}
#ifdef DEBUG_COMPACTION
printf("Final: %d states\n", nbstates);
#endif
stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
if (stringMap == NULL) {
xmlRegexpErrMemory(ctxt, "compiling regexp");
xmlFree(stateRemap);
xmlFree(ret);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -