📄 xmlregexp.c.svn-base
字号:
/* * regexp.c: generic and extensible Regular Expression engine * * Basically designed with the purpose of compiling regexps for * the variety of validation/shemas mechanisms now available in * XML related specifications these include: * - XML-1.0 DTD validation * - XML Schemas structure part 1 * - XML Schemas Datatypes part 2 especially Appendix F * - RELAX-NG/TREX i.e. the counter proposal * * See Copyright for the status of this software. * * Daniel Veillard <veillard@redhat.com> */#define IN_LIBXML#include "libxml.h"#ifdef LIBXML_REGEXP_ENABLED#include <stdio.h>#include <string.h>#ifdef HAVE_LIMITS_H#include <limits.h>#endif#include <libxml/tree.h>#include <libxml/parserInternals.h>#include <libxml/xmlregexp.h>#include <libxml/xmlautomata.h>#include <libxml/xmlunicode.h>#ifndef INT_MAX#define INT_MAX 123456789 /* easy to flag and big enough for our needs */#endif/* #define DEBUG_REGEXP_GRAPH *//* #define DEBUG_REGEXP_EXEC *//* #define DEBUG_PUSH *//* #define DEBUG_COMPACTION */#define ERROR(str) \ ctxt->error = XML_REGEXP_COMPILE_ERROR; \ xmlRegexpErrCompile(ctxt, str);#define NEXT ctxt->cur++#define CUR (*(ctxt->cur))#define NXT(index) (ctxt->cur[index])#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)#define NEXTL(l) ctxt->cur += l;/** * TODO: * * macro to flag unimplemented blocks */#define TODO \ xmlGenericError(xmlGenericErrorContext, \ "Unimplemented block at %s:%d\n", \ __FILE__, __LINE__);/************************************************************************ * * * Datatypes and structures * * * ************************************************************************/typedef enum { XML_REGEXP_EPSILON = 1, XML_REGEXP_CHARVAL, XML_REGEXP_RANGES, XML_REGEXP_SUBREG, XML_REGEXP_STRING, XML_REGEXP_ANYCHAR, /* . */ XML_REGEXP_ANYSPACE, /* \s */ XML_REGEXP_NOTSPACE, /* \S */ XML_REGEXP_INITNAME, /* \l */ XML_REGEXP_NOTINITNAME, /* \l */ XML_REGEXP_NAMECHAR, /* \c */ XML_REGEXP_NOTNAMECHAR, /* \C */ XML_REGEXP_DECIMAL, /* \d */ XML_REGEXP_NOTDECIMAL, /* \d */ XML_REGEXP_REALCHAR, /* \w */ XML_REGEXP_NOTREALCHAR, /* \w */ XML_REGEXP_LETTER, XML_REGEXP_LETTER_UPPERCASE, XML_REGEXP_LETTER_LOWERCASE, XML_REGEXP_LETTER_TITLECASE, XML_REGEXP_LETTER_MODIFIER, XML_REGEXP_LETTER_OTHERS, XML_REGEXP_MARK, XML_REGEXP_MARK_NONSPACING, XML_REGEXP_MARK_SPACECOMBINING, XML_REGEXP_MARK_ENCLOSING, XML_REGEXP_NUMBER, XML_REGEXP_NUMBER_DECIMAL, XML_REGEXP_NUMBER_LETTER, XML_REGEXP_NUMBER_OTHERS, XML_REGEXP_PUNCT, XML_REGEXP_PUNCT_CONNECTOR, XML_REGEXP_PUNCT_DASH, XML_REGEXP_PUNCT_OPEN, XML_REGEXP_PUNCT_CLOSE, XML_REGEXP_PUNCT_INITQUOTE, XML_REGEXP_PUNCT_FINQUOTE, XML_REGEXP_PUNCT_OTHERS, XML_REGEXP_SEPAR, XML_REGEXP_SEPAR_SPACE, XML_REGEXP_SEPAR_LINE, XML_REGEXP_SEPAR_PARA, XML_REGEXP_SYMBOL, XML_REGEXP_SYMBOL_MATH, XML_REGEXP_SYMBOL_CURRENCY, XML_REGEXP_SYMBOL_MODIFIER, XML_REGEXP_SYMBOL_OTHERS, XML_REGEXP_OTHER, XML_REGEXP_OTHER_CONTROL, XML_REGEXP_OTHER_FORMAT, XML_REGEXP_OTHER_PRIVATE, XML_REGEXP_OTHER_NA, XML_REGEXP_BLOCK_NAME} xmlRegAtomType;typedef enum { XML_REGEXP_QUANT_EPSILON = 1, XML_REGEXP_QUANT_ONCE, XML_REGEXP_QUANT_OPT, XML_REGEXP_QUANT_MULT, XML_REGEXP_QUANT_PLUS, XML_REGEXP_QUANT_ONCEONLY, XML_REGEXP_QUANT_ALL, XML_REGEXP_QUANT_RANGE} xmlRegQuantType;typedef enum { XML_REGEXP_START_STATE = 1, XML_REGEXP_FINAL_STATE, XML_REGEXP_TRANS_STATE} xmlRegStateType;typedef enum { XML_REGEXP_MARK_NORMAL = 0, XML_REGEXP_MARK_START, XML_REGEXP_MARK_VISITED} xmlRegMarkedType;typedef struct _xmlRegRange xmlRegRange;typedef xmlRegRange *xmlRegRangePtr;struct _xmlRegRange { int neg; /* 0 normal, 1 not, 2 exclude */ xmlRegAtomType type; int start; int end; xmlChar *blockName;};typedef struct _xmlRegAtom xmlRegAtom;typedef xmlRegAtom *xmlRegAtomPtr;typedef struct _xmlAutomataState xmlRegState;typedef xmlRegState *xmlRegStatePtr;struct _xmlRegAtom { int no; xmlRegAtomType type; xmlRegQuantType quant; int min; int max; void *valuep; void *valuep2; int neg; int codepoint; xmlRegStatePtr start; xmlRegStatePtr stop; int maxRanges; int nbRanges; xmlRegRangePtr *ranges; void *data;};typedef struct _xmlRegCounter xmlRegCounter;typedef xmlRegCounter *xmlRegCounterPtr;struct _xmlRegCounter { int min; int max;};typedef struct _xmlRegTrans xmlRegTrans;typedef xmlRegTrans *xmlRegTransPtr;struct _xmlRegTrans { xmlRegAtomPtr atom; int to; int counter; int count;};struct _xmlAutomataState { xmlRegStateType type; xmlRegMarkedType mark; xmlRegMarkedType reached; int no; int maxTrans; int nbTrans; xmlRegTrans *trans;};typedef struct _xmlAutomata xmlRegParserCtxt;typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;struct _xmlAutomata { xmlChar *string; xmlChar *cur; int error; int neg; xmlRegStatePtr start; xmlRegStatePtr end; xmlRegStatePtr state; xmlRegAtomPtr atom; int maxAtoms; int nbAtoms; xmlRegAtomPtr *atoms; int maxStates; int nbStates; xmlRegStatePtr *states; int maxCounters; int nbCounters; xmlRegCounter *counters; int determinist;};struct _xmlRegexp { xmlChar *string; int nbStates; xmlRegStatePtr *states; int nbAtoms; xmlRegAtomPtr *atoms; int nbCounters; xmlRegCounter *counters; int determinist; /* * That's the compact form for determinists automatas */ int nbstates; int *compact; void **transdata; int nbstrings; xmlChar **stringMap;};typedef struct _xmlRegExecRollback xmlRegExecRollback;typedef xmlRegExecRollback *xmlRegExecRollbackPtr;struct _xmlRegExecRollback { xmlRegStatePtr state;/* the current state */ int index; /* the index in the input stack */ int nextbranch; /* the next transition to explore in that state */ int *counts; /* save the automata state if it has some */};typedef struct _xmlRegInputToken xmlRegInputToken;typedef xmlRegInputToken *xmlRegInputTokenPtr;struct _xmlRegInputToken { xmlChar *value; void *data;};struct _xmlRegExecCtxt { int status; /* execution status != 0 indicate an error */ int determinist; /* did we find an indeterministic behaviour */ xmlRegexpPtr comp; /* the compiled regexp */ xmlRegExecCallbacks callback; void *data; xmlRegStatePtr state;/* the current state */ int transno; /* the current transition on that state */ int transcount; /* the number of chars in char counted transitions */ /* * A stack of rollback states */ int maxRollbacks; int nbRollbacks; xmlRegExecRollback *rollbacks; /* * The state of the automata if any */ int *counts; /* * The input stack */ int inputStackMax; int inputStackNr; int index; int *charStack; const xmlChar *inputString; /* when operating on characters */ xmlRegInputTokenPtr inputStack;/* when operating on strings */};#define REGEXP_ALL_COUNTER 0x123456#define REGEXP_ALL_LAX_COUNTER 0x123457static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);static void xmlRegFreeState(xmlRegStatePtr state);static void xmlRegFreeAtom(xmlRegAtomPtr atom);/************************************************************************ * * * Regexp memory error handler * * * ************************************************************************//** * xmlRegexpErrMemory: * @extra: extra information * * Handle an out of memory condition */static voidxmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra){ const char *regexp = NULL; if (ctxt != NULL) { regexp = (const char *) ctxt->string; ctxt->error = XML_ERR_NO_MEMORY; } __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, regexp, NULL, 0, 0, "Memory allocation failed : %s\n", extra);}/** * xmlRegexpErrCompile: * @extra: extra information * * Handle a compilation failure */static voidxmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra){ const char *regexp = NULL; int idx = 0; if (ctxt != NULL) { regexp = (const char *) ctxt->string; idx = ctxt->cur - ctxt->string; ctxt->error = XML_REGEXP_COMPILE_ERROR; } __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP, XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra, regexp, NULL, idx, 0, "failed to compile: %s\n", extra);}/************************************************************************ * * * Allocation/Deallocation * * * ************************************************************************/static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);/** * xmlRegEpxFromParse: * @ctxt: the parser context used to build it * * Allocate a new regexp and fill it with the result from the parser * * Returns the new regexp or NULL in case of error */static xmlRegexpPtrxmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { xmlRegexpPtr ret; ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp)); if (ret == NULL) { xmlRegexpErrMemory(ctxt, "compiling regexp"); return(NULL); } memset(ret, 0, sizeof(xmlRegexp)); ret->string = ctxt->string; ret->nbStates = ctxt->nbStates; ret->states = ctxt->states; ret->nbAtoms = ctxt->nbAtoms; ret->atoms = ctxt->atoms; ret->nbCounters = ctxt->nbCounters; ret->counters = ctxt->counters; ret->determinist = ctxt->determinist; if ((ret->determinist != 0) && (ret->nbCounters == 0) && (ret->atoms != NULL) && (ret->atoms[0] != NULL) && (ret->atoms[0]->type == XML_REGEXP_STRING)) { int i, j, nbstates = 0, nbatoms = 0; int *stateRemap; int *stringRemap; int *transitions; void **transdata; xmlChar **stringMap; xmlChar *value; /* * Switch to a compact representation * 1/ counting the effective number of states left * 2/ counting the unique number of atoms, and check that * they are all of the string type * 3/ build a table state x atom for the transitions */ stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); if (stateRemap == NULL) { xmlRegexpErrMemory(ctxt, "compiling regexp"); xmlFree(ret); return(NULL); } for (i = 0;i < ret->nbStates;i++) { if (ret->states[i] != NULL) { stateRemap[i] = nbstates; nbstates++; } else { stateRemap[i] = -1; } }#ifdef DEBUG_COMPACTION printf("Final: %d states\n", nbstates);#endif stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); if (stringMap == NULL) { xmlRegexpErrMemory(ctxt, "compiling regexp"); xmlFree(stateRemap); xmlFree(ret); return(NULL); } stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); if (stringRemap == NULL) { xmlRegexpErrMemory(ctxt, "compiling regexp"); xmlFree(stringMap); xmlFree(stateRemap); xmlFree(ret); return(NULL); } for (i = 0;i < ret->nbAtoms;i++) { if ((ret->atoms[i]->type == XML_REGEXP_STRING) && (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { value = ret->atoms[i]->valuep; for (j = 0;j < nbatoms;j++) { if (xmlStrEqual(stringMap[j], value)) { stringRemap[i] = j; break; } } if (j >= nbatoms) { stringRemap[i] = nbatoms; stringMap[nbatoms] = xmlStrdup(value); if (stringMap[nbatoms] == NULL) { for (i = 0;i < nbatoms;i++) xmlFree(stringMap[i]);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -