📄 parser.c
字号:
/*
* parser.c : an XML 1.0 parser, namespaces and validity support are mostly
* implemented on top of the SAX interfaces
*
* References:
* The XML specification:
* http://www.w3.org/TR/REC-xml
* Original 1.0 version:
* http://www.w3.org/TR/1998/REC-xml-19980210
* XML second edition working draft
* http://www.w3.org/TR/2000/WD-xml-2e-20000814
*
* Okay this is a big file, the parser core is around 7000 lines, then it
* is followed by the progressive parser top routines, then the various
* high level APIs to call the parser and a few miscellaneous functions.
* A number of helper functions and deprecated ones have been moved to
* parserInternals.c to reduce this file size.
* As much as possible the functions are associated with their relative
* production in the XML specification. A few productions defining the
* different ranges of character are actually implanted either in
* parserInternals.h or parserInternals.c
* The DOM tree build is realized from the default SAX callbacks in
* the module SAX.c.
* The routines doing the validation checks are in valid.c and called either
* from the SAX callbacks or as standalone functions using a preparsed
* document.
*
* See Copyright for the status of this software.
*
* daniel@veillard.com
*/
#define IN_LIBXML
#include "libxml.h"
#if defined(WIN32) && !defined (__CYGWIN__)
#define XML_DIR_SEP '\\'
#else
#define XML_DIR_SEP '/'
#endif
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <libxml/xmlmemory.h>
#include <libxml/threads.h>
#include <libxml/globals.h>
#include <libxml/tree.h>
#include <libxml/parser.h>
#include <libxml/parserInternals.h>
#include <libxml/valid.h>
#include <libxml/entities.h>
#include <libxml/xmlerror.h>
#include <libxml/encoding.h>
#include <libxml/xmlIO.h>
#include <libxml/uri.h>
#ifdef LIBXML_CATALOG_ENABLED
#include <libxml/catalog.h>
#endif
#ifdef LIBXML_SCHEMAS_ENABLED
#include <libxml/xmlschemastypes.h>
#include <libxml/relaxng.h>
#endif
#ifdef HAVE_CTYPE_H
#include <ctype.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_SYS_STAT_H
#include <sys/stat.h>
#endif
#ifdef HAVE_FCNTL_H
#include <fcntl.h>
#endif
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
#ifdef HAVE_ZLIB_H
#include <zlib.h>
#endif
/**
* xmlParserMaxDepth:
*
* arbitrary depth limit for the XML documents that we allow to
* process. This is not a limitation of the parser but a safety
* boundary feature.
*/
unsigned int xmlParserMaxDepth = 1024;
#define SAX2 1
#define XML_PARSER_BIG_BUFFER_SIZE 300
#define XML_PARSER_BUFFER_SIZE 100
#define SAX_COMPAT_MODE BAD_CAST "SAX compatibility mode document"
/*
* List of XML prefixed PI allowed by W3C specs
*/
static const char *xmlW3CPIs[] = {
"xml-stylesheet",
NULL
};
/* DEPR void xmlParserHandleReference(xmlParserCtxtPtr ctxt); */
xmlEntityPtr xmlParseStringPEReference(xmlParserCtxtPtr ctxt,
const xmlChar **str);
static xmlParserErrors
xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
xmlSAXHandlerPtr sax,
void *user_data, int depth, const xmlChar *URL,
const xmlChar *ID, xmlNodePtr *list);
#ifdef LIBXML_LEGACY_ENABLED
static void
xmlAddEntityReference(xmlEntityPtr ent, xmlNodePtr firstNode,
xmlNodePtr lastNode);
#endif /* LIBXML_LEGACY_ENABLED */
static xmlParserErrors
xmlParseBalancedChunkMemoryInternal(xmlParserCtxtPtr oldctxt,
const xmlChar *string, void *user_data, xmlNodePtr *lst);
/************************************************************************
* *
* Some factorized error routines *
* *
************************************************************************/
/**
* xmlErrAttributeDup:
* @ctxt: an XML parser context
* @prefix: the attribute prefix
* @localname: the attribute localname
*
* Handle a redefinition of attribute error
*/
static void
xmlErrAttributeDup(xmlParserCtxtPtr ctxt, const xmlChar * prefix,
const xmlChar * localname)
{
if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
(ctxt->instate == XML_PARSER_EOF))
return;
ctxt->errNo = XML_ERR_ATTRIBUTE_REDEFINED;
if (prefix == NULL)
__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
ctxt->errNo, XML_ERR_FATAL, NULL, 0,
(const char *) localname, NULL, NULL, 0, 0,
"Attribute %s redefined\n", localname);
else
__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
ctxt->errNo, XML_ERR_FATAL, NULL, 0,
(const char *) prefix, (const char *) localname,
NULL, 0, 0, "Attribute %s:%s redefined\n", prefix,
localname);
ctxt->wellFormed = 0;
if (ctxt->recovery == 0)
ctxt->disableSAX = 1;
}
/**
* xmlFatalErr:
* @ctxt: an XML parser context
* @error: the error number
* @extra: extra information string
*
* Handle a fatal parser error, i.e. violating Well-Formedness constraints
*/
static void
xmlFatalErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, const char *info)
{
const char *errmsg;
if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
(ctxt->instate == XML_PARSER_EOF))
return;
switch (error) {
case XML_ERR_INVALID_HEX_CHARREF:
errmsg = "CharRef: invalid hexadecimal value\n";
break;
case XML_ERR_INVALID_DEC_CHARREF:
errmsg = "CharRef: invalid decimal value\n";
break;
case XML_ERR_INVALID_CHARREF:
errmsg = "CharRef: invalid value\n";
break;
case XML_ERR_INTERNAL_ERROR:
errmsg = "internal error";
break;
case XML_ERR_PEREF_AT_EOF:
errmsg = "PEReference at end of document\n";
break;
case XML_ERR_PEREF_IN_PROLOG:
errmsg = "PEReference in prolog\n";
break;
case XML_ERR_PEREF_IN_EPILOG:
errmsg = "PEReference in epilog\n";
break;
case XML_ERR_PEREF_NO_NAME:
errmsg = "PEReference: no name\n";
break;
case XML_ERR_PEREF_SEMICOL_MISSING:
errmsg = "PEReference: expecting ';'\n";
break;
case XML_ERR_ENTITY_LOOP:
errmsg = "Detected an entity reference loop\n";
break;
case XML_ERR_ENTITY_NOT_STARTED:
errmsg = "EntityValue: \" or ' expected\n";
break;
case XML_ERR_ENTITY_PE_INTERNAL:
errmsg = "PEReferences forbidden in internal subset\n";
break;
case XML_ERR_ENTITY_NOT_FINISHED:
errmsg = "EntityValue: \" or ' expected\n";
break;
case XML_ERR_ATTRIBUTE_NOT_STARTED:
errmsg = "AttValue: \" or ' expected\n";
break;
case XML_ERR_LT_IN_ATTRIBUTE:
errmsg = "Unescaped '<' not allowed in attributes values\n";
break;
case XML_ERR_LITERAL_NOT_STARTED:
errmsg = "SystemLiteral \" or ' expected\n";
break;
case XML_ERR_LITERAL_NOT_FINISHED:
errmsg = "Unfinished System or Public ID \" or ' expected\n";
break;
case XML_ERR_MISPLACED_CDATA_END:
errmsg = "Sequence ']]>' not allowed in content\n";
break;
case XML_ERR_URI_REQUIRED:
errmsg = "SYSTEM or PUBLIC, the URI is missing\n";
break;
case XML_ERR_PUBID_REQUIRED:
errmsg = "PUBLIC, the Public Identifier is missing\n";
break;
case XML_ERR_HYPHEN_IN_COMMENT:
errmsg = "Comment must not contain '--' (double-hyphen)\n";
break;
case XML_ERR_PI_NOT_STARTED:
errmsg = "xmlParsePI : no target name\n";
break;
case XML_ERR_RESERVED_XML_NAME:
errmsg = "Invalid PI name\n";
break;
case XML_ERR_NOTATION_NOT_STARTED:
errmsg = "NOTATION: Name expected here\n";
break;
case XML_ERR_NOTATION_NOT_FINISHED:
errmsg = "'>' required to close NOTATION declaration\n";
break;
case XML_ERR_VALUE_REQUIRED:
errmsg = "Entity value required\n";
break;
case XML_ERR_URI_FRAGMENT:
errmsg = "Fragment not allowed";
break;
case XML_ERR_ATTLIST_NOT_STARTED:
errmsg = "'(' required to start ATTLIST enumeration\n";
break;
case XML_ERR_NMTOKEN_REQUIRED:
errmsg = "NmToken expected in ATTLIST enumeration\n";
break;
case XML_ERR_ATTLIST_NOT_FINISHED:
errmsg = "')' required to finish ATTLIST enumeration\n";
break;
case XML_ERR_MIXED_NOT_STARTED:
errmsg = "MixedContentDecl : '|' or ')*' expected\n";
break;
case XML_ERR_PCDATA_REQUIRED:
errmsg = "MixedContentDecl : '#PCDATA' expected\n";
break;
case XML_ERR_ELEMCONTENT_NOT_STARTED:
errmsg = "ContentDecl : Name or '(' expected\n";
break;
case XML_ERR_ELEMCONTENT_NOT_FINISHED:
errmsg = "ContentDecl : ',' '|' or ')' expected\n";
break;
case XML_ERR_PEREF_IN_INT_SUBSET:
errmsg =
"PEReference: forbidden within markup decl in internal subset\n";
break;
case XML_ERR_GT_REQUIRED:
errmsg = "expected '>'\n";
break;
case XML_ERR_CONDSEC_INVALID:
errmsg = "XML conditional section '[' expected\n";
break;
case XML_ERR_EXT_SUBSET_NOT_FINISHED:
errmsg = "Content error in the external subset\n";
break;
case XML_ERR_CONDSEC_INVALID_KEYWORD:
errmsg =
"conditional section INCLUDE or IGNORE keyword expected\n";
break;
case XML_ERR_CONDSEC_NOT_FINISHED:
errmsg = "XML conditional section not closed\n";
break;
case XML_ERR_XMLDECL_NOT_STARTED:
errmsg = "Text declaration '<?xml' required\n";
break;
case XML_ERR_XMLDECL_NOT_FINISHED:
errmsg = "parsing XML declaration: '?>' expected\n";
break;
case XML_ERR_EXT_ENTITY_STANDALONE:
errmsg = "external parsed entities cannot be standalone\n";
break;
case XML_ERR_ENTITYREF_SEMICOL_MISSING:
errmsg = "EntityRef: expecting ';'\n";
break;
case XML_ERR_DOCTYPE_NOT_FINISHED:
errmsg = "DOCTYPE improperly terminated\n";
break;
case XML_ERR_LTSLASH_REQUIRED:
errmsg = "EndTag: '</' not found\n";
break;
case XML_ERR_EQUAL_REQUIRED:
errmsg = "expected '='\n";
break;
case XML_ERR_STRING_NOT_CLOSED:
errmsg = "String not closed expecting \" or '\n";
break;
case XML_ERR_STRING_NOT_STARTED:
errmsg = "String not started expecting ' or \"\n";
break;
case XML_ERR_ENCODING_NAME:
errmsg = "Invalid XML encoding name\n";
break;
case XML_ERR_STANDALONE_VALUE:
errmsg = "standalone accepts only 'yes' or 'no'\n";
break;
case XML_ERR_DOCUMENT_EMPTY:
errmsg = "Document is empty\n";
break;
case XML_ERR_DOCUMENT_END:
errmsg = "Extra content at the end of the document\n";
break;
case XML_ERR_NOT_WELL_BALANCED:
errmsg = "chunk is not well balanced\n";
break;
case XML_ERR_EXTRA_CONTENT:
errmsg = "extra content at the end of well balanced chunk\n";
break;
case XML_ERR_VERSION_MISSING:
errmsg = "Malformed declaration expecting version\n";
break;
#if 0
case:
errmsg = "\n";
break;
#endif
default:
errmsg = "Unregistered error message\n";
}
ctxt->errNo = error;
__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, error,
XML_ERR_FATAL, NULL, 0, info, NULL, NULL, 0, 0, errmsg,
info);
ctxt->wellFormed = 0;
if (ctxt->recovery == 0)
ctxt->disableSAX = 1;
}
/**
* xmlFatalErrMsg:
* @ctxt: an XML parser context
* @error: the error number
* @msg: the error message
*
* Handle a fatal parser error, i.e. violating Well-Formedness constraints
*/
static void
xmlFatalErrMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error,
const char *msg)
{
if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
(ctxt->instate == XML_PARSER_EOF))
return;
ctxt->errNo = error;
__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, error,
XML_ERR_FATAL, NULL, 0, NULL, NULL, NULL, 0, 0, msg);
ctxt->wellFormed = 0;
if (ctxt->recovery == 0)
ctxt->disableSAX = 1;
}
/**
* xmlWarningMsg:
* @ctxt: an XML parser context
* @error: the error number
* @msg: the error message
* @str1: extra data
* @str2: extra data
*
* Handle a warning.
*/
static void
xmlWarningMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error,
const char *msg, const xmlChar *str1, const xmlChar *str2)
{
xmlStructuredErrorFunc schannel = NULL;
if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
(ctxt->instate == XML_PARSER_EOF))
return;
if ((ctxt->sax != NULL) && (ctxt->sax->initialized == XML_SAX2_MAGIC))
schannel = ctxt->sax->serror;
__xmlRaiseError(schannel,
(ctxt->sax) ? ctxt->sax->warning : NULL,
ctxt->userData,
ctxt, NULL, XML_FROM_PARSER, error,
XML_ERR_WARNING, NULL, 0,
(const char *) str1, (const char *) str2, NULL, 0, 0,
msg, (const char *) str1, (const char *) str2);
}
/**
* xmlValidityError:
* @ctxt: an XML parser context
* @error: the error number
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -