📄 htmlparser.c.svn-base

📁 这是一个用于解析xml文件的类库。使用这个类库
💻 SVN-BASE
📖 第 1 页 / 共 5 页
字号:
12 3 4 5 下一页
/* * HTMLparser.c : an HTML 4.0 non-verifying parser * * See Copyright for the status of this software. * * daniel@veillard.com */#define IN_LIBXML#include "libxml.h"#ifdef LIBXML_HTML_ENABLED#include <string.h>#ifdef HAVE_CTYPE_H#include <ctype.h>#endif#ifdef HAVE_STDLIB_H#include <stdlib.h>#endif#ifdef HAVE_SYS_STAT_H#include <sys/stat.h>#endif#ifdef HAVE_FCNTL_H#include <fcntl.h>#endif#ifdef HAVE_UNISTD_H#include <unistd.h>#endif#ifdef HAVE_ZLIB_H#include <zlib.h>#endif#include <libxml/xmlmemory.h>#include <libxml/tree.h>#include <libxml/parser.h>#include <libxml/parserInternals.h>#include <libxml/xmlerror.h>#include <libxml/HTMLparser.h>#include <libxml/HTMLtree.h>#include <libxml/entities.h>#include <libxml/encoding.h>#include <libxml/valid.h>#include <libxml/xmlIO.h>#include <libxml/globals.h>#include <libxml/uri.h>#define HTML_MAX_NAMELEN 1000#define HTML_PARSER_BIG_BUFFER_SIZE 1000#define HTML_PARSER_BUFFER_SIZE 100/* #define DEBUG *//* #define DEBUG_PUSH */static int htmlOmittedDefaultValue = 1;xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,			     xmlChar end, xmlChar  end2, xmlChar end3);static void htmlParseComment(htmlParserCtxtPtr ctxt);/************************************************************************ *									* * 		Some factorized error routines				* *									* ************************************************************************//** * htmlErrMemory: * @ctxt:  an HTML parser context * @extra:  extra informations * * Handle a redefinition of attribute error */static voidhtmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra){    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&        (ctxt->instate == XML_PARSER_EOF))	return;    if (ctxt != NULL) {        ctxt->errNo = XML_ERR_NO_MEMORY;        ctxt->instate = XML_PARSER_EOF;        ctxt->disableSAX = 1;    }    if (extra)        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,             
           NULL, NULL, 0, 0,                        "Memory allocation failed : %s\n", extra);    else        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,                        NULL, NULL, 0, 0, "Memory allocation failed\n");}/** * htmlParseErr: * @ctxt:  an HTML parser context * @error:  the error number * @msg:  the error message * @str1:  string infor * @str2:  string infor * * Handle a fatal parser error, i.e. violating Well-Formedness constraints */static voidhtmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,             const char *msg, const xmlChar *str1, const xmlChar *str2){    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&        (ctxt->instate == XML_PARSER_EOF))	return;    ctxt->errNo = error;    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,                    XML_ERR_ERROR, NULL, 0,		    (const char *) str1, (const char *) str2,		    NULL, 0, 0,		    msg, str1, str2);    ctxt->wellFormed = 0;}/** * htmlParseErrInt: * @ctxt:  an HTML parser context * @error:  the error number * @msg:  the error message * @val:  integer info * * Handle a fatal parser error, i.e. 
violating Well-Formedness constraints */static voidhtmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,             const char *msg, int val){    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&        (ctxt->instate == XML_PARSER_EOF))	return;    ctxt->errNo = error;    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,                    XML_ERR_ERROR, NULL, 0, NULL, NULL,		    NULL, val, 0, msg, val);    ctxt->wellFormed = 0;}/************************************************************************ *									* * 		Parser stacks related functions and macros		* *									* ************************************************************************//** * htmlnamePush: * @ctxt:  an HTML parser context * @value:  the element name * * Pushes a new element name on top of the name stack * * Returns 0 in case of error, the index in the stack otherwise */static inthtmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value){    if (ctxt->nameNr >= ctxt->nameMax) {        ctxt->nameMax *= 2;        ctxt->nameTab = (const xmlChar * *)                         xmlRealloc((xmlChar * *)ctxt->nameTab,                                    ctxt->nameMax *                                    sizeof(ctxt->nameTab[0]));        if (ctxt->nameTab == NULL) {            htmlErrMemory(ctxt, NULL);            return (0);        }    }    ctxt->nameTab[ctxt->nameNr] = value;    ctxt->name = value;    return (ctxt->nameNr++);}/** * htmlnamePop: * @ctxt: an HTML parser context * * Pops the top element name from the name stack * * Returns the name just removed */static const xmlChar *htmlnamePop(htmlParserCtxtPtr ctxt){    const xmlChar *ret;    if (ctxt->nameNr <= 0)        return (0);    ctxt->nameNr--;    if (ctxt->nameNr < 0)        return (0);    if (ctxt->nameNr > 0)        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];    else        ctxt->name = NULL;    ret = ctxt->nameTab[ctxt->nameNr];    ctxt->nameTab[ctxt->nameNr] = 0;    return (ret);}/* * Macros for accessing 
the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use them
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
*   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly */#define UPPER (toupper(*ctxt->input->cur))#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)#define NXT(val) ctxt->input->cur[(val)]#define UPP(val) (toupper(ctxt->input->cur[(val)]))#define CUR_PTR ctxt->input->cur#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \	xmlParserInputShrink(ctxt->input)#define GROW if ((ctxt->progressive == 0) &&				\		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)#define CURRENT ((int) (*ctxt->input->cur))#define SKIP_BLANKS htmlSkipBlankChars(ctxt)/* Inported from XML *//* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */#define CUR ((int) (*ctxt->input->cur))#define NEXT xmlNextChar(ctxt)#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))#define NXT(val) ctxt->input->cur[(val)]#define CUR_PTR ctxt->input->cur#define NEXTL(l) do {							\    if (*(ctxt->input->cur) == '\n') {					\	ctxt->input->line++; ctxt->input->col = 1;			\    } else ctxt->input->col++;						\    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\  } while (0)    /************    \    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt); ************/#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)#define COPY_BUF(l,b,i,v)						\    if (l == 1) b[i++] = (xmlChar) v;					\    else i += xmlCopyChar(l,&b[i],v)/** * htmlCurrentChar: * @ctxt:  the HTML parser context * @len:  pointer to the length of the char read * * The current char value, if using UTF-8 this may actually span multiple * bytes in the input buffer. 
Implement the end of line normalization: * 2.11 End-of-Line Handling * If the encoding is unspecified, in the case we find an ISO-Latin-1 * char, then the encoding converter is plugged in automatically. * * Returns the current char value and its length */static inthtmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {    if (ctxt->instate == XML_PARSER_EOF)	return(0);    if (ctxt->token != 0) {	*len = 0;	return(ctxt->token);    }	    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {	/*	 * We are supposed to handle UTF8, check it's valid	 * From rfc2044: encoding of the Unicode values on UTF-8:	 *	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)	 * 0000 0000-0000 007F   0xxxxxxx	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 	 *	 * Check for the 0x110000 limit too	 */	const unsigned char *cur = ctxt->input->cur;	unsigned char c;	unsigned int val;	c = *cur;	if (c & 0x80) {	    if (cur[1] == 0)		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);	    if ((cur[1] & 0xc0) != 0x80)		goto encoding_error;	    if ((c & 0xe0) == 0xe0) {		if (cur[2] == 0)		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);		if ((cur[2] & 0xc0) != 0x80)		    goto encoding_error;		if ((c & 0xf0) == 0xf0) {		    if (cur[3] == 0)			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);		    if (((c & 0xf8) != 0xf0) ||			((cur[3] & 0xc0) != 0x80))			goto encoding_error;		    /* 4-byte code */		    *len = 4;		    val = (cur[0] & 0x7) << 18;		    val |= (cur[1] & 0x3f) << 12;		    val |= (cur[2] & 0x3f) << 6;
12 3 4 5 下一页
💿 文件大小 1527 K
👤 上传用户 qqpp2q
📂 所属分类其他
🏷️ 相关标签

#xml
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -