📄 htmltree.c.svn-base

📁 这是一个用于解析xml文件的类库。使用这个类库
💻 SVN-BASE
📖 第 1 页 / 共 2 页
字号:
12 下一页
/* * HTMLtree.c : implementation of access function for an HTML tree. * * See Copyright for the status of this software. * * daniel@veillard.com */#define IN_LIBXML#include "libxml.h"#ifdef LIBXML_HTML_ENABLED#include <string.h> /* for memset() only ! */#ifdef HAVE_CTYPE_H#include <ctype.h>#endif#ifdef HAVE_STDLIB_H#include <stdlib.h>#endif#include <libxml/xmlmemory.h>#include <libxml/HTMLparser.h>#include <libxml/HTMLtree.h>#include <libxml/entities.h>#include <libxml/valid.h>#include <libxml/xmlerror.h>#include <libxml/parserInternals.h>#include <libxml/globals.h>#include <libxml/uri.h>/************************************************************************ *									* *   		Getting/Setting encoding meta tags			* *									* ************************************************************************//** * htmlGetMetaEncoding: * @doc:  the document *  * Encoding definition lookup in the Meta tags * * Returns the current encoding as flagged in the HTML source */const xmlChar *htmlGetMetaEncoding(htmlDocPtr doc) {    htmlNodePtr cur;    const xmlChar *content;    const xmlChar *encoding;    if (doc == NULL)	return(NULL);    cur = doc->children;    /*     * Search the html     */    while (cur != NULL) {	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {	    if (xmlStrEqual(cur->name, BAD_CAST"html"))		break;	    if (xmlStrEqual(cur->name, BAD_CAST"head"))		goto found_head;	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))		goto found_meta;	}	cur = cur->next;    }    if (cur == NULL)	return(NULL);    cur = cur->children;    /*     * Search the head     */    while (cur != NULL) {	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {	    if (xmlStrEqual(cur->name, BAD_CAST"head"))		break;	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))		goto found_meta;	}	cur = cur->next;    }    if (cur == NULL)	return(NULL);found_head:    cur = cur->children;    /*     * Search the meta elements     */found_meta:    while (cur != NULL) {	if ((cur->type == 
XML_ELEMENT_NODE) && (cur->name != NULL)) {	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {		xmlAttrPtr attr = cur->properties;		int http;		const xmlChar *value;		content = NULL;		http = 0;		while (attr != NULL) {		    if ((attr->children != NULL) &&		        (attr->children->type == XML_TEXT_NODE) &&		        (attr->children->next == NULL)) {			value = attr->children->content;			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))			    http = 1;			else if ((value != NULL)			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))			    content = value;			if ((http != 0) && (content != NULL))			    goto found_content;		    }		    attr = attr->next;		}	    }	}	cur = cur->next;    }    return(NULL);found_content:    encoding = xmlStrstr(content, BAD_CAST"charset=");    if (encoding == NULL) 	encoding = xmlStrstr(content, BAD_CAST"Charset=");    if (encoding == NULL) 	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");    if (encoding != NULL) {	encoding += 8;    } else {	encoding = xmlStrstr(content, BAD_CAST"charset =");	if (encoding == NULL) 	    encoding = xmlStrstr(content, BAD_CAST"Charset =");	if (encoding == NULL) 	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");	if (encoding != NULL)	    encoding += 9;    }    if (encoding != NULL) {	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;    }    return(encoding);}/** * htmlSetMetaEncoding: * @doc:  the document * @encoding:  the encoding string *  * Sets the current encoding in the Meta tags * NOTE: this will not change the document content encoding, just * the META flag associated. 
* * Returns 0 in case of success and -1 in case of error */inthtmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {    htmlNodePtr cur, meta;    const xmlChar *content;    char newcontent[100];    if (doc == NULL)	return(-1);    if (encoding != NULL) {	_snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",                encoding);	newcontent[sizeof(newcontent) - 1] = 0;    }    cur = doc->children;    /*     * Search the html     */    while (cur != NULL) {	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)		break;	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)		goto found_head;	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)		goto found_meta;	}	cur = cur->next;    }    if (cur == NULL)	return(-1);    cur = cur->children;    /*     * Search the head     */    while (cur != NULL) {	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)		break;	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)		goto found_meta;	}	cur = cur->next;    }    if (cur == NULL)	return(-1);found_head:    if (cur->children == NULL) {	if (encoding == NULL)	    return(0);	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);	xmlAddChild(cur, meta);	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);	return(0);    }    cur = cur->children;found_meta:    if (encoding != NULL) {	/*	 * Create a new Meta element with the right attributes	 */	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);	xmlAddPrevSibling(cur, meta);	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);    }    /*     * Search and destroy all the remaining the meta elements carrying     * encoding informations     */    while (cur != NULL) {	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {	    if 
(xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {		xmlAttrPtr attr = cur->properties;		int http;		const xmlChar *value;		content = NULL;		http = 0;		while (attr != NULL) {		    if ((attr->children != NULL) &&		        (attr->children->type == XML_TEXT_NODE) &&		        (attr->children->next == NULL)) {			value = attr->children->content;			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))			    http = 1;			else                         {                           if ((value != NULL) && 				(!xmlStrcasecmp(attr->name, BAD_CAST"content")))			      content = value;                        }		        if ((http != 0) && (content != NULL))			    break;		    }		    attr = attr->next;		}		if ((http != 0) && (content != NULL)) {		    meta = cur;		    cur = cur->next;		    xmlUnlinkNode(meta);                    xmlFreeNode(meta);		    continue;		}	    }	}	cur = cur->next;    }    return(0);}/** * booleanHTMLAttrs: * * These are the HTML attributes which will be output * in minimized form, i.e. <option selected="selected"> will be * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" * */static const char* htmlBooleanAttrs[] = {  "checked", "compact", "declare", "defer", "disabled", "ismap",  "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",  "selected", NULL};/** * htmlIsBooleanAttr: * @name:  the name of the attribute to check * * Determine if a given attribute is a boolean attribute. *  * returns: false if the attribute is not boolean, true otherwise. 
*/inthtmlIsBooleanAttr(const xmlChar *name){    int i = 0;    while (htmlBooleanAttrs[i] != NULL) {        if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)            return 1;        i++;    }    return 0;}#ifdef LIBXML_OUTPUT_ENABLED/************************************************************************ *									* * 			Output error handlers				* *									* ************************************************************************//** * htmlSaveErrMemory: * @extra:  extra informations * * Handle an out of memory condition */static voidhtmlSaveErrMemory(const char *extra){    __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);}/** * htmlSaveErr: * @code:  the error number * @node:  the location of the error. * @extra:  extra informations * * Handle an out of memory condition */static voidhtmlSaveErr(int code, xmlNodePtr node, const char *extra){    const char *msg = NULL;    switch(code) {        case XML_SAVE_NOT_UTF8:	    msg = "string is not in UTF-8";	    break;	case XML_SAVE_CHAR_INVALID:	    msg = "invalid character value";	    break;	case XML_SAVE_UNKNOWN_ENCODING:	    msg = "unknown encoding %s";	    break;	case XML_SAVE_NO_DOCTYPE:	    msg = "HTML has no DOCTYPE";	    break;	default:	    msg = "unexpected error number";    }    __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);}/************************************************************************ *									* *   		Dumping HTML tree content to a simple buffer		* *									* ************************************************************************/static inthtmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,	           int format);/** * htmlNodeDumpFormat: * @buf:  the HTML buffer output * @doc:  the document * @cur:  the current node * @format:  should formatting spaces been added * * Dump an HTML node, recursive behaviour,children are printed too. 
* * Returns the number of byte written or -1 in case of error */static inthtmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,	           int format) {    unsigned int use;    int ret;    xmlOutputBufferPtr outbuf;    if (cur == NULL) {	return (-1);    }    if (buf == NULL) {	return (-1);    }    outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));    if (outbuf == NULL) {        htmlSaveErrMemory("allocating HTML output buffer");	return (-1);    }    memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));    outbuf->buffer = buf;    outbuf->encoder = NULL;    outbuf->writecallback = NULL;    outbuf->closecallback = NULL;    outbuf->context = NULL;    outbuf->written = 0;    use = buf->use;    htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);    xmlFree(outbuf);    ret = buf->use - use;    return (ret);}/** * htmlNodeDump: * @buf:  the HTML buffer output * @doc:  the document * @cur:  the current node * * Dump an HTML node, recursive behaviour,children are printed too, * and formatting returns are added. * * Returns the number of byte written or -1 in case of error */inthtmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {    xmlInitParser();    return(htmlNodeDumpFormat(buf, doc, cur, 1));}/** * htmlNodeDumpFileFormat: * @out:  the FILE pointer * @doc:  the document * @cur:  the current node * @encoding: the document encoding * @format:  should formatting spaces been added * * Dump an HTML node, recursive behaviour,children are printed too. * * TODO: if encoding == NULL try to save in the doc encoding * * returns: the number of byte written or -1 in case of failure. 
*/
int
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
                       xmlNodePtr cur, const char *encoding, int format) {
    xmlCharEncodingHandlerPtr conv = NULL;
    xmlOutputBufferPtr output;

    xmlInitParser();

    /* an explicit encoding other than UTF-8 needs a matching handler */
    if ((encoding != NULL) &&
        (xmlParseCharEncoding(encoding) != XML_CHAR_ENCODING_UTF8)) {
        conv = xmlFindCharEncodingHandler(encoding);
        if (conv == NULL)
            return(-1);
    }

    /*
     * Fallback to HTML or ASCII when the encoding is unspecified
     */
    if (conv == NULL)
        conv = xmlFindCharEncodingHandler("HTML");
    if (conv == NULL)
        conv = xmlFindCharEncodingHandler("ascii");

    /*
     * Wrap the FILE in an output buffer and dump the node through it.
     */
    output = xmlOutputBufferCreateFile(out, conv);
    if (output == NULL)
        return(0);  /* NOTE(review): 0, not -1, on this failure path */

    htmlNodeDumpFormatOutput(output, doc, cur, encoding, format);

    /* the value handed back is whatever closing the buffer reports */
    return(xmlOutputBufferClose(output));
}

/**
 * htmlNodeDumpFile:
 * @out:  the FILE pointer
 * @doc:  the document
 * @cur:  the current node
 *
 * Dump an HTML node, recursive behaviour, children are printed too,
 * and formatting returns are added. No encoding is requested and the
 * result of the dump is discarded.
 */
void
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
    (void) htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
}

/**
 * htmlDocDumpMemory:
 * @cur:  the document
 * @mem:  OUT: the memory pointer
 * @size:  OUT: the memory length
 *
 * Dump an HTML document in memory and return the xmlChar * and it's size.
 * It's up to the caller to free the memory.
*/voidhtmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {    xmlOutputBufferPtr buf;    xmlCharEncodingHandlerPtr handler = NULL;    const char *encoding;    xmlInitParser();    if (cur == NULL) {	*mem = NULL;	*size = 0;	return;    }    encoding = (const char *) htmlGetMetaEncoding(cur);    if (encoding != NULL) {	xmlCharEncoding enc;	enc = xmlParseCharEncoding(encoding);	if (enc != cur->charset) {	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {		/*		 * Not supported yet		 */		*mem = NULL;		*size = 0;		return;	    }	    handler = xmlFindCharEncodingHandler(encoding);	    if (handler == NULL) {		*mem = NULL;		*size = 0;		return;	    }	}    }    /*     * Fallback to HTML or ASCII when the encoding is unspecified     */    if (handler == NULL)	handler = xmlFindCharEncodingHandler("HTML");    if (handler == NULL)	handler = xmlFindCharEncodingHandler("ascii");    buf = xmlAllocOutputBuffer(handler);    if (buf == NULL) {	*mem = NULL;	*size = 0;	return;    }    htmlDocContentDumpOutput(buf, cur, NULL);    xmlOutputBufferFlush(buf);    if (buf->conv != NULL) {	*size = buf->conv->use;	*mem = xmlStrndup(buf->conv->content, *size);    } else {	*size = buf->buffer->use;	*mem = xmlStrndup(buf->buffer->content, *size);    }    (void)xmlOutputBufferClose(buf);}/************************************************************************ *									* *   		Dumping HTML tree content to an I/O output buffer	* *									* ************************************************************************/
12 下一页
💿 文件大小 1527 K
👤 上传用户 qqpp2q
📂 所属分类其他
🏷️ 相关标签

#xml
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -