📄 htmltree.c.svn-base
字号:
/* * HTMLtree.c : implementation of access function for an HTML tree. * * See Copyright for the status of this software. * * daniel@veillard.com */#define IN_LIBXML#include "libxml.h"#ifdef LIBXML_HTML_ENABLED#include <string.h> /* for memset() only ! */#ifdef HAVE_CTYPE_H#include <ctype.h>#endif#ifdef HAVE_STDLIB_H#include <stdlib.h>#endif#include <libxml/xmlmemory.h>#include <libxml/HTMLparser.h>#include <libxml/HTMLtree.h>#include <libxml/entities.h>#include <libxml/valid.h>#include <libxml/xmlerror.h>#include <libxml/parserInternals.h>#include <libxml/globals.h>#include <libxml/uri.h>/************************************************************************ * * * Getting/Setting encoding meta tags * * * ************************************************************************//** * htmlGetMetaEncoding: * @doc: the document * * Encoding definition lookup in the Meta tags * * Returns the current encoding as flagged in the HTML source */const xmlChar *htmlGetMetaEncoding(htmlDocPtr doc) { htmlNodePtr cur; const xmlChar *content; const xmlChar *encoding; if (doc == NULL) return(NULL); cur = doc->children; /* * Search the html */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrEqual(cur->name, BAD_CAST"html")) break; if (xmlStrEqual(cur->name, BAD_CAST"head")) goto found_head; if (xmlStrEqual(cur->name, BAD_CAST"meta")) goto found_meta; } cur = cur->next; } if (cur == NULL) return(NULL); cur = cur->children; /* * Search the head */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrEqual(cur->name, BAD_CAST"head")) break; if (xmlStrEqual(cur->name, BAD_CAST"meta")) goto found_meta; } cur = cur->next; } if (cur == NULL) return(NULL);found_head: cur = cur->children; /* * Search the meta elements */found_meta: while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrEqual(cur->name, BAD_CAST"meta")) { xmlAttrPtr attr = cur->properties; int http; const xmlChar *value; content = NULL; http = 0; while (attr != NULL) { if ((attr->children != NULL) && (attr->children->type == XML_TEXT_NODE) && (attr->children->next == NULL)) { value = attr->children->content; if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) http = 1; else if ((value != NULL) && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) content = value; if ((http != 0) && (content != NULL)) goto found_content; } attr = attr->next; } } } cur = cur->next; } return(NULL);found_content: encoding = xmlStrstr(content, BAD_CAST"charset="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"Charset="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"CHARSET="); if (encoding != NULL) { encoding += 8; } else { encoding = xmlStrstr(content, BAD_CAST"charset ="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"Charset ="); if (encoding == NULL) encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); if (encoding != NULL) encoding += 9; } if (encoding != NULL) { while ((*encoding == ' ') || (*encoding == '\t')) encoding++; } return(encoding);}/** * htmlSetMetaEncoding: * @doc: the document * @encoding: the encoding string * * Sets the current encoding in the Meta tags * NOTE: this will not change the document content encoding, just * the META flag associated. * * Returns 0 in case of success and -1 in case of error */inthtmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { htmlNodePtr cur, meta; const xmlChar *content; char newcontent[100]; if (doc == NULL) return(-1); if (encoding != NULL) { _snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", encoding); newcontent[sizeof(newcontent) - 1] = 0; } cur = doc->children; /* * Search the html */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) break; if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) goto found_head; if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) goto found_meta; } cur = cur->next; } if (cur == NULL) return(-1); cur = cur->children; /* * Search the head */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) break; if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) goto found_meta; } cur = cur->next; } if (cur == NULL) return(-1);found_head: if (cur->children == NULL) { if (encoding == NULL) return(0); meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); xmlAddChild(cur, meta); xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); return(0); } cur = cur->children;found_meta: if (encoding != NULL) { /* * Create a new Meta element with the right attributes */ meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); xmlAddPrevSibling(cur, meta); xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); } /* * Search and destroy all the remaining the meta elements carrying * encoding informations */ while (cur != NULL) { if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { xmlAttrPtr attr = cur->properties; int http; const xmlChar *value; content = NULL; http = 0; while (attr != NULL) { if ((attr->children != NULL) && (attr->children->type == XML_TEXT_NODE) && (attr->children->next == NULL)) { value = attr->children->content; if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) http = 1; else { if ((value != NULL) && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) content = value; } if ((http != 0) && (content != NULL)) break; } attr = attr->next; } if ((http != 0) && (content != NULL)) { meta = cur; cur = cur->next; xmlUnlinkNode(meta); xmlFreeNode(meta); continue; } } } cur = cur->next; } return(0);}/** * booleanHTMLAttrs: * * These are the HTML attributes which will be output * in minimized form, i.e. <option selected="selected"> will be * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" * */static const char* htmlBooleanAttrs[] = { "checked", "compact", "declare", "defer", "disabled", "ismap", "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", "selected", NULL};/** * htmlIsBooleanAttr: * @name: the name of the attribute to check * * Determine if a given attribute is a boolean attribute. * * returns: false if the attribute is not boolean, true otherwise. */inthtmlIsBooleanAttr(const xmlChar *name){ int i = 0; while (htmlBooleanAttrs[i] != NULL) { if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) return 1; i++; } return 0;}#ifdef LIBXML_OUTPUT_ENABLED/************************************************************************ * * * Output error handlers * * * ************************************************************************//** * htmlSaveErrMemory: * @extra: extra informations * * Handle an out of memory condition */static voidhtmlSaveErrMemory(const char *extra){ __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);}/** * htmlSaveErr: * @code: the error number * @node: the location of the error. * @extra: extra informations * * Handle an out of memory condition */static voidhtmlSaveErr(int code, xmlNodePtr node, const char *extra){ const char *msg = NULL; switch(code) { case XML_SAVE_NOT_UTF8: msg = "string is not in UTF-8"; break; case XML_SAVE_CHAR_INVALID: msg = "invalid character value"; break; case XML_SAVE_UNKNOWN_ENCODING: msg = "unknown encoding %s"; break; case XML_SAVE_NO_DOCTYPE: msg = "HTML has no DOCTYPE"; break; default: msg = "unexpected error number"; } __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);}/************************************************************************ * * * Dumping HTML tree content to a simple buffer * * * ************************************************************************/static inthtmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, int format);/** * htmlNodeDumpFormat: * @buf: the HTML buffer output * @doc: the document * @cur: the current node * @format: should formatting spaces been added * * Dump an HTML node, recursive behaviour,children are printed too. * * Returns the number of byte written or -1 in case of error */static inthtmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, int format) { unsigned int use; int ret; xmlOutputBufferPtr outbuf; if (cur == NULL) { return (-1); } if (buf == NULL) { return (-1); } outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); if (outbuf == NULL) { htmlSaveErrMemory("allocating HTML output buffer"); return (-1); } memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); outbuf->buffer = buf; outbuf->encoder = NULL; outbuf->writecallback = NULL; outbuf->closecallback = NULL; outbuf->context = NULL; outbuf->written = 0; use = buf->use; htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); xmlFree(outbuf); ret = buf->use - use; return (ret);}/** * htmlNodeDump: * @buf: the HTML buffer output * @doc: the document * @cur: the current node * * Dump an HTML node, recursive behaviour,children are printed too, * and formatting returns are added. * * Returns the number of byte written or -1 in case of error */inthtmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { xmlInitParser(); return(htmlNodeDumpFormat(buf, doc, cur, 1));}/** * htmlNodeDumpFileFormat: * @out: the FILE pointer * @doc: the document * @cur: the current node * @encoding: the document encoding * @format: should formatting spaces been added * * Dump an HTML node, recursive behaviour,children are printed too. * * TODO: if encoding == NULL try to save in the doc encoding * * returns: the number of byte written or -1 in case of failure. */inthtmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, xmlNodePtr cur, const char *encoding, int format) { xmlOutputBufferPtr buf; xmlCharEncodingHandlerPtr handler = NULL; int ret; xmlInitParser(); if (encoding != NULL) { xmlCharEncoding enc; enc = xmlParseCharEncoding(encoding); if (enc != XML_CHAR_ENCODING_UTF8) { handler = xmlFindCharEncodingHandler(encoding); if (handler == NULL) return(-1); } } /* * Fallback to HTML or ASCII when the encoding is unspecified */ if (handler == NULL) handler = xmlFindCharEncodingHandler("HTML"); if (handler == NULL) handler = xmlFindCharEncodingHandler("ascii"); /* * save the content to a temp buffer. */ buf = xmlOutputBufferCreateFile(out, handler); if (buf == NULL) return(0); htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); ret = xmlOutputBufferClose(buf); return(ret);}/** * htmlNodeDumpFile: * @out: the FILE pointer * @doc: the document * @cur: the current node * * Dump an HTML node, recursive behaviour,children are printed too, * and formatting returns are added. */voidhtmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);}/** * htmlDocDumpMemory: * @cur: the document * @mem: OUT: the memory pointer * @size: OUT: the memory length * * Dump an HTML document in memory and return the xmlChar * and it's size. * It's up to the caller to free the memory. */voidhtmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { xmlOutputBufferPtr buf; xmlCharEncodingHandlerPtr handler = NULL; const char *encoding; xmlInitParser(); if (cur == NULL) { *mem = NULL; *size = 0; return; } encoding = (const char *) htmlGetMetaEncoding(cur); if (encoding != NULL) { xmlCharEncoding enc; enc = xmlParseCharEncoding(encoding); if (enc != cur->charset) { if (cur->charset != XML_CHAR_ENCODING_UTF8) { /* * Not supported yet */ *mem = NULL; *size = 0; return; } handler = xmlFindCharEncodingHandler(encoding); if (handler == NULL) { *mem = NULL; *size = 0; return; } } } /* * Fallback to HTML or ASCII when the encoding is unspecified */ if (handler == NULL) handler = xmlFindCharEncodingHandler("HTML"); if (handler == NULL) handler = xmlFindCharEncodingHandler("ascii"); buf = xmlAllocOutputBuffer(handler); if (buf == NULL) { *mem = NULL; *size = 0; return; } htmlDocContentDumpOutput(buf, cur, NULL); xmlOutputBufferFlush(buf); if (buf->conv != NULL) { *size = buf->conv->use; *mem = xmlStrndup(buf->conv->content, *size); } else { *size = buf->buffer->use; *mem = xmlStrndup(buf->buffer->content, *size); } (void)xmlOutputBufferClose(buf);}/************************************************************************ * * * Dumping HTML tree content to an I/O output buffer * * * ************************************************************************/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -