📄 htmlparser.c
字号:
* Returns the new input stream or NULL */
htmlParserInputPtr
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
    htmlParserInputPtr stream;

    stream = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
    if (stream != NULL) {
        /* Hand back a stream with every field set by this function reset. */
        stream->filename = NULL;
        stream->directory = NULL;
        stream->base = NULL;
        stream->cur = NULL;
        stream->buf = NULL;
        stream->free = NULL;
        stream->line = 1;
        stream->col = 1;
        stream->consumed = 0;
        stream->length = 0;
        return(stream);
    }

    /*
     * Allocation failed: record the error code in the context and report
     * it through the SAX error callback when one is installed.
     */
    ctxt->errNo = XML_ERR_NO_MEMORY;
    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
        ctxt->sax->error(ctxt->userData,
                         "malloc: couldn't allocate a new input stream\n");
    /*
     * The error code is stored a second time after the callback has run.
     * NOTE(review): looks redundant unless a handler modifies errNo; kept
     * so that the observable sequence of stores is unchanged.
     */
    ctxt->errNo = XML_ERR_NO_MEMORY;
    return(NULL);
}

/************************************************************************
 *                                                                      *
 *              Commodity functions, cleanup needed ?                   *
 *                                                                      *
 ************************************************************************/

/**
 * areBlanks:
 * @ctxt:  an HTML parser context
 * @str:  a xmlChar *
 * @len:  the size of @str
 *
 * Decide whether @str is a run of blank characters that can be dropped.
 * The run is ignorable only when all of the following hold:
 *   - every one of the @len characters satisfies IS_BLANK,
 *   - the current input character is '<',
 *   - there is a current node in the context,
 *   - that node has no last child and no content, or its last child is
 *     not a text node.
 *
 * Returns 1 if ignorable 0 otherwise.
 */
static int
areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
    xmlNodePtr last;
    int pos;

    for (pos = 0; pos < len; pos++) {
        if (!(IS_BLANK(str[pos])))
            return(0);
    }

    if (CUR != '<')
        return(0);
    if (ctxt->node == NULL)
        return(0);

    last = xmlGetLastChild(ctxt->node);
    if (last != NULL) {
        /* Blanks following a text child are kept. */
        if (xmlNodeIsText(last))
            return(0);
        return(1);
    }

    /* No children at all: blanks are kept only if the node has content. */
    if (ctxt->node->content != NULL)
        return(0);
    return(1);
}

/**
 * htmlHandleEntity:
 * @ctxt:  an HTML parser context
 * @entity:  an XML entity pointer.
* * Default handling of an HTML entity, call the parser with the * substitution string */voidhtmlHandleEntity(htmlParserCtxtPtr ctxt, xmlEntityPtr entity) { int len; if (entity->content == NULL) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "htmlHandleEntity %s: content == NULL\n", entity->name); ctxt->wellFormed = 0; return; } len = xmlStrlen(entity->content); /* * Just handle the content as a set of chars. */ if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) ctxt->sax->characters(ctxt->userData, entity->content, len);}/** * htmlNewDoc: * @URI: URI for the dtd, or NULL * @ExternalID: the external ID of the DTD, or NULL * * Returns a new document */htmlDocPtrhtmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { xmlDocPtr cur; /* * Allocate a new document and fill the fields. */ cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); if (cur == NULL) { fprintf(stderr, "xmlNewDoc : malloc failed\n"); return(NULL); } memset(cur, 0, sizeof(xmlDoc)); cur->type = XML_HTML_DOCUMENT_NODE; cur->version = NULL; cur->intSubset = NULL; if ((ExternalID == NULL) && (URI == NULL)) xmlCreateIntSubset(cur, BAD_CAST "HTML", BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); else xmlCreateIntSubset(cur, BAD_CAST "HTML", ExternalID, URI); cur->name = NULL; cur->children = NULL; cur->extSubset = NULL; cur->oldNs = NULL; cur->encoding = NULL; cur->standalone = 1; cur->compression = 0; cur->ids = NULL; cur->refs = NULL;#ifndef XML_WITHOUT_CORBA cur->_private = NULL;#endif return(cur);}/************************************************************************ * * * The parser itself * * Relates to http://www.w3.org/TR/html40 * * * ************************************************************************//************************************************************************ * * * The parser itself * * * ************************************************************************//** * 
htmlParseHTMLName: * @ctxt: an HTML parser context * * parse an HTML tag or attribute name, note that we convert it to lowercase * since HTML names are not case-sensitive. * * Returns the Tag Name parsed or NULL */xmlChar *htmlParseHTMLName(htmlParserCtxtPtr ctxt) { xmlChar *ret = NULL; int i = 0; xmlChar loc[HTML_PARSER_BUFFER_SIZE]; if (!IS_LETTER(CUR) && (CUR != '_') && (CUR != ':')) return(NULL); while ((i < HTML_PARSER_BUFFER_SIZE) && ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)))) { if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; else loc[i] = CUR; i++; NEXT; } ret = xmlStrndup(loc, i); return(ret);}/** * htmlParseName: * @ctxt: an HTML parser context * * parse an HTML name, this routine is case sensistive. * * Returns the Name parsed or NULL */xmlChar *htmlParseName(htmlParserCtxtPtr ctxt) { xmlChar buf[HTML_MAX_NAMELEN]; int len = 0; GROW; if (!IS_LETTER(CUR) && (CUR != '_')) { return(NULL); } while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) || (CUR == '.') || (CUR == '-') || (CUR == '_') || (CUR == ':') || (IS_COMBINING(CUR)) || (IS_EXTENDER(CUR))) { buf[len++] = CUR; NEXT; if (len >= HTML_MAX_NAMELEN) { fprintf(stderr, "htmlParseName: reached HTML_MAX_NAMELEN limit\n"); while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) || (CUR == '.') || (CUR == '-') || (CUR == '_') || (CUR == ':') || (IS_COMBINING(CUR)) || (IS_EXTENDER(CUR))) NEXT; break; } } return(xmlStrndup(buf, len));}/** * htmlParseHTMLAttribute: * @ctxt: an HTML parser context * @stop: a char stop value * * parse an HTML attribute value till the stop (quote), if * stop is 0 then it stops at the first space * * Returns the attribute parsed or NULL */xmlChar *htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {#if 0 xmlChar buf[HTML_MAX_NAMELEN]; int len = 0; GROW; while ((CUR != 0) && (CUR != stop) && (CUR != '>')) { if ((stop == 0) && (IS_BLANK(CUR))) break; buf[len++] = CUR; NEXT; if (len >= HTML_MAX_NAMELEN) { fprintf(stderr, "htmlParseHTMLAttribute: reached HTML_MAX_NAMELEN limit\n"); while 
((!IS_BLANK(CUR)) && (CUR != '<') && (CUR != '>') && (CUR != '\'') && (CUR != '"')) NEXT; break; } } return(xmlStrndup(buf, len));#else xmlChar *buffer = NULL; int buffer_size = 0; xmlChar *out = NULL; xmlChar *name = NULL; xmlChar *cur = NULL; htmlEntityDescPtr ent; /* * allocate a translation buffer. */ buffer_size = HTML_PARSER_BIG_BUFFER_SIZE; buffer = (xmlChar *) xmlMalloc(buffer_size * sizeof(xmlChar)); if (buffer == NULL) { perror("htmlParseHTMLAttribute: malloc failed"); return(NULL); } out = buffer; /* * Ok loop until we reach one of the ending chars */ while ((CUR != 0) && (CUR != stop) && (CUR != '>')) { if ((stop == 0) && (IS_BLANK(CUR))) break; if (CUR == '&') { if (NXT(1) == '#') { int val = htmlParseCharRef(ctxt); *out++ = val; } else { ent = htmlParseEntityRef(ctxt, &name); if (name == NULL) { *out++ = '&'; if (out - buffer > buffer_size - 100) { int index = out - buffer; growBuffer(buffer); out = &buffer[index]; } } else if ((ent == NULL) || (ent->value <= 0) || (ent->value >= 255)) { *out++ = '&'; cur = name; while (*cur != 0) { if (out - buffer > buffer_size - 100) { int index = out - buffer; growBuffer(buffer); out = &buffer[index]; } *out++ = *cur++; } xmlFree(name); } else { *out++ = ent->value; if (out - buffer > buffer_size - 100) { int index = out - buffer; growBuffer(buffer); out = &buffer[index]; } xmlFree(name); } } } else { *out++ = CUR; if (out - buffer > buffer_size - 100) { int index = out - buffer; growBuffer(buffer); out = &buffer[index]; } NEXT; } } *out++ = 0; return(buffer);#endif}/** * htmlParseNmtoken: * @ctxt: an HTML parser context * * parse an HTML Nmtoken. 
* * Returns the Nmtoken parsed or NULL */xmlChar *htmlParseNmtoken(htmlParserCtxtPtr ctxt) { xmlChar buf[HTML_MAX_NAMELEN]; int len = 0; GROW; while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) || (CUR == '.') || (CUR == '-') || (CUR == '_') || (CUR == ':') || (IS_COMBINING(CUR)) || (IS_EXTENDER(CUR))) { buf[len++] = CUR; NEXT; if (len >= HTML_MAX_NAMELEN) { fprintf(stderr, "htmlParseNmtoken: reached HTML_MAX_NAMELEN limit\n"); while ((IS_LETTER(CUR)) || (IS_DIGIT(CUR)) || (CUR == '.') || (CUR == '-') || (CUR == '_') || (CUR == ':') || (IS_COMBINING(CUR)) || (IS_EXTENDER(CUR))) NEXT; break; } } return(xmlStrndup(buf, len));}/** * htmlParseEntityRef: * @ctxt: an HTML parser context * @str: location to store the entity name * * parse an HTML ENTITY references * * [68] EntityRef ::= '&' Name ';' * * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, * if non-NULL *str will have to be freed by the caller. */htmlEntityDescPtrhtmlParseEntityRef(htmlParserCtxtPtr ctxt, xmlChar **str) { xmlChar *name; htmlEntityDescPtr ent = NULL; *str = NULL; if (CUR == '&') { NEXT; name = htmlParseName(ctxt); if (name == NULL) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -