📄 htmlparser.cpp
字号:
/*
    Copyright (C) 1997 Martin Jones (mjones@kde.org)
              (C) 1997 Torben Weis (weis@kde.org)
              (C) 1999,2001 Lars Knoll (knoll@kde.org)
              (C) 2000,2001 Dirk Mueller (mueller@kde.org)
    Copyright (C) 2004, 2005, 2006, 2007 Apple Inc. All rights reserved.

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/

#include "config.h"
#include "HTMLParser.h"

#include "CharacterNames.h"
#include "CSSPropertyNames.h"
#include "CSSValueKeywords.h"
#include "Comment.h"
#include "Console.h"
#include "DOMWindow.h"
#include "DocumentFragment.h"
#include "DocumentType.h"
#include "Frame.h"
#include "HTMLBodyElement.h"
#include "HTMLDocument.h"
#include "HTMLDivElement.h"
#include "HTMLDListElement.h"
#include "HTMLElementFactory.h"
#include "HTMLFormElement.h"
#include "HTMLHeadElement.h"
#include "HTMLHRElement.h"
#include "HTMLHtmlElement.h"
#include "HTMLIsIndexElement.h"
#include "HTMLMapElement.h"
#include "HTMLNames.h"
#include "HTMLTableCellElement.h"
#include "HTMLTableRowElement.h"
#include "HTMLTableSectionElement.h"
#include "HTMLTokenizer.h"
#include "LocalizedStrings.h"
#include "Settings.h"
#include "Text.h"
#include <wtf/StdLibExtras.h>

namespace WebCore {

using namespace HTMLNames;

// NOTE(review): neither limit is used in the part of the file visible here;
// their exact role should be confirmed against the code that reads them.
static const unsigned cMaxRedundantTagDepth = 20;
static const unsigned cResidualStyleMaxDepth = 200;

// A singly linked stack entry (linked through |next|) recording a tag name,
// its level, the associated node, and whether this entry holds a reference on
// that node. Presumably these are the entries of HTMLParser::m_blockStack;
// confirm against HTMLParser.h.
struct HTMLStackElem : Noncopyable {
    HTMLStackElem(const AtomicString& t, int lvl, Node* n, bool r, HTMLStackElem* nx)
        : tagName(t)
        , level(lvl)
        , strayTableContent(false)
        , node(n)
        , didRefNode(r)
        , next(nx)
    {
    }

    // Releases the reference on |node|, but only if this entry took one.
    void derefNode()
    {
        if (didRefNode)
            node->deref();
    }

    AtomicString tagName;
    int level;
    bool strayTableContent;
    Node* node;
    bool didRefNode;
    HTMLStackElem* next;
};

/**
 * The parser parses tokenized input into the document, building up the
 * document tree. If the document is well-formed, parsing it is straightforward.
 *
 * Unfortunately, we have to handle many HTML documents that are not well-formed,
 * so the parser has to be tolerant about errors.
 *
 * We have to take care of at least the following error conditions:
 *
 * 1. The element being added is explicitly forbidden inside some outer tag.
 *    In this case we should close all tags up to the one, which forbids
 *    the element, and add it afterwards.
 *
 * 2. We are not allowed to add the element directly. It could be that
 *    the person writing the document forgot some tag in between (or that the
 *    tag in between is optional). This could be the case with the following
 *    tags: HTML HEAD BODY TBODY TR TD LI (did I forget any?).
 *
 * 3. We want to add a block element inside to an inline element. Close all
 *    inline elements up to the next higher block element.
 *
 * 4. If this doesn't help, close elements until we are allowed to add the
 *    element or ignore the tag.
 *
 */

// Creates a parser that builds directly into |doc|. The document itself is the
// initial current node and is not ref'ed (m_didRefCurrent is false).
HTMLParser::HTMLParser(HTMLDocument* doc, bool reportErrors)
    : m_document(doc)
    , m_current(doc)
    , m_didRefCurrent(false)
    , m_blockStack(0)
    , m_hasPElementInScope(NotInScope)
    , m_head(0)
    , m_inBody(false)
    , m_haveContent(false)
    , m_haveFrameSet(false)
    , m_isParsingFragment(false)
    , m_reportErrors(reportErrors)
    , m_handlingResidualStyleAcrossBlocks(false)
    , m_inStrayTableContent(0)
{
}

// Creates a parser that builds into the document fragment |frag|. Parsing
// starts as if already inside the body, and error reporting is disabled.
// |frag| must not be null.
HTMLParser::HTMLParser(DocumentFragment* frag)
    : m_document(frag->document())
    , m_current(frag)
    , m_didRefCurrent(true)
    , m_blockStack(0)
    , m_hasPElementInScope(NotInScope)
    , m_head(0)
    , m_inBody(true)
    , m_haveContent(false)
    , m_haveFrameSet(false)
    , m_isParsingFragment(true)
    , m_reportErrors(false)
    , m_handlingResidualStyleAcrossBlocks(false)
    , m_inStrayTableContent(0)
{
    // |frag| has already been dereferenced by the initializer list above and
    // m_didRefCurrent is unconditionally true, so a null check here would be
    // dead code. State the precondition explicitly and always take the
    // reference that setCurrent() / the destructor will later release.
    ASSERT(frag);
    frag->ref();
}

// Frees the block stack and drops the reference on the current node if one is held.
HTMLParser::~HTMLParser()
{
    freeBlock();
    if (m_didRefCurrent)
        m_current->deref();
}

// Resets the parser state listed below so that the document is the current
// node again. Must not be called on a fragment parser.
void HTMLParser::reset()
{
    ASSERT(!m_isParsingFragment);

    setCurrent(m_document);

    freeBlock();

    m_inBody = false;
    m_haveFrameSet = false;
    m_haveContent = false;
    m_inStrayTableContent = 0;

    m_currentFormElement = 0;
    m_currentMapElement = 0;
    m_head = 0;
    m_isindexElement = 0;

    m_skipModeTag = nullAtom;
}

// Makes |newCurrent| the current node. A reference is taken on the new node
// unless it is null or the document itself; the new node is ref'ed before the
// reference on the previous current node (if any was held) is released.
void HTMLParser::setCurrent(Node* newCurrent)
{
    bool didRefNewCurrent = newCurrent && newCurrent != m_document;
    if (didRefNewCurrent)
        newCurrent->ref();
    if (m_didRefCurrent)
        m_current->deref();
    m_current = newCurrent;
    m_didRefCurrent = didRefNewCurrent;
}

// Processes one token from the tokenizer.
//
// Returns the node that was inserted for the token (for text tokens, the last
// Text node created), or 0 when nothing was inserted: the token was skipped
// because of skip mode, it was a close tag (handled by processCloseTag()),
// getNode() produced no node, or insertNode() rejected the node.
PassRefPtr<Node> HTMLParser::parseToken(Token* t)
{
    if (!m_skipModeTag.isNull()) {
        if (!t->beginTag && t->tagName == m_skipModeTag)
            // Found the end tag for the current skip mode, so we're done skipping.
            m_skipModeTag = nullAtom;
        else if (m_current->localName() == t->tagName)
            // Do not skip </iframe>.
            // FIXME: What does that comment mean? How can it be right to parse a token without clearing m_skipModeTag?
            ;
        else
            return 0;
    }

    // Apparently some sites use </br> instead of <br>. Be compatible with IE and Firefox and treat this like <br>.
    if (t->isCloseTag(brTag) && m_document->inCompatMode()) {
        reportError(MalformedBRError);
        t->beginTag = true;
    }

    if (!t->beginTag) {
        processCloseTag(t);
        return 0;
    }

    // Ignore spaces, if we're not inside a paragraph or other inline code.
    // Do not alter the text if it is part of a scriptTag.
    if (t->tagName == textAtom && t->text && m_current->localName() != scriptTag) {
        if (m_inBody && !skipMode() && m_current->localName() != styleTag &&
            m_current->localName() != titleTag && !t->text->containsOnlyWhitespace())
            m_haveContent = true;

        RefPtr<Node> n;
        String text = t->text.get();
        unsigned charsLeft = text.length();
        while (charsLeft) {
            // split large blocks of text to nodes of manageable size
            n = Text::createWithLengthLimit(m_document, text, charsLeft);
            if (!insertNode(n.get(), t->selfClosingTag))
                return 0;
        }
        return n;
    }

    RefPtr<Node> n = getNode(t);
    // just to be sure, and to catch currently unimplemented stuff
    if (!n)
        return 0;

    // set attributes
    if (n->isHTMLElement()) {
        HTMLElement* e = static_cast<HTMLElement*>(n.get());
        e->setAttributeMap(t->attrs.get());

        // take care of optional close tags
        if (e->endTagRequirement() == TagStatusOptional)
            popBlock(t->tagName);

        // If the node does not have a forbidden end tag requirement, and if the broken XML self-closing
        // syntax was used, report an error.
        if (t->brokenXMLStyle && e->endTagRequirement() != TagStatusForbidden) {
            if (t->tagName == scriptTag)
                reportError(IncorrectXMLCloseScriptWarning);
            else
                reportError(IncorrectXMLSelfCloseError, &t->tagName);
        }
    }

    if (!insertNode(n.get(), t->selfClosingTag)) {
        // we couldn't insert the node

        if (n->isElementNode()) {
            Element* e = static_cast<Element*>(n.get());
            e->setAttributeMap(0);
        }

        // Forget any parser-side pointers to the node that was just rejected.
        if (m_currentMapElement == n)
            m_currentMapElement = 0;

        if (m_currentFormElement == n)
            m_currentFormElement = 0;

        if (m_head == n)
            m_head = 0;

        return 0;
    }
    return n;
}

// Adds a DocumentType node built from |t| to the document, unless the document
// already has a doctype, a fragment is being parsed, or the current node is no
// longer the document itself.
void HTMLParser::parseDoctypeToken(DoctypeToken* t)
{
    // Ignore any doctype after the first.  Ignore doctypes in fragments.
    if (m_document->doctype() || m_isParsingFragment || m_current != m_document)
        return;

    // Make a new doctype node and set it as our doctype.
    m_document->addChild(DocumentType::create(m_document, String::adopt(t->m_name), String::adopt(t->m_publicID), String::adopt(t->m_systemID)));
}

// True if |n| is a <tbody>, <tfoot> or <thead> element.
static bool isTableSection(const Node* n)
{
    return n->hasTagName(tbodyTag) || n->hasTagName(tfootTag) || n->hasTagName(theadTag);
}

// True if |n| is a <tr>, <td>, <th> or a table section element.
static bool isTablePart(const Node* n)
{
    return n->hasTagName(trTag) || n->hasTagName(tdTag) || n->hasTagName(thTag) ||
           isTableSection(n);
}

// True if |n| is a <table> or any table part element.
static bool isTableRelated(const Node* n)
{
    return n->hasTagName(tableTag) || isTablePart(n);
}

// True if |tagName| is one of the tag names listed below.
static bool isScopingTag(const AtomicString& tagName)
{
    return tagName == appletTag || tagName == captionTag || tagName == tdTag || tagName == thTag ||
           tagName == buttonTag || tagName == marqueeTag || tagName == objectTag || tagName == tableTag ||
           tagName == htmlTag;
}

bool HTMLParser::insertNode(Node* n, bool flat)
{
    RefPtr<Node> protectNode(n);

    const AtomicString& localName = n->localName();
    int tagPriority = n->isHTMLElement() ? static_cast<HTMLElement*>(n)->tagPriority() : 0;

    // <table> is never allowed inside stray table content.  Always pop out of the stray table content
    // and close up the first table, and then start the second table as a sibling.
    if (m_inStrayTableContent && localName == tableTag)
        popBlock(tableTag);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -