📄 htmlparser.cpp
字号:
/*
This file is part of the KDE libraries
Copyright (C) 1997 Martin Jones (mjones@kde.org)
(C) 1997 Torben Weis (weis@kde.org)
(C) 1999,2001 Lars Knoll (knoll@kde.org)
(C) 2000,2001 Dirk Mueller (mueller@kde.org)
Copyright (C) 2004 Apple Computer, Inc.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
*/
//----------------------------------------------------------------------------
//
// KDE HTML Widget -- HTML Parser
//#define PARSER_DEBUG
#include "html/htmlparser.h"
#include "dom/dom_exception.h"
#include "html/html_baseimpl.h"
#include "html/html_blockimpl.h"
#include "html/html_canvasimpl.h"
#include "html/html_documentimpl.h"
#include "html/html_elementimpl.h"
#include "html/html_formimpl.h"
#include "html/html_headimpl.h"
#include "html/html_imageimpl.h"
#include "html/html_inlineimpl.h"
#include "html/html_listimpl.h"
#include "html/html_miscimpl.h"
#include "html/html_tableimpl.h"
#include "html/html_objectimpl.h"
#include "xml/dom_textimpl.h"
#include "xml/dom_nodeimpl.h"
#include "misc/htmlhashes.h"
#include "html/htmltokenizer.h"
#include "khtmlview.h"
#include "khtml_part.h"
#include "css/cssproperties.h"
#include "css/cssvalues.h"
#include "rendering/render_object.h"
#include <kdebug.h>
#include <klocale.h>
using namespace DOM;
using namespace khtml;
//----------------------------------------------------------------------------
/**
* @internal
*/
// One entry of a singly linked stack (linked through `next`). Each entry
// holds a reference on `node` for as long as the entry lives: the reference
// is taken in the constructor and released in the destructor.
//
// `id` and `level` are stored verbatim; presumably they are the tag id and
// the tag priority handed to pushBlock() -- TODO confirm against pushBlock().
//
// NOTE(review): no copy constructor / assignment operator is declared, so a
// copy would duplicate `node` without taking a reference of its own. Entries
// should only be handled through pointers.
class HTMLStackElem
OOM_MODIFIED
{
public:
    // Creates an entry on top of `_next` and takes a reference on `_node`
    // (which may be null).
    HTMLStackElem( int _id,
                   int _level,
                   DOM::NodeImpl *_node,
                   HTMLStackElem * _next
        )
        :
        id(_id),
        level(_level),
        strayTableContent(false),
        node(_node),
        next(_next)
        { if(node) node->ref(); }

    // Releases the reference held on `node`, if any.
    ~HTMLStackElem() { if(node) node->deref(); }

    // Replaces the referenced node with `_node` (which may be null).
    //
    // The new node is ref'ed BEFORE the old one is released. If the old node
    // were released first, calling setNode() with the node that is already
    // stored -- while this entry holds its last reference -- could destroy
    // the node and then ref() a dangling pointer.
    void setNode( DOM::NodeImpl *_node )
    {
        if(_node) _node->ref();
        if(node) node->deref();
        node = _node;
    }

    int id;
    int level;
    bool strayTableContent;   // starts out false; never changed inside this class
    NodeImpl *node;           // ref'ed while stored here; may be null
    HTMLStackElem *next;      // next entry further down the stack, or null
};
/**
* @internal
*
* The parser parses tokenized input into the document, building up the
* document tree. If the document is wellformed, parsing it is
* straightforward.
* Unfortunately, people can't write wellformed HTML documents, so the parser
* has to be tolerant about errors.
*
* We have to take care of the following error conditions:
* 1. The element being added is explicitly forbidden inside some outer tag.
* In this case we should close all tags up to the one that forbids
* the element, and add it afterwards.
* 2. We are not allowed to add the element directly. It could be that
* the person writing the document forgot some tag in between (or that the
* tag in between is optional...). This could be the case with the following
* tags: HTML HEAD BODY TBODY TR TD LI (did I forget any?)
* 3. We want to add a block element inside an inline element. Close all
* inline elements up to the next higher block element.
* 4. If this doesn't help, close elements until we are allowed to add the
* element, or ignore the tag.
*
*/
// Builds a parser that is attached to a view.
//
// _parent          view stored in HTMLWidget
// doc              document handle; the parser takes a reference on it here
//                  and gives it back in the destructor
// includesComments stored in includesCommentsInDOM
KHTMLParser::KHTMLParser(KHTMLView *_parent, DocumentPtr *doc, bool includesComments)
    : current(0), currentIsReferenced(false), includesCommentsInDOM(includesComments)
{
#if SPEED_DEBUG > 0
    qt.start();
#endif
    // The document handle and the (empty) block stack must be in place
    // before reset() runs: reset() calls doc() and freeBlock().
    document = doc;
    document->ref();
    blockStack = 0;

    HTMLWidget = _parent;

    reset();
}
// Builds a parser that inserts into a document fragment instead of a
// document shown in a view.
//
// i                fragment that becomes the initial current node
// doc              document handle; the parser takes a reference on it here
//                  and gives it back in the destructor
// includesComments stored in includesCommentsInDOM
KHTMLParser::KHTMLParser(DOM::DocumentFragmentImpl *i, DocumentPtr *doc, bool includesComments)
    : current(0), currentIsReferenced(false), includesCommentsInDOM(includesComments)
{
    // The document handle and the (empty) block stack must be in place
    // before reset() runs: reset() calls doc() and freeBlock().
    document = doc;
    document->ref();
    blockStack = 0;

    // No view is associated with a fragment parser.
    HTMLWidget = 0;

    reset();

    // Override what reset() just established: start out in body mode with
    // the fragment as the current node.
    inBody = true;
    setCurrent(i);
}
// Releases everything the parser still holds on to. The statement order is
// significant and must not be changed casually.
KHTMLParser::~KHTMLParser()
{
#if SPEED_DEBUG > 0
kdDebug( ) << "TIME: parsing time was = " << qt.elapsed() << endl;
#endif
// freeBlock() is defined elsewhere; presumably it discards every entry that
// is still on the block stack -- TODO confirm.
freeBlock();
// Drop the reference held on the current node, if one is held (see setCurrent()).
setCurrent(0);
// Balances the document->ref() taken in the constructors.
document->deref();
// Release the node kept in isindex, if any.
if (isindex)
isindex->deref();
}
// Puts the parser back into its initial, nothing-parsed-yet state.
// Called from both constructors.
void KHTMLParser::reset()
{
    // Make the document the current node, then clear the block stack.
    // (freeBlock() is defined elsewhere; the relative order of these two
    // calls is kept on purpose.)
    setCurrent(doc());
    freeBlock();

    // before parsing, no tags are forbidden
    memset(forbiddenTag, 0, sizeof(forbiddenTag));

    // Forget the elements remembered during a previous run.
    // NOTE(review): isindex is cleared without a deref(), while the
    // destructor derefs it when non-null -- confirm reset() is never called
    // while isindex still holds a node.
    head = 0;
    form = 0;
    map = 0;
    isindex = 0;

    // Parse-state flags.
    end = false;
    inSelect = false;
    inBody = false;
    haveContent = false;
    haveFrameSet = false;

    // Counters / pending ids.
    inStrayTableContent = 0;
    discard_until = 0;
}
// Makes newCurrent (may be null) the node new content is inserted under.
//
// A reference is taken on every non-null node except the document itself;
// currentIsReferenced records whether one was taken so that exactly that
// reference is released when the current node is replaced later.
void KHTMLParser::setCurrent(DOM::NodeImpl *newCurrent)
{
    const bool takeReference = (newCurrent != 0) && (newCurrent != doc());

    // Acquire the new reference before giving up the old one, so that
    // passing the node that is already current cannot destroy it.
    if (takeReference)
        newCurrent->ref();
    if (currentIsReferenced && current)
        current->deref();

    currentIsReferenced = takeReference;
    current = newCurrent;
}
// Handles one token: close tags are forwarded to processCloseTag(); for
// everything else a node is created through getElement() and inserted with
// insertNode(), after popping blocks that the new tag implicitly closes or
// that forbid it.
void KHTMLParser::parseToken(Token *t)
{
    // While discard_until is set, tokens are thrown away until the token
    // with that id shows up.
    if (discard_until) {
        const bool reachedTerminator = (t->id == discard_until);
        if (reachedTerminator)
            discard_until = 0;
        // The terminating token is thrown away as well, unless it is the
        // close tag of the current element (do not skip </iframe>).
        if (!reachedTerminator || t->id != current->id() + ID_CLOSE_TAG)
            return;
    }

#ifdef PARSER_DEBUG
    kdDebug( 6035 ) << "\n\n==> parser: processing token " << getTagName(t->id).string() << "(" << t->id << ")"
                    << " current = " << getTagName(current->id()).string() << "(" << current->id() << ")" << endl;
    kdDebug(6035) << " inBody=" << inBody << " haveFrameSet=" << haveFrameSet << endl;
#endif

    // Some sites write </br> where they mean <br>. In compat mode treat it
    // as <br>, to be compatible with IE and NS.
    if (t->id == ID_BR + ID_CLOSE_TAG && doc()->inCompatMode())
        t->id = ID_BR;

    if (t->id > ID_CLOSE_TAG) {
        processCloseTag(t);
        return;
    }

    // Text inside the body that is more than whitespace counts as content,
    // unless we are skipping or the text belongs to <style>, <title> or
    // <script>.
    if (t->id == ID_TEXT && t->text) {
        if (inBody && !skipMode()) {
            switch (current->id()) {
            case ID_STYLE:
            case ID_TITLE:
            case ID_SCRIPT:
                break;
            default:
                if (!t->text->containsOnlyWhitespace())
                    haveContent = true;
                break;
            }
        }
#ifdef PARSER_DEBUG
        kdDebug(6035) << "length="<< t->text->l << " text='" << QConstString(t->text->s, t->text->l).string() << "'" << endl;
#endif
    }

    NodeImpl *n = getElement(t);
    // just to be sure, and to catch currently unimplemented stuff
    if (!n)
        return;

    // Keeps the node referenced until this function returns.
    Node protectNode(n);

    if (n->isElementNode()) {
        ElementImpl *element = static_cast<ElementImpl *>(n);
        // set attributes
        element->setAttributeMap(t->attrs);

        // take care of optional close tags
        if (endTagRequirement(element->id()) == DOM::OPTIONAL)
            popBlock(t->id);

        // Do not allow two header tags to be nested if the intervening tags are inlines.
        if (isHeaderTag(t->id))
            popNestedHeaderTag();
    }

    // if this tag is forbidden inside the current context, pop
    // blocks until we are allowed to add it...
    while (blockStack && t->id <= ID_LAST_TAG && forbiddenTag[t->id]) {
#ifdef PARSER_DEBUG
        kdDebug( 6035 ) << "t->id: " << t->id << " is forbidden :-( " << endl;
#endif
        popOneBlock();
    }

    if (insertNode(n, t->flat))
        return;

    // we couldn't insert the node...
    if (n->isElementNode())
        static_cast<ElementImpl *>(n)->setAttributeMap(0);

#ifdef PARSER_DEBUG
    kdDebug( 6035 ) << "insertNode failed current=" << current->id() << ", new=" << n->id() << "!" << endl;
#endif

    // The rejected node must not stay registered as the current map / form.
    if (map == n) {
#ifdef PARSER_DEBUG
        kdDebug( 6035 ) << " --> resetting map!" << endl;
#endif
        map = 0;
    }
    if (form == n) {
#ifdef PARSER_DEBUG
        kdDebug( 6035 ) << " --> resetting form!" << endl;
#endif
        form = 0;
    }
}
// Returns true when id is one of TABLE, THEAD, TBODY, TFOOT, TR, TH or TD.
// insertNode() uses this to find the nearest enclosing table-related element
// when handling stray table content.
static bool isTableRelatedTag(int id)
{
    switch (id) {
    case ID_TABLE:
    case ID_THEAD:
    case ID_TBODY:
    case ID_TFOOT:
    case ID_TR:
    case ID_TH:
    case ID_TD:
        return true;
    default:
        return false;
    }
}
bool KHTMLParser::insertNode(NodeImpl *n, bool flat)
{
Node protectNode(n);
int id = n->id();
// let's be stupid and just try to insert it.
// this should work if the document is wellformed
#ifdef PARSER_DEBUG
NodeImpl *tmp = current;
#endif
NodeImpl *newNode = current->addChild(n);
if ( newNode ) {
#ifdef PARSER_DEBUG
kdDebug( 6035 ) << "added " << n->nodeName().string() << " to " << tmp->nodeName().string() << ", new current=" << newNode->nodeName().string() << endl;
#endif
// don't push elements without end tag on the stack
if(tagPriority(id) != 0 && !flat)
{
pushBlock(id, tagPriority(id));
if (newNode == current)
popBlock(id);
else
setCurrent(newNode);
#if SPEED_DEBUG < 2
if(!n->attached() && HTMLWidget)
n->attach();
#endif
}
else {
#if SPEED_DEBUG < 2
if(!n->attached() && HTMLWidget)
n->attach();
if (n->maintainsState()) {
doc()->registerMaintainsState(n);
QStringList &states = doc()->restoreState();
if (!states.isEmpty())
n->restoreState(states);
}
n->closeRenderer();
#endif
}
return true;
} else {
#ifdef PARSER_DEBUG
kdDebug( 6035 ) << "ADDING NODE FAILED!!!! current = " << current->nodeName().string() << ", new = " << n->nodeName().string() << endl;
#endif
// error handling...
HTMLElementImpl *e;
bool handled = false;
// switch according to the element to insert
switch(id)
{
case ID_TR:
case ID_TH:
case ID_TD:
if (inStrayTableContent && !isTableRelatedTag(current->id())) {
// pop out to the nearest enclosing table-related tag.
while (blockStack && !isTableRelatedTag(current->id()))
popOneBlock();
return insertNode(n);
}
break;
case ID_COMMENT:
break;
case ID_HEAD:
// ### alllow not having <HTML> in at all, as per HTML spec
if (!current->isDocumentNode() && current->id() != ID_HTML )
return false;
break;
// We can deal with a base, meta and link element in the body, by just adding the element to head.
case ID_META:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -