// htmltokenizer.cpp
/*
This file is part of the KDE libraries
Copyright (C) 1997 Martin Jones (mjones@kde.org)
(C) 1997 Torben Weis (weis@kde.org)
(C) 1998 Waldo Bastian (bastian@kde.org)
(C) 1999 Lars Knoll (knoll@kde.org)
(C) 1999 Antti Koivisto (koivisto@kde.org)
(C) 2001 Dirk Mueller (mueller@kde.org)
Copyright (C) 2004 Apple Computer, Inc.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
*/
//----------------------------------------------------------------------------
//
// KDE HTML Widget - Tokenizers
//#define TOKEN_DEBUG 1
//#define TOKEN_DEBUG 2
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
//#include <string.h>
#include "html/htmltokenizer.h"
#include "html/html_documentimpl.h"
#include "html/htmlparser.h"
#include "html/dtd.h"
#include "misc/loader.h"
#include "misc/htmlhashes.h"
#include "khtmlview.h"
#include "khtml_part.h"
#include "xml/dom_docimpl.h"
#include "dom/dom_doc.h"
#include "css/csshelper.h"
#include "ecma/kjs_proxy.h"
#include <kcharsets.h>
#include <kglobal.h>
#include <ctype.h>
#include <assert.h>
#include <qvariant.h>
#include <kdebug.h>
#include <stdlib.h>
using DOM::AtomicString;
using DOM::AttributeImpl;
using DOM::DOMString;
using DOM::DOMStringImpl;
using DOM::DocumentImpl;
using DOM::FORBIDDEN;
using DOM::Node;
using DOM::emptyAtom;
using DOM::endTagRequirement;
// turn off inlining to avoid warnings with newer gcc
#undef __inline
#define __inline
#include "kentities.c"
#undef __inline
// #define INSTRUMENT_LAYOUT_SCHEDULING 1
// Tokenizer tuning constants; the values are selected at compile time by NOKIA_CHANGES.
// TOKENIZER_TIME_DELAY is expressed in milliseconds (see the FIXME below).
// NOTE(review): the unit of TOKENIZER_CHUNK_SIZE is not visible in this part of the
// file -- presumably a count of characters processed between yield checks; confirm
// at the point of use.
#if NOKIA_CHANGES
#define TOKENIZER_CHUNK_SIZE 256
#define TOKENIZER_TIME_DELAY 300
#else
#define TOKENIZER_CHUNK_SIZE 4096
// FIXME: We would like this constant to be 200ms. Yielding more aggressively results in increased
// responsiveness and better incremental rendering. It slows down overall page-load on slower machines,
// though, so for now we set a value of 500.
#define TOKENIZER_TIME_DELAY 500
#endif
// RETURN_IF_OOM(expr)
//   - NOKIA_CHANGES && __OOM__ builds: evaluates expr and returns from the
//     enclosing function (which must return void) when the result is false/null.
//   - all other builds: simply evaluates expr.
// The argument is parenthesized before negation so that a compound expression,
// e.g. RETURN_IF_OOM(a || b), tests the whole expression instead of only `a`.
// The OOM variant keeps its trailing ';', so existing call sites written with
// or without their own semicolon continue to compile.
#if NOKIA_CHANGES && __OOM__
#include <allocs.h>
#define RETURN_IF_OOM( __P ) if( !(__P) ) return;
#else
#define RETURN_IF_OOM( __P ) (__P)
#endif
namespace khtml {
// Marker strings: the comment opener, and the leading part (without the
// closing '>') of the end tags for several elements. All are lower-case.
// They are declared as arrays, so sizeof() gives the string length plus one.
static const char commentStart[] = "<!--";
static const char scriptEnd[]    = "</script";
static const char styleEnd[]     = "</style";
static const char textareaEnd[]  = "</textarea";
static const char titleEnd[]     = "</title";
static const char xmpEnd[]       = "</xmp";
// Raw storage helpers for QChar buffers, built directly on malloc/realloc/free.
// N is an element count (it is multiplied by sizeof(QChar)), not a byte count.
// None of these macros checks for allocation failure.

// Allocates room for N QChars and yields the pointer.
#define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
// Resizes the buffer held in the lvalue P to N QChars. P is reassigned to the
// realloc() result and the expression also yields that pointer, so both
//     KHTML_REALLOC_QCHAR_VEC(buf, n);
//     buf = KHTML_REALLOC_QCHAR_VEC(buf, n);
// are valid. (The previous definition passed an undeclared lower-case `p` to
// realloc and assigned through a cast, which is not valid C++.)
// NOTE: P is evaluated twice; if realloc fails P becomes null and the old
// block is no longer reachable through P.
#define KHTML_REALLOC_QCHAR_VEC(P, N ) ( (P) = (QChar*) realloc( (P), sizeof(QChar)*( N ) ) )
// Releases a buffer obtained from the two macros above.
#define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))
// Full support for MS Windows extensions to Latin-1.
// Technically these extensions should only be activated for pages
// marked "windows-1252" or "cp1252", but
// in the standard Microsoft way, these extensions infect hundreds of thousands
// of web pages. Note that people with non-latin-1 Microsoft extensions
// are SOL.
//
// See: http://www.microsoft.com/globaldev/reference/WinCP.asp
// http://www.bbsinc.com/iso8859.html
// http://www.obviously.com/
//
// There may be better equivalents
#if APPLE_CHANGES
// Note that we have more Unicode characters than Qt, so we use the
// official mapping table from the Unicode 2.0 standard here instead of
// one with hacks to avoid certain Unicode characters. Also, we don't
// need the unrelated hacks to avoid Unicode characters that are in the
// original version.
// We need this for entities at least. For non-entity text, we could
// handle this in the text codec.
// To cover non-entity text, I think this function would need to be called
// in more places. There seem to be many places that don't call fixUpChar.
// Rewrites c in place when it holds one of the code points 0x0080..0x009F that
// the Windows-1252 extensions to Latin-1 assign a character to (for example
// 0x0080 becomes U+20AC and 0x0099 becomes U+2122).
// 0x0081, 0x008D, 0x008F, 0x0090 and 0x009D have no mapping and are left as
// they are, as is every value outside 0x0080..0x009F.
inline void fixUpChar(QChar& c) {
    // Indexed by (code point - 0x80). A 0 entry means "leave unchanged".
    static const int replacements[32] = {
        /* 0x80 */ 0x20AC, /* 0x81 */ 0,      /* 0x82 */ 0x201A, /* 0x83 */ 0x0192,
        /* 0x84 */ 0x201E, /* 0x85 */ 0x2026, /* 0x86 */ 0x2020, /* 0x87 */ 0x2021,
        /* 0x88 */ 0x02C6, /* 0x89 */ 0x2030, /* 0x8A */ 0x0160, /* 0x8B */ 0x2039,
        /* 0x8C */ 0x0152, /* 0x8D */ 0,      /* 0x8E */ 0x017D, /* 0x8F */ 0,
        /* 0x90 */ 0,      /* 0x91 */ 0x2018, /* 0x92 */ 0x2019, /* 0x93 */ 0x201C,
        /* 0x94 */ 0x201D, /* 0x95 */ 0x2022, /* 0x96 */ 0x2013, /* 0x97 */ 0x2014,
        /* 0x98 */ 0x02DC, /* 0x99 */ 0x2122, /* 0x9A */ 0x0161, /* 0x9B */ 0x203A,
        /* 0x9C */ 0x0153, /* 0x9D */ 0,      /* 0x9E */ 0x017E, /* 0x9F */ 0x0178
    };

    const unsigned code = c.unicode();
    if (code < 0x0080 || code > 0x009F)
        return;

    const int mapped = replacements[code - 0x0080];
    if (mapped != 0)
        c = mapped;
}
#else // APPLE_CHANGES
// Fallback (non-APPLE_CHANGES) fixUpChar: a statement macro that rewrites x in place.
//  - When x.row() is 0, x.cell() values in the Windows-1252 extension range
//    (0x80..0x9F) are replaced either by a Unicode code point (e.g. 0x80 -> 0x20ac)
//    or, for quotes, dashes, bullets and similar punctuation, by a plain ASCII
//    stand-in. 0xb7 is additionally replaced by '*'.
//  - Otherwise a few typographic code points (dashes, curly quotes, bullet) are
//    flattened to ASCII. The 0x2122 case assigns the value x already has.
// NOTE(review): the expansion is a bare if/else that is not wrapped in
// do { } while (0), and x is evaluated several times. It must not be used as
// the unbraced body of an if/else at the call site, nor with an argument that
// has side effects.
#define fixUpChar(x) \
if (!(x).row() ) { \
switch ((x).cell()) \
{ \
/* ALL of these should be changed to Unicode SOON */ \
case 0x80: (x) = 0x20ac; break; \
case 0x82: (x) = ','; break; \
case 0x83: (x) = 0x0192; break; \
case 0x84: (x) = '"'; break; \
case 0x85: (x) = 0x2026; break; \
case 0x86: (x) = 0x2020; break; \
case 0x87: (x) = 0x2021; break; \
case 0x88: (x) = 0x02C6; break; \
case 0x89: (x) = 0x2030; break; \
case 0x8A: (x) = 0x0160; break; \
case 0x8b: (x) = '<'; break; \
case 0x8C: (x) = 0x0152; break; \
\
case 0x8E: (x) = 0x017D; break; \
\
\
case 0x91: (x) = '\''; break; \
case 0x92: (x) = '\''; break; \
case 0x93: (x) = '"'; break; \
case 0x94: (x) = '"'; break; \
case 0x95: (x) = '*'; break; \
case 0x96: (x) = '-'; break; \
case 0x97: (x) = '-'; break; \
case 0x98: (x) = '~'; break; \
case 0x99: (x) = 0x2122; break; \
case 0x9A: (x) = 0x0161; break; \
case 0x9b: (x) = '>'; break; \
case 0x9C: (x) = 0x0153; break; \
\
case 0x9E: (x) = 0x017E; break; \
case 0x9F: (x) = 0x0178; break; \
/* This one should die */ \
case 0xb7: (x) = '*'; break; \
default: break; \
} \
} \
else { \
/* These should all die sooner rather than later */ \
switch( (x).unicode() ) { \
case 0x2013: (x) = '-'; break; \
case 0x2014: (x) = '-'; break; \
case 0x2018: (x) = '\''; break; \
case 0x2019: (x) = '\''; break; \
case 0x201c: (x) = '"'; break; \
case 0x201d: (x) = '"'; break; \
case 0x2022: (x) = '*'; break; \
case 0x2122: (x) = 0x2122; break; \
default: break; \
} \
}
#endif // APPLE_CHANGES
inline bool tagMatch(const char *s1, const QChar *s2, uint length)
{
for (uint i = 0; i != length; ++i) {
char c1 = s1[i];
char uc1 = toupper(c1);
QChar c2 = s2[i];
if (c1 != c2 && uc1 != c2)
return false;
}
return true;
}
// ----------------------------------------------------------------------------
// Creates a tokenizer attached to a view.
//   _doc             - document handle, forwarded to the KHTMLParser created here.
//   _view            - stored in `view` and forwarded to the parser.
//   includesComments - forwarded to the parser and kept in includesCommentsInDOM.
// The constructor finishes by calling begin(), which sets up the remaining
// parsing state and allocates the output buffer.
HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, KHTMLView *_view, bool includesComments)
    : inWrite(false)
{
    // Collaborators.
    view = _view;
    parser = new KHTMLParser(_view, _doc, includesComments);
    charsets = KGlobal::charsets();
    includesCommentsInDOM = includesComments;

    // begin() -> reset() inspects these, so they need defined values first.
    buffer = 0;
    scriptCode = 0;
    scriptCodeResync = 0;
    scriptCodeMaxSize = 0;
    scriptCodeSize = 0;
    timerId = 0;

    // Script / loading state.
    m_executingScript = 0;
    loadingExtScript = false;
    loadStopped = false;
    onHold = false;
    attrNamePresent = false;

    begin();
}
// Creates a tokenizer that parses into a document fragment; there is no view
// in this mode (`view` is set to 0).
//   _doc             - document handle, forwarded to the KHTMLParser created here.
//   i                - fragment handed to the parser.
//   includesComments - forwarded to the parser and kept in includesCommentsInDOM.
// The constructor finishes by calling begin(), which sets up the remaining
// parsing state and allocates the output buffer.
HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, DOM::DocumentFragmentImpl *i, bool includesComments)
    : inWrite(false)
{
    view = 0;
    buffer = 0;
    scriptCode = 0;
    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
    charsets = KGlobal::charsets();
    parser = new KHTMLParser(i, _doc, includesComments);
    m_executingScript = 0;
    loadingExtScript = false;
    onHold = false;
    // The view-based constructor initializes this flag, and neither begin()
    // nor reset() does; without this line it would start out indeterminate here.
    attrNamePresent = false;
    timerId = 0;
    includesCommentsInDOM = includesComments;
    loadStopped = false;
    begin();
}
// Releases everything the tokenizer holds: queued scripts, the output buffer,
// the script-code buffer and any running timer. Also clears allowYield,
// forceSynchronous and the current token.
// Precondition (asserted): no script is executing and the tokenizer is not on hold.
void HTMLTokenizer::reset()
{
    assert(m_executingScript == 0);
    assert(onHold == false);

    // deref() every script still waiting in the queue.
    while (!cachedScript.isEmpty())
        cachedScript.dequeue()->deref(this);

    // Output buffer.
    if (buffer)
        KHTML_DELETE_QCHAR_VEC(buffer);
    dest = 0;
    buffer = 0;
    size = 0;

    // Script-code buffer and its bookkeeping.
    if (scriptCode)
        KHTML_DELETE_QCHAR_VEC(scriptCode);
    scriptCode = 0;
    scriptCodeResync = 0;
    scriptCodeMaxSize = 0;
    scriptCodeSize = 0;

    // Stop a pending timer, if any; timerId ends up 0 either way.
    if (timerId)
        killTimer(timerId);
    timerId = 0;

    allowYield = false;
    forceSynchronous = false;
    currToken.reset();
}
// Puts the tokenizer into its initial state: releases any previous state via
// reset(), allocates a fresh output buffer and clears every parsing flag,
// counter and pending-source record.
void HTMLTokenizer::begin()
{
    // reset() asserts that no script is executing and that we are not on
    // hold, so that state has to be cleared before calling it.
    m_executingScript = 0;
    onHold = false;
    loadingExtScript = false;
    reset();

    // Fresh output buffer: 255 QChars are allocated, `size` is set to 254.
    buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
    dest = buffer;
    size = 254;

    // Parsing-state enums and counters back to their initial values.
    tag = NoTag;
    pending = NonePending;
    discard = NoneDiscard;
    tquote = NoQuote;
    Entity = NoEntity;
    searchCount = 0;
    prePos = 0;

    // Content-mode flags.
    pre = false;
    plaintext = false;
    xmp = false;
    processingInstruction = false;
    script = false;
    style = false;
    select = false;
    comment = false;
    server = false;
    textarea = false;
    title = false;

    // Scanner flags.
    escaped = false;
    skipLF = false;
    startTag = false;
    noMoreData = false;
    brokenComments = false;
    brokenServer = false;

    // Script source bookkeeping.
    scriptSrc = QString::null;
    pendingSrc.clear();
    currentPrependingSrc = 0;

    // Line counters.
    lineno = scriptStartLineno = tagStartLineno = 0;

    forceSynchronous = false;
}
void HTMLTokenizer::setForceSynchronous(bool force)
{
forceSynchronous = force;
}
void HTMLTokenizer::processListing(TokenizerString list)
{
bool old_pre = pre;
// This function adds the listing 'list' as
// preformatted text-tokens to the token-collection
// thereby converting TABs.
if(!style) pre = true;
prePos = 0;
while ( !list.isEmpty() )
{
RETURN_IF_OOM( checkBuffer(3*TAB_SIZE) );
if (skipLF && ( *list != '\n' ))
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -