htmltokenizer.cpp

来自「手机浏览器源码程序,功能强大」· C++ 代码 · 共 1,900 行 · 第 1/5 页
CPP
1,900 行
/*
    This file is part of the KDE libraries

    Copyright (C) 1997 Martin Jones (mjones@kde.org)
              (C) 1997 Torben Weis (weis@kde.org)
              (C) 1998 Waldo Bastian (bastian@kde.org)
              (C) 1999 Lars Knoll (knoll@kde.org)
              (C) 1999 Antti Koivisto (koivisto@kde.org)
              (C) 2001 Dirk Mueller (mueller@kde.org)
    Copyright (C) 2004 Apple Computer, Inc.

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
    Boston, MA 02111-1307, USA.
*/
//----------------------------------------------------------------------------
//
// KDE HTML Widget - Tokenizers

//#define TOKEN_DEBUG 1
//#define TOKEN_DEBUG 2

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

//#include <string.h>
#include "html/htmltokenizer.h"
#include "html/html_documentimpl.h"
#include "html/htmlparser.h"
#include "html/dtd.h"

#include "misc/loader.h"
#include "misc/htmlhashes.h"

#include "khtmlview.h"
#include "khtml_part.h"
#include "xml/dom_docimpl.h"
#include "dom/dom_doc.h"
#include "css/csshelper.h"
#include "ecma/kjs_proxy.h"
#include <kcharsets.h>
#include <kglobal.h>
#include <ctype.h>
#include <assert.h>
#include <qvariant.h>
#include <kdebug.h>
#include <stdlib.h>

using DOM::AtomicString;
using DOM::AttributeImpl;
using DOM::DOMString;
using DOM::DOMStringImpl;
using DOM::DocumentImpl;
using DOM::FORBIDDEN;
using DOM::Node;
using DOM::emptyAtom;
using DOM::endTagRequirement;

// turn off inlining to avoid warnings with newer gcc
#undef __inline
#define __inline
#include "kentities.c"
#undef __inline

// #define INSTRUMENT_LAYOUT_SCHEDULING 1

// Tokenizer tuning constants. The Nokia build selects smaller values for both
// of them than the default build does.
// NOTE(review): the code that consumes these constants is outside this view.
// The FIXME below implies TOKENIZER_TIME_DELAY is expressed in milliseconds;
// presumably TOKENIZER_CHUNK_SIZE is a per-slice amount of input -- confirm.
#if NOKIA_CHANGES
#define TOKENIZER_CHUNK_SIZE  256
#define TOKENIZER_TIME_DELAY  300
#else
#define TOKENIZER_CHUNK_SIZE  4096

// FIXME: We would like this constant to be 200ms.  Yielding more aggressively results in increased
// responsiveness and better incremental rendering.  It slows down overall page-load on slower machines,
// though, so for now we set a value of 500.
#define TOKENIZER_TIME_DELAY  500
#endif

// RETURN_IF_OOM(expr): evaluates expr exactly once.
//  - Nokia builds with __OOM__ enabled: leaves the enclosing function when expr
//    is false/null. It expands to a bare `return;`, so it can only be used in
//    functions that return nothing.
//  - Every other build: just evaluates expr.
#if NOKIA_CHANGES && __OOM__
#include <allocs.h>
// The argument is parenthesized so that compound expressions (`a == b`,
// `p || q`, ...) are negated as a whole instead of only their first operand.
#define RETURN_IF_OOM( __P )       if( !(__P) )  return;
#else
#define RETURN_IF_OOM( __P )        (__P)
#endif


namespace khtml {

// Literal markup prefixes, all lowercase. The end-tag prefixes deliberately
// stop before the closing '>'.
// NOTE(review): the code that uses these is outside this view; presumably they
// are the first argument of tagMatch(), which accepts either case -- confirm.
static const char commentStart [] = "<!--";
static const char scriptEnd [] = "</script";
static const char xmpEnd [] = "</xmp";
static const char styleEnd [] =  "</style";
static const char textareaEnd [] = "</textarea";
static const char titleEnd [] = "</title";

// Raw QChar buffer helpers built directly on malloc/realloc/free; a buffer
// obtained from one of them must be resized/released with the others.
//   KHTML_ALLOC_QCHAR_VEC(N)      -> uninitialized storage for N QChars (null on failure)
//   KHTML_REALLOC_QCHAR_VEC(P, N) -> resizes the buffer held in lvalue P to N QChars,
//                                    stores the new pointer back into P and also
//                                    yields it as the value of the expression
//   KHTML_DELETE_QCHAR_VEC(P)     -> frees the buffer
#define KHTML_ALLOC_QCHAR_VEC( N ) (QChar*) malloc( sizeof(QChar)*( N ) )
// The previous expansion, `(QChar*) P = realloc(p, ...)`, referred to an
// unrelated identifier `p` instead of the macro parameter and used a cast
// expression as the target of the assignment, so it could not compile.
// Caveats: P is evaluated twice, and if realloc fails P becomes null while the
// old block is still allocated (same outcome as a plain `P = realloc(P, ...)`).
#define KHTML_REALLOC_QCHAR_VEC(P, N ) ( (P) = (QChar*) realloc( (P), sizeof(QChar)*( N ) ) )
#define KHTML_DELETE_QCHAR_VEC( P ) free((char*)( P ))

// Full support for MS Windows extensions to Latin-1.
// Technically these extensions should only be activated for pages
// marked "windows-1252" or "cp1252", but
// in the standard Microsoft way, these extensions infect hundreds of thousands
// of web pages.  Note that people with non-latin-1 Microsoft extensions
// are SOL.
//
// See: http://www.microsoft.com/globaldev/reference/WinCP.asp
//      http://www.bbsinc.com/iso8859.html
//      http://www.obviously.com/
//
// There may be better equivalents

#if APPLE_CHANGES

// Note that we have more Unicode characters than Qt, so we use the
// official mapping table from the Unicode 2.0 standard here instead of
// one with hacks to avoid certain Unicode characters. Also, we don't
// need the unrelated hacks to avoid Unicode characters that are in the
// original version.

// We need this for entities at least. For non-entity text, we could
// handle this in the text codec.

// To cover non-entity text, I think this function would need to be called
// in more places. There seem to be many places that don't call fixUpChar.

// Rewrites, in place, a character whose code lies in 0x0080..0x009F to the
// Unicode character listed for it in the table below. Codes outside that
// range, and the five codes in it that have no replacement (0x81, 0x8D, 0x8F,
// 0x90, 0x9D), are left untouched.
inline void fixUpChar(QChar& c) {
    // Indexed by (code - 0x0080); a zero entry means "keep the character as is".
    static const unsigned short replacements[0x20] = {
        0x20AC, 0,      0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,  // 0x80..0x87
        0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0,      0x017D, 0,       // 0x88..0x8F
        0,      0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,  // 0x90..0x97
        0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0,      0x017E, 0x0178   // 0x98..0x9F
    };

    const unsigned code = c.unicode();
    if (code < 0x0080 || code > 0x009F)
        return;

    // Assign through an int, exactly like the integer literals used before.
    const int mapped = replacements[code - 0x0080];
    if (mapped != 0)
        c = mapped;
}

#else // APPLE_CHANGES

// Statement-like macro variant of fixUpChar, compiled when APPLE_CHANGES is
// not set. For a character in row 0 it replaces selected cells in 0x80..0x9F
// (and 0xB7) with either a Unicode code point or a plain ASCII look-alike;
// for a character in any other row it folds a handful of typographic
// punctuation marks down to ASCII. Everything else is left unchanged.
// The 0x2122 case in the second switch assigns the value the character
// already has, i.e. it is a no-op.
// NOTE(review): the expansion is a bare if/else (not wrapped in
// do { } while (0)) and evaluates (x) several times, so it is only safe as a
// standalone statement with a side-effect-free argument.
#define fixUpChar(x) \
            if (!(x).row() ) { \
                switch ((x).cell()) \
                { \
                /* ALL of these should be changed to Unicode SOON */ \
                case 0x80: (x) = 0x20ac; break; \
                case 0x82: (x) = ',';    break; \
                case 0x83: (x) = 0x0192; break; \
                case 0x84: (x) = '"';    break; \
                case 0x85: (x) = 0x2026; break; \
                case 0x86: (x) = 0x2020; break; \
                case 0x87: (x) = 0x2021; break; \
                case 0x88: (x) = 0x02C6; break; \
                case 0x89: (x) = 0x2030; break; \
                case 0x8A: (x) = 0x0160; break; \
                case 0x8b: (x) = '<';    break; \
                case 0x8C: (x) = 0x0152; break; \
\
                case 0x8E: (x) = 0x017D; break; \
\
\
                case 0x91: (x) = '\'';   break; \
                case 0x92: (x) = '\'';   break; \
                case 0x93: (x) = '"';    break; \
                case 0x94: (x) = '"';    break; \
                case 0x95: (x) = '*';    break; \
                case 0x96: (x) = '-';    break; \
                case 0x97: (x) = '-';    break; \
                case 0x98: (x) = '~';    break; \
                case 0x99: (x) = 0x2122; break; \
                case 0x9A: (x) = 0x0161; break; \
                case 0x9b: (x) = '>';    break; \
                case 0x9C: (x) = 0x0153; break; \
\
                case 0x9E: (x) = 0x017E; break; \
                case 0x9F: (x) = 0x0178; break; \
                /* This one should die */ \
                case 0xb7: (x) = '*';    break; \
                default: break; \
                } \
            } \
            else { \
                /* These should all die sooner rather than later */ \
                switch( (x).unicode() ) { \
                case 0x2013: (x) = '-'; break; \
                case 0x2014: (x) = '-'; break; \
                case 0x2018: (x) = '\''; break; \
                case 0x2019: (x) = '\''; break; \
                case 0x201c: (x) = '"'; break; \
                case 0x201d: (x) = '"'; break; \
                case 0x2022: (x) = '*'; break; \
                case 0x2122: (x) = 0x2122; break; \
                default: break; \
                } \
            }

#endif // APPLE_CHANGES

inline bool tagMatch(const char *s1, const QChar *s2, uint length)
{
    for (uint i = 0; i != length; ++i) {
        char c1 = s1[i];
        char uc1 = toupper(c1);
        QChar c2 = s2[i];
        if (c1 != c2 && uc1 != c2)
            return false;
    }
    return true;
}

// ----------------------------------------------------------------------------

// Creates a tokenizer bound to a view: it stores _view, creates a KHTMLParser
// for (_view, _doc), and then puts itself into its initial state via begin().
// includesComments is forwarded to the parser and remembered in
// includesCommentsInDOM.
HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, KHTMLView *_view, bool includesComments)
    : inWrite(false)
{
    // Collaborators.
    view = _view;
    charsets = KGlobal::charsets();
    parser = new KHTMLParser(_view, _doc, includesComments);
    includesCommentsInDOM = includesComments;

    // No buffers yet. These must be null before begin() runs, because
    // begin() -> reset() frees whatever buffer/scriptCode point to.
    buffer = 0;
    scriptCode = 0;
    scriptCodeResync = 0;
    scriptCodeMaxSize = 0;
    scriptCodeSize = 0;

    // Idle state: no script running, no timer, nothing pending.
    m_executingScript = 0;
    timerId = 0;
    loadingExtScript = false;
    onHold = false;
    attrNamePresent = false;
    loadStopped = false;

    begin();
}

// Creates a tokenizer that has no view and whose KHTMLParser is built for the
// document fragment `i` within _doc, then puts itself into its initial state
// via begin(). includesComments is forwarded to the parser and remembered in
// includesCommentsInDOM.
HTMLTokenizer::HTMLTokenizer(DOM::DocumentPtr *_doc, DOM::DocumentFragmentImpl *i, bool includesComments)
    : inWrite(false)
{
    view = 0;
    // buffer and scriptCode must be null before begin() runs, because
    // begin() -> reset() frees whatever they point to.
    buffer = 0;
    scriptCode = 0;
    scriptCodeSize = scriptCodeMaxSize = scriptCodeResync = 0;
    charsets = KGlobal::charsets();
    parser = new KHTMLParser(i, _doc, includesComments);
    m_executingScript = 0;
    loadingExtScript = false;
    onHold = false;
    // Initialized here exactly as in the view-based constructor; neither begin()
    // nor reset() assigns this flag, so it was previously left indeterminate.
    attrNamePresent = false;
    timerId = 0;
    includesCommentsInDOM = includesComments;
    loadStopped = false;

    begin();
}

// Drops all transient tokenizer state: queued cached scripts, the token
// buffer, the script-code buffer, any running timer and the current token.
// Must not be called while a script is executing or while the tokenizer is on
// hold (both are asserted).
void HTMLTokenizer::reset()
{
    assert(m_executingScript == 0);
    assert(onHold == false);

    // Drain the cachedScript queue, calling deref(this) on every entry.
    while (!cachedScript.isEmpty())
        cachedScript.dequeue()->deref(this);

    // Token buffer and its write cursor.
    if ( buffer ) {
        KHTML_DELETE_QCHAR_VEC(buffer);
        buffer = 0;
    }
    dest = 0;
    size = 0;

    // Script-code buffer and its bookkeeping.
    if ( scriptCode ) {
        KHTML_DELETE_QCHAR_VEC(scriptCode);
        scriptCode = 0;
    }
    scriptCodeResync = 0;
    scriptCodeMaxSize = 0;
    scriptCodeSize = 0;

    // Stop a running timer, if there is one.
    if (timerId)
        killTimer(timerId);
    timerId = 0;

    forceSynchronous = false;
    allowYield = false;

    currToken.reset();
}

// Puts the tokenizer into its initial state: clears everything through
// reset(), allocates a fresh token buffer and resets every parsing flag,
// pending-script field and line counter.
void HTMLTokenizer::begin()
{
    // reset() asserts that no script is executing and that we are not on hold,
    // so both have to be cleared before calling it.
    m_executingScript = 0;
    loadingExtScript = false;
    onHold = false;
    reset();

    // Fresh token buffer: 255 QChars are allocated while size records 254.
    buffer = KHTML_ALLOC_QCHAR_VEC( 255 );
    dest = buffer;
    size = 254;

    // Tag / token scanning state.
    tag = NoTag;
    startTag = false;
    tquote = NoQuote;
    Entity = NoEntity;
    pending = NonePending;
    discard = NoneDiscard;
    searchCount = 0;
    skipLF = false;
    escaped = false;

    // Not inside any special element or construct.
    pre = false;
    prePos = 0;
    plaintext = false;
    xmp = false;
    script = false;
    style = false;
    select = false;
    textarea = false;
    title = false;
    comment = false;
    server = false;
    processingInstruction = false;
    brokenComments = false;
    brokenServer = false;

    // No external script and no pending source.
    loadingExtScript = false;
    scriptSrc = QString::null;
    pendingSrc.clear();
    currentPrependingSrc = 0;
    noMoreData = false;

    // Line bookkeeping.
    lineno = 0;
    scriptStartLineno = 0;
    tagStartLineno = 0;

    forceSynchronous = false;
}

void HTMLTokenizer::setForceSynchronous(bool force)
{
    forceSynchronous = force;
}

void HTMLTokenizer::processListing(TokenizerString list)
{
    bool old_pre = pre;
    // This function adds the listing 'list' as
    // preformatted text-tokens to the token-collection
    // thereby converting TABs.
    if(!style) pre = true;
    prePos = 0;

    while ( !list.isEmpty() )
    {
        RETURN_IF_OOM( checkBuffer(3*TAB_SIZE) );

        if (skipLF && ( *list != '\n' ))
        {
htmltokenizer.cpp - 源码说明

本页面展示了「手机浏览器源码程序,功能强大」中的 htmltokenizer.cpp 源码文件，采用 C++ 编程语言编写，共 1,900 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与手机相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?