📄 htmltokenizer.cpp
字号:
/* Copyright (C) 1997 Martin Jones (mjones@kde.org) (C) 1997 Torben Weis (weis@kde.org) (C) 1998 Waldo Bastian (bastian@kde.org) (C) 1999 Lars Knoll (knoll@kde.org) (C) 1999 Antti Koivisto (koivisto@kde.org) (C) 2001 Dirk Mueller (mueller@kde.org) Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. Copyright (C) 2005, 2006 Alexey Proskuryakov (ap@nypop.com) This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.*/#include "config.h"#include "HTMLTokenizer.h"#include "CSSHelper.h"#include "Cache.h"#include "CachedScript.h"#include "DocLoader.h"#include "DocumentFragment.h"#include "EventNames.h"#include "Frame.h"#include "FrameLoader.h"#include "FrameView.h"#include "HTMLElement.h"#include "HTMLNames.h"#include "HTMLParser.h"#include "HTMLScriptElement.h"#include "HTMLViewSourceDocument.h"#include "Page.h"#include "PreloadScanner.h"#include "ScriptController.h"#include "ScriptSourceCode.h"#include "ScriptValue.h"#include <wtf/ASCIICType.h>#include <wtf/CurrentTime.h>#include "HTMLEntityNames.c"#define PRELOAD_SCANNER_ENABLED 1// #define INSTRUMENT_LAYOUT_SCHEDULING 1using namespace WTF;using namespace std;namespace WebCore {using namespace HTMLNames;#if MOBILE// The mobile device needs to be responsive, as such the tokenizer chunk size is reduced.// This value is used to define how many characters the tokenizer will process before // yeilding control.static const int defaultTokenizerChunkSize = 256;#elsestatic const int defaultTokenizerChunkSize = 4096;#endif#if MOBILE// As the chunks are smaller (above), the tokenizer should not yield for as long a period, otherwise// it will take way to long to load a page.static const double defaultTokenizerTimeDelay = 0.300;#else// FIXME: We would like this constant to be 200ms.// Yielding more aggressively results in increased responsiveness and better incremental rendering.// It slows down overall page-load on slower machines, though, so for now we set a value of 500.static const double defaultTokenizerTimeDelay = 0.500;#endifstatic const char commentStart [] = "<!--";static const char doctypeStart [] = "<!doctype";static const char publicStart [] = "public";static const char systemStart [] = "system";static const char scriptEnd [] = "</script";static const char xmpEnd [] = "</xmp";static const char styleEnd [] = "</style";static const char textareaEnd [] = "</textarea";static const char titleEnd [] = "</title";static const char iframeEnd [] = "</iframe";// Full support for MS Windows extensions to Latin-1.// Technically these extensions should only be activated for pages// marked "windows-1252" or "cp1252", but// in the standard Microsoft way, these extensions infect hundreds of thousands// of web pages. Note that people with non-latin-1 Microsoft extensions// are SOL.//// See: http://www.microsoft.com/globaldev/reference/WinCP.asp// http://www.bbsinc.com/iso8859.html// http://www.obviously.com///// There may be better equivalents// We only need this for entities. For non-entity text, we handle this in the text encoding.static const UChar windowsLatin1ExtensionArray[32] = { 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F};static inline UChar fixUpChar(UChar c){ if ((c & ~0x1F) != 0x0080) return c; return windowsLatin1ExtensionArray[c - 0x80];}static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length){ for (unsigned i = 0; i != length; ++i) { unsigned char c1 = s1[i]; unsigned char uc1 = toASCIIUpper(static_cast<char>(c1)); UChar c2 = s2[i]; if (c1 != c2 && uc1 != c2) return false; } return true;}inline void Token::addAttribute(AtomicString& attrName, const AtomicString& attributeValue, bool viewSourceMode){ if (!attrName.isEmpty()) { ASSERT(!attrName.contains('/')); RefPtr<MappedAttribute> a = MappedAttribute::create(attrName, attributeValue); if (!attrs) { attrs = NamedMappedAttrMap::create(); attrs->reserveInitialCapacity(10); } attrs->insertAttribute(a.release(), viewSourceMode); } attrName = emptyAtom;}// ----------------------------------------------------------------------------HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors) : Tokenizer() , m_buffer(0) , m_scriptCode(0) , m_scriptCodeSize(0) , m_scriptCodeCapacity(0) , m_scriptCodeResync(0) , m_executingScript(0) , m_requestingScript(false) , m_hasScriptsWaitingForStylesheets(false) , m_timer(this, &HTMLTokenizer::timerFired) , m_doc(doc) , m_parser(new HTMLParser(doc, reportErrors)) , m_inWrite(false) , m_fragment(false){ begin();}HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc) : Tokenizer(true) , m_buffer(0) , m_scriptCode(0) , m_scriptCodeSize(0) , m_scriptCodeCapacity(0) , m_scriptCodeResync(0) , m_executingScript(0) , m_requestingScript(false) , m_hasScriptsWaitingForStylesheets(false) , m_timer(this, &HTMLTokenizer::timerFired) , m_doc(doc) , m_parser(0) , m_inWrite(false) , m_fragment(false){ begin();}HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag) : m_buffer(0) , m_scriptCode(0) , m_scriptCodeSize(0) , m_scriptCodeCapacity(0) , m_scriptCodeResync(0) , m_executingScript(0) , m_requestingScript(false) , m_hasScriptsWaitingForStylesheets(false) , m_timer(this, &HTMLTokenizer::timerFired) , m_doc(frag->document()) , m_parser(new HTMLParser(frag)) , m_inWrite(false) , m_fragment(true){ begin();}void HTMLTokenizer::reset(){ ASSERT(m_executingScript == 0); while (!m_pendingScripts.isEmpty()) { CachedScript* cs = m_pendingScripts.first().get(); m_pendingScripts.removeFirst(); ASSERT(cache()->disabled() || cs->accessCount() > 0); cs->removeClient(this); } fastFree(m_buffer); m_buffer = m_dest = 0; m_bufferSize = 0; fastFree(m_scriptCode); m_scriptCode = 0; m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0; m_timer.stop(); m_state.setAllowYield(false); m_state.setForceSynchronous(false); m_currentToken.reset(); m_doctypeToken.reset(); m_doctypeSearchCount = 0; m_doctypeSecondarySearchCount = 0; m_hasScriptsWaitingForStylesheets = false;}void HTMLTokenizer::begin(){ m_executingScript = 0; m_requestingScript = false; m_hasScriptsWaitingForStylesheets = false; m_state.setLoadingExtScript(false); reset(); m_bufferSize = 254; m_buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254)); m_dest = m_buffer; tquote = NoQuote; searchCount = 0; m_state.setEntityState(NoEntity); m_scriptTagSrcAttrValue = String(); m_pendingSrc.clear(); m_currentPrependingSrc = 0; m_noMoreData = false; m_brokenComments = false; m_brokenServer = false; m_lineNumber = 0; m_currentScriptTagStartLineNumber = 0; m_currentTagStartLineNumber = 0; m_state.setForceSynchronous(false); Page* page = m_doc->page(); if (page && page->hasCustomHTMLTokenizerTimeDelay()) m_tokenizerTimeDelay = page->customHTMLTokenizerTimeDelay(); else m_tokenizerTimeDelay = defaultTokenizerTimeDelay; if (page && page->hasCustomHTMLTokenizerChunkSize()) m_tokenizerChunkSize = page->customHTMLTokenizerChunkSize(); else m_tokenizerChunkSize = defaultTokenizerChunkSize;}void HTMLTokenizer::setForceSynchronous(bool force){ m_state.setForceSynchronous(force);}HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state){ // This function adds the listing 'list' as // preformatted text-tokens to the token-collection while (!list.isEmpty()) { if (state.skipLF()) { state.setSkipLF(false); if (*list == '\n') { list.advance(); continue; } } checkBuffer(); if (*list == '\n' || *list == '\r') { if (state.discardLF()) // Ignore this LF state.setDiscardLF(false); // We have discarded 1 LF else *m_dest++ = '\n'; /* Check for MS-DOS CRLF sequence */ if (*list == '\r') state.setSkipLF(true); list.advance(); } else { state.setDiscardLF(false); *m_dest++ = *list; list.advance(); } } return state;}HTMLTokenizer::State HTMLTokenizer::parseSpecial(SegmentedString& src, State state){ ASSERT(state.inTextArea() || state.inTitle() || state.inIFrame() || !state.hasEntityState()); ASSERT(!state.hasTagState()); ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() + state.inIFrame() == 1 ); if (state.inScript() && !m_currentScriptTagStartLineNumber) m_currentScriptTagStartLineNumber = m_lineNumber; if (state.inComment()) state = parseComment(src, state); int lastDecodedEntityPosition = -1;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -