📄 htmltokenizer.cpp
字号:
len--; } else // Start Tag beginTag = true; // Accept empty xml tags like <br/>. We trim off the "/" so that when we call // getTagID, we'll look up "br" as the tag name and not "br/". if(len > 1 && ptr[len-1] == '/' ) ptr[--len] = '\0'; // Look up the tagID for the specified tag name (now that we've shaved off any // invalid / that might have followed the name). uint tagID = getTagID(ptr, len); if (!tagID) {#ifdef TOKEN_DEBUG QCString tmp(ptr, len+1); kdDebug( 6036 ) << "Unknown tag: \"" << tmp.data() << "\"" << endl;#endif dest = buffer; } else {#ifdef TOKEN_DEBUG QCString tmp(ptr, len+1); kdDebug( 6036 ) << "found tag id=" << tagID << ": " << tmp.data() << endl;#endif currToken.id = beginTag ? tagID : tagID + ID_CLOSE_TAG; dest = buffer; } tag = SearchAttribute; cBufferPos = 0; } break; } case SearchAttribute: {#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 qDebug("SearchAttribute");#endif bool atespace = false; ushort curchar; while(!src.isEmpty()) { curchar = *src; if(curchar > ' ') { if (curchar == '<' || curchar == '>') tag = SearchEnd; else if(atespace && (curchar == '\'' || curchar == '"')) { tag = SearchValue; *dest++ = 0; attrName = QString::null; attrNamePresent = false; } else tag = AttributeName; cBufferPos = 0; break; } atespace = true; ++src; } break; } case AttributeName: {#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 qDebug("AttributeName");#endif ushort curchar; int ll = kMin(src.length(), CBUFLEN-cBufferPos); while(ll--) { curchar = *src; if(curchar <= '>') { if(curchar <= ' ' || curchar == '=' || curchar == '>') { unsigned int a; cBuffer[cBufferPos] = '\0'; a = getAttrID(cBuffer, cBufferPos); if (a) attrNamePresent = true; else { attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data()); attrNamePresent = !attrName.isEmpty(); // This is a deliberate quirk to match Mozilla and Opera. We have to do this // since sites that use the "standards-compliant" path sometimes send // <script src="foo.js"/>. Both Moz and Opera will honor this, despite it // being bogus HTML. They do not honor the "/" for other tags. This behavior // also deviates from WinIE, but in this case we'll just copy Moz and Opera. if (currToken.id == ID_SCRIPT && curchar == '>' && attrName == "/") currToken.flat = true; } dest = buffer; *dest++ = a;#ifdef TOKEN_DEBUG if (!a || (cBufferPos && *cBuffer == '!')) kdDebug( 6036 ) << "Unknown attribute: *" << QCString(cBuffer, cBufferPos+1).data() << "*" << endl; else kdDebug( 6036 ) << "Known attribute: " << QCString(cBuffer, cBufferPos+1).data() << endl;#endif tag = SearchEqual; break; } } // Use tolower() instead of | 0x20 to lowercase the char because there is no // performance gain in using | 0x20 since tolower() is optimized and // | 0x20 turns characters such as '_' into junk. cBuffer[cBufferPos++] = tolower(curchar); ++src; } if ( cBufferPos == CBUFLEN ) { cBuffer[cBufferPos] = '\0'; attrName = QString::fromLatin1(QCString(cBuffer, cBufferPos+1).data()); attrNamePresent = !attrName.isEmpty(); dest = buffer; *dest++ = 0; tag = SearchEqual; } break; } case SearchEqual: {#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 qDebug("SearchEqual");#endif ushort curchar; bool atespace = false; while(!src.isEmpty()) { curchar = src->unicode(); if(curchar > ' ') { if(curchar == '=') {#ifdef TOKEN_DEBUG kdDebug(6036) << "found equal" << endl;#endif tag = SearchValue; ++src; } else if(atespace && (curchar == '\'' || curchar == '"')) { tag = SearchValue; *dest++ = 0; attrName = QString::null; attrNamePresent = false; } else { currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, emptyAtom); dest = buffer; tag = SearchAttribute; } break; } atespace = true; ++src; } break; } case SearchValue: { ushort curchar; while(!src.isEmpty()) { curchar = src->unicode(); if(curchar > ' ') { if(( curchar == '\'' || curchar == '\"' )) { tquote = curchar == '\"' ? DoubleQuote : SingleQuote; tag = QuotedValue; ++src; } else tag = Value; break; } ++src; } break; } case QuotedValue: {#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 qDebug("QuotedValue");#endif ushort curchar; while(!src.isEmpty()) { checkBuffer(); curchar = src->unicode(); if (curchar == '>' && !attrNamePresent) { // Handle a case like <img '>. Just go ahead and be willing // to close the whole tag. Don't consume the character and // just go back into SearchEnd while ignoring the whole // value. // FIXME: Note that this is actually not a very good solution. It's // an interim hack and doesn't handle the general case of // unmatched quotes among attributes that have names. -dwh while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r')) dest--; // remove trailing newlines AtomicString v(buffer+1, dest-buffer-1); attrName.setUnicode(buffer+1,dest-buffer-1); currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v); tag = SearchAttribute; dest = buffer; tquote = NoQuote; break; } if(curchar <= '\'' && !src.escaped()) { // ### attributes like '&{blaa....};' are supposed to be treated as jscript. if ( curchar == '&' ) { ++src; parseEntity(src, dest, true); break; } else if ( (tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"') ) { // some <input type=hidden> rely on trailing spaces. argh while(dest > buffer+1 && (*(dest-1) == '\n' || *(dest-1) == '\r')) dest--; // remove trailing newlines AtomicString v(buffer+1, dest-buffer-1); if (!attrNamePresent) attrName.setUnicode(buffer+1,dest-buffer-1); currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v); dest = buffer; tag = SearchAttribute; tquote = NoQuote; ++src; break; } } *dest = *src; fixUpChar(*dest); ++dest; ++src; } break; } case Value: {#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 qDebug("Value");#endif ushort curchar; while(!src.isEmpty()) { checkBuffer(); curchar = src->unicode(); if(curchar <= '>' && !src.escaped()) { // parse Entities if ( curchar == '&' ) { ++src; parseEntity(src, dest, true); break; } // no quotes. Every space means end of value // '/' does not delimit in IE! if ( curchar <= ' ' || curchar == '>' ) { AtomicString v(buffer+1, dest-buffer-1); currToken.addAttribute(parser->docPtr()->document(), buffer, attrName, v); dest = buffer; tag = SearchAttribute; break; } } *dest = *src; fixUpChar(*dest); ++dest; ++src; } break; } case SearchEnd: {#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1 qDebug("SearchEnd");#endif while(!src.isEmpty()) { if (*src == '>' || *src == '<') break; if (*src == '/') currToken.flat = true; ++src; } if (src.isEmpty()) break; searchCount = 0; // Stop looking for '<!--' sequence tag = NoTag; tquote = NoQuote; if (*src != '<') ++src; if ( !currToken.id ) //stop if tag is unknown return; uint tagID = currToken.id;#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 0 kdDebug( 6036 ) << "appending Tag: " << tagID << endl;#endif bool beginTag = !currToken.flat && (tagID < ID_CLOSE_TAG); if (tagID >= ID_CLOSE_TAG) tagID -= ID_CLOSE_TAG; else if (tagID == ID_SCRIPT) { AttributeImpl* a = 0; scriptSrc = QString::null; scriptSrcCharset = QString::null; if ( currToken.attrs && /* potentially have a ATTR_SRC ? */ parser->doc()->part() && parser->doc()->part()->jScriptEnabled() && /* jscript allowed at all? */ view /* are we a regular tokenizer or just for innerHTML ? */ ) { if ( ( a = currToken.attrs->getAttributeItem( ATTR_SRC ) ) ) scriptSrc = parser->doc()->completeURL(parseURL( a->value() ).string() ); if ( ( a = currToken.attrs->getAttributeItem( ATTR_CHARSET ) ) ) scriptSrcCharset = a->value().string().stripWhiteSpace(); if ( scriptSrcCharset.isEmpty() ) scriptSrcCharset = parser->doc()->part()->encoding(); if (!(a = currToken.attrs->getAttributeItem( ATTR_LANGUAGE ))) a = currToken.attrs->getAttributeItem(ATTR_TYPE); } javascript = true; if( a ) { QString lang = a->value().string(); lang = lang.lower(); if( !lang.contains("javascript") && !lang.contains("ecmascript") && !lang.contains("livescript") && !lang.contains("jscript") ) javascript = false; } } processToken(); // we have to take care to close the pre block in // case we encounter an unallowed element.... if(pre && beginTag && !DOM::checkChild(ID_PRE, tagID)) { kdDebug(6036) << " not allowed in <pre> " << (int)tagID << endl; pre = false; } switch( tagID ) { case ID_PRE: prePos = 0; pre = beginTag; break; case ID_SCRIPT: if (beginTag) { searchStopper = scriptEnd; searchStopperLen = 8; script = true; parseSpecial(src); } else if (tagID < ID_CLOSE_TAG) // Handle <script src="foo"/> scriptHandler(); break; case ID_STYLE: if (beginTag) { searchStopper = styleEnd; searchStopperLen = 7; style = true; parseSpecial(src); } break; case ID_TEXTAREA: if(beginTag) { searchStopper = textareaEnd; searchStopperLen = 10; textarea = true; parseSpecial(src); } break; case ID_TITLE: if (beginTag) { searchStopper = titleEnd; searchStopperLen = 7; title = true; parseSpecial(src); } break; case ID_XMP: if (beginTag) { searchStopper = xmpEnd; searchStopperLen = 5; xmp = true; parseSpecial(src); } break; case ID_SELECT: select = beginTag; break; case ID_PLAINTEXT: plaintext = beginTag; break; } if (beginTag && endTag[tagID] == FORBIDDEN) // Don't discard LFs since this element has no end tag. discard = NoneDiscard; return; // Finished parsing tag! } } // end switch } return;}void HTMLTokenizer::addPending(){ if ( select && !script ) { *dest++ = ' '; } else if ( textarea || script ) { switch(pending) { case LFPending: *dest++ = '\n'; prePos = 0; break; case SpacePending: *dest++ = ' '; ++prePos; break; case TabPending: *dest++ = '\t'; prePos += TAB_SIZE - (prePos % TAB_SIZE); break; case NonePending: assert(0); } } else { int p; switch (pending) { case SpacePending: // Insert a breaking space *dest++ = QChar(' '); prePos++; break; case LFPending: *dest = '\n'; dest++; prePos = 0; break; case TabPending: p = TAB_SIZE - ( prePos % TAB_SIZE );#ifdef TOKEN_DEBUG qDebug("tab pending, prePos: %d, toadd: %d", prePos, p);#endif for ( int x = 0; x < p; x++ ) *dest++ = QChar(' '); prePos += p; break; case NonePending:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -