📄 htmltokenizer.cpp
字号:
{ if (textarea && ch == '&') { QChar *scriptCodeDest = scriptCode+scriptCodeSize; ++src; parseEntity(src,scriptCodeDest,true); scriptCodeSize = scriptCodeDest-scriptCode; } else { scriptCode[ scriptCodeSize++ ] = src[0]; ++src; } } }}void HTMLTokenizer::parseScript(DOMStringIt &src){#if DEBUG_BY_XHTANG fprintf(stderr,"into parseScript\n"); #endif parseListing(src);#if DEBUG_BY_XHTANG fprintf(stderr,"quit parseScript\n"); #endif}void HTMLTokenizer::parseStyle(DOMStringIt &src){ parseListing(src);}void HTMLTokenizer::parseComment(DOMStringIt &src){ parseListing(src); // ### disabled temporarily - skips body attrs. if placed before <HTML>}void HTMLTokenizer::parseProcessingInstruction(DOMStringIt &src){ while ( src.length() ) { char chbegin = src[0].latin1(); // Look for '?>' if ( chbegin == '?' ) { if (searchCount < 1) // Watch out for '--->' searchCount++; } else if ((searchCount == 1) && (chbegin == '>')) { // We got a '?>' sequence processingInstruction = false; ++src; discard=LFDiscard; return; // Finished parsing comment! } else { searchCount = 0; } ++src; }}void HTMLTokenizer::parseText(DOMStringIt &src){ while ( src.length() ) { // do we need to enlarge the buffer? checkBuffer(); // ascii is okay because we only do ascii comparisons char chbegin = src[0].latin1(); if (skipLF && ( chbegin != '\n' )) { skipLF = false; } if (skipLF) { skipLF = false; ++src; } else if (( chbegin == '\n' ) || ( chbegin == '\r' )) { processToken(); /* Check for MS-DOS CRLF sequence */ if (chbegin == '\r') { skipLF = true; } ++src; } else { *dest++ = src[0]; ++src; } }}void HTMLTokenizer::parseEntity(DOMStringIt &src, QChar *&dest, bool start){ if( start ) { entityPos = 0; charEntity = true; } while( src.length() ) { if(entityPos > 8) { checkBuffer(10); // entity too long, ignore and insert as is *dest++ = '&'; memcpy(dest, entityBuffer, entityPos*sizeof(QChar)); dest += entityPos; if ( pre ) prePos += entityPos+1; charEntity = false; return; } if( (src[0].lower() >= 'a' && src[0].lower() <= 'z') || (src[0] >= '0' && src[0] <= '9') || src[0] == '#' ) { entityBuffer[entityPos] = src[0]; entityPos++; ++src; } else // end of entity... try to decode it { QChar res; if(entityPos > 1) { QConstString cStr(entityBuffer, entityPos); res = charsets->fromEntity(cStr.string()); //kdDebug( 6036 ) << "ENTITY " << res.unicode() << ", " << res << endl; if (tag && src[0] != ';' ) { // Don't translate entities in tags with a missing ';' res = QChar::null; } // Partial support for MS Windows Latin-1 extensions // full list http://www.bbsinc.com/iso8859.html // There may be better equivalents if ( res != QChar::null ) { switch (res.unicode()) { case 0x82: res = ','; break; case 0x84: res = '"'; break; case 0x8b: res = '<'; break; case 0x9b: res = '>'; break; case 0x91: res = '\''; break; case 0x92: res = '\''; break; case 0x93: res = '"'; break; case 0x94: res = '"'; break; case 0x95: res = 0xb7; break; case 0x96: res = '-'; break; case 0x97: res = '-'; break; case 0x98: res = '~'; break; default: break; } } } if ( res != QChar::null ) { checkBuffer(); // Just insert it *dest++ = res; if (pre) prePos++; if (src[0] == ';') ++src; } else {#ifdef TOKEN_DEBUG kdDebug( 6036 ) << "unknown entity!" << endl;#endif checkBuffer(10); // ignore the sequence, add it to the buffer as plaintext *dest++ = '&'; memcpy(dest, entityBuffer, entityPos*sizeof(QChar)); dest += entityPos; charEntity = false; if (pre) prePos += entityPos+1; } charEntity = false; return; } }}void HTMLTokenizer::parseTag(DOMStringIt &src){ if (charEntity) parseEntity(src,dest); while ( src.length() ) { checkBuffer(); char curchar = src[0].latin1(); // decide if quoted or not.... if ( curchar == '\"' || curchar == '\'' ) { // we treat " & ' the same in tags if ( tquote == NoQuote ) { // according to HTML4 DTD, we can simplify // strings like " my \nstring " to "my string" discard = SpaceDiscard; // ignore leading spaces pending = NonePending; if (curchar == '\'') tquote = SingleQuote; else tquote = DoubleQuote; } else if ( (( tquote == SingleQuote )&&( curchar == '\'')) || (( tquote == DoubleQuote )&&( curchar == '\"')) ) { tquote = IgnoreQuote; discard = NoneDiscard; pending = NonePending; // remove space at the end of value } else if (tquote == IgnoreQuote) { // we remove additional quotes directly following the // end of the quoted section. Helps with bad html as // <tag attr="value"" nextattr="..." ...> } else { *dest++ = src[0]; } ++src; } else if ( discard != NoneDiscard && ( curchar == ' ' || curchar == '\t' || curchar == '\n' || curchar == '\r' ) ) { pending = SpacePending; ++src; } else { if (tquote == IgnoreQuote) tquote = NoQuote; switch(tag) { case NoTag: { return; } case TagName: { if( tquote ) {#ifdef TOKEN_DEBUG kdDebug( 6036 ) << "bad HTML in parseTag: TagName" << endl;#endif searchCount = 0; ++src; break; } if (searchCount > 0) { if (src[0] == commentStart[searchCount]) { searchCount++; if (searchCount == 4) {#ifdef TOKEN_DEBUG kdDebug( 6036 ) << "Found comment" << endl;#endif // Found '<!--' sequence ++src; dest = buffer; // ignore the previous part of this tag comment = true; searchCount = 0; searchFor = commentEnd; scriptCode = QT_ALLOC_QCHAR_VEC( 1024 ); scriptCodeSize = 0; scriptCodeMaxSize = 1024; tag = NoTag; parseComment(src); return; // Finished parsing tag! } *dest = src[0].lower(); dest++; ++src; break; } else { searchCount = 0; // Stop looking for '<!--' sequence } } curchar = src[0].latin1(); if( ((curchar >= 'a') && (curchar <= 'z')) || ((curchar >= 'A') && (curchar <= 'Z')) || ((curchar >= '0') && (curchar <= '9')) || curchar == '/' ) { *dest = src[0].lower(); dest++; ++src; } else { int len = dest - buffer; bool beginTag; QChar *ptr = buffer; if ((len > 0) && (*ptr == '/')) { // End Tag beginTag = false; ptr++; len--; } else { // Start Tag beginTag = true; // Ignore CR/LF's after a start tag discard = LFDiscard; } // limited xhtml support. Accept empty xml tags like <br/> if((len > 1) && (*(dest-1) == '/')) len--; QConstString tmp(ptr, len); uint tagID = khtml::getTagID(tmp.string().ascii(), len); if (!tagID) {#ifdef TOKEN_DEBUG kdDebug( 6036 ) << "Unknown tag: \"" << tmp.string() << "\"" << endl;#endif dest = buffer; tag = SearchEnd; // ignore the tag } else {#ifdef TOKEN_DEBUG kdDebug( 6036 ) << "found tag id=" << tagID << endl;#endif if (beginTag) { currToken->id = tagID; tag = SearchAttribute; } else { currToken->id = tagID + ID_CLOSE_TAG; tag = SearchEnd; } dest = buffer; } } break; } case SearchAttribute: { if( tquote ) {#ifdef TOKEN_DEBUG kdDebug( 6036 ) << "broken HTML in parseTag: SearchAttribute " << endl;#endif tquote=NoQuote; ++src; break; } curchar = src[0].latin1(); if( curchar == '>' ) { tag = SearchEnd; // we reached the end break; } if( !curchar ) // we ignore everything that isn't ascii { ++src; break; } if( ((curchar >= 'a') && (curchar <= 'z')) || ((curchar >= 'A') && (curchar <= 'Z')) || ((curchar >= '0') && (curchar <= '9')) || curchar == '-' ) { tag = AttributeName; discard = NoneDiscard; break; } ++src; // ignore break; } case AttributeName: { if( (((curchar >= 'a') && (curchar <= 'z')) || ((curchar >= 'A') && (curchar <= 'Z')) || ((curchar >= '0') && (curchar <= '9')) || curchar == '-') && !tquote ) { *dest = src[0].lower(); dest++; ++src; } else { // beginning of name QChar *ptr = buffer; attrName = QString(ptr, dest-buffer); uint a = khtml::getAttrID(attrName.ascii(), dest-buffer); dest = buffer; *dest++ = a; if (!a) {#ifdef TOKEN_DEBUG kdDebug( 6036 ) << "Unknown attribute: \"" << attrName << "\"" << endl;#endif } else {#ifdef TOKEN_DEBUG
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -