📄 htmltokenizer.cpp
字号:
}
else if (scriptCodeSize > 3 && scriptCode[scriptCodeSize-4] == '-' && scriptCode[scriptCodeSize-3] == '-' &&
scriptCode[scriptCodeSize-2] == '!') {
// Other browsers will accept --!> as a close comment, even though it's
// not technically valid.
endCharsCount = 4;
}
}
if (canClose || handleBrokenComments || endCharsCount > 1) {
++src;
if (!( script || xmp || textarea || style)) {
if (includesCommentsInDOM) {
RETURN_IF_OOM( checkScriptBuffer() );
scriptCode[ scriptCodeSize ] = 0;
scriptCode[ scriptCodeSize + 1 ] = 0;
currToken.id = ID_COMMENT;
processListing(TokenizerString(scriptCode, scriptCodeSize - endCharsCount));
processToken();
currToken.id = ID_COMMENT + ID_CLOSE_TAG;
processToken();
}
scriptCodeSize = 0;
}
comment = false;
return; // Finished parsing comment
}
}
++src;
}
}
#endif
// Consumes the body of a server include. Every character read is appended to
// scriptCode; parsing stops once a '>' arrives directly after a '%'.
// If src runs dry first, the `server` flag is left untouched so a later call
// can carry on with the next chunk of input.
void HTMLTokenizer::parseServer(TokenizerString &src)
{
    // Reserve room for everything we might copy in this call.
    RETURN_IF_OOM( checkScriptBuffer(src.length()) );

    for ( ; !src.isEmpty(); ++src ) {
        scriptCode[ scriptCodeSize++ ] = *src;

        // The character just stored is '>' and the one stored before it is '%'.
        const bool closesInclude = src->unicode() == '>'
            && scriptCodeSize > 1
            && scriptCode[scriptCodeSize-2] == '%';
        if (!closesInclude)
            continue;

        // Step past the '>' and drop the collected text.
        ++src;
        server = false;
        scriptCodeSize = 0;
        return; // End of the server include
    }
}
// Skips over a processing instruction, tracking quote state in tquote.
// The instruction ends at "?>"; because some pages leave out the '?', an
// unquoted '>' is accepted as the terminator as well (IE compatible).
void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src)
{
    char previous = 0; // character seen on the preceding iteration of this call

    for ( ; !src.isEmpty(); ++src ) {
        const unsigned char current = src->latin1();

        switch (current) {
        case '\'':
            // Toggle single-quote state.
            tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
            break;
        case '\"':
            // Toggle double-quote state.
            tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
            break;
        case '>':
            if ( !tquote || previous == '?' ) {
                // Either a real "?>" or a bare '>' outside of quotes.
                processingInstruction = false;
                ++src;
                discard = LFDiscard;
                return; // End of the processing instruction
            }
            break;
        default:
            break;
        }

        previous = current;
    }
}
// Copies plain text from src into the output buffer at dest, normalising line
// endings on the way: "\r\n", a lone '\r' and a lone '\n' each produce a
// single '\n'. skipLF remembers that the previous character was '\r'.
void HTMLTokenizer::parseText(TokenizerString &src)
{
    for ( ; !src.isEmpty(); ++src ) {
        // Grow the output buffer if necessary.
        RETURN_IF_OOM( checkBuffer() );

        // Latin-1 is sufficient: only ASCII comparisons are made below.
        const unsigned char current = src->latin1();

        // A '\n' directly after a '\r' was already emitted with the '\r'.
        const bool swallowLF = skipLF && current == '\n';
        skipLF = false;
        if (swallowLF)
            continue;

        if (current == '\r') {
            // Emit the newline now and remember to drop a following '\n'.
            skipLF = true;
            *dest++ = '\n';
        }
        else if (current == '\n') {
            *dest++ = '\n';
        }
        else {
            *dest = *src;
            if (src->unicode() >= 0x0080)
                fixUpChar(*dest);
            ++dest;
        }
    }
}
// Parses a character reference (entity) from src.
//
// The characters of the reference are collected in cBuffer and the decoded
// value in EntityUnicodeValue; the parser state lives in the member `Entity`,
// so parsing can stop when src runs out and continue on a later call.
//
// src   - input; consumed as the reference is recognised. A successfully
//         decoded character is pushed back onto src (src.push) rather than
//         written to dest.
// dest  - output position; only written when the reference is NOT recognised,
//         in which case '&' followed by the collected characters is emitted
//         as plain text and dest is advanced past them.
// start - true to begin a new reference (resets cBufferPos, Entity and
//         EntityUnicodeValue); false to resume in the state left in Entity.
//
// NOTE(review): the leading '&' is presumably consumed by the caller before
// this function runs (it is never stored in cBuffer) - confirm at call sites.
void HTMLTokenizer::parseEntity(TokenizerString &src, QChar *&dest, bool start)
{
    if( start )
    {
        cBufferPos = 0;
        Entity = SearchEntity;
        EntityUnicodeValue = 0;
    }

    while( !src.isEmpty() )
    {
        ushort cc = src->unicode();
        switch(Entity) {
        case NoEntity:
            // Must never be entered without an entity in progress.
            assert(Entity != NoEntity);
            return;

        case SearchEntity:
            // '#' introduces a numeric reference; anything else is a name.
            if(cc == '#') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = NumericSearch;
            }
            else
                Entity = EntityName;
            break;

        case NumericSearch:
            // After '#': 'x'/'X' selects hexadecimal, a digit selects decimal.
            if(cc == 'x' || cc == 'X') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = Hexadecimal;
            }
            else if(cc >= '0' && cc <= '9')
                Entity = Decimal;
            else
                Entity = SearchSemicolon;
            break;

        case Hexadecimal:
        {
            // Accumulate at most 8 hex digits from what is currently available.
            int ll = kMin(src.length(), 8);
            while(ll--) {
                QChar csrc(src->lower());
                cc = csrc.cell();

                if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
                    break;
                }
                EntityUnicodeValue = EntityUnicodeValue*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            Entity = SearchSemicolon;
            break;
        }

        case Decimal:
        {
            // Accumulate decimal digits until cBuffer holds 9 characters.
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                cc = src->cell();

                if(src->row() || !(cc >= '0' && cc <= '9')) {
                    Entity = SearchSemicolon;
                    break;
                }

                EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            if(cBufferPos == 9) Entity = SearchSemicolon;
            break;
        }

        case EntityName:
        {
            // Collect up to 9 alphanumeric name characters.
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                QChar csrc = *src;
                cc = csrc.cell();

                if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
                                   (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                    Entity = SearchSemicolon;
                    break;
                }

                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            if(cBufferPos == 9) Entity = SearchSemicolon;
            if(Entity == SearchSemicolon) {
                if(cBufferPos > 1) {
                    const entity *e = findEntity(cBuffer, cBufferPos);
                    if(e)
                        EntityUnicodeValue = e->code;

                    // be IE compatible: inside a tag, a named entity above 255
                    // only counts when it is terminated by ';'. The loop above
                    // may have drained src, so do not dereference it blindly.
                    if(tag && EntityUnicodeValue > 255 && (src.isEmpty() || *src != ';'))
                        EntityUnicodeValue = 0;
                }
            }
            else
                break; // name may continue in the next chunk of input
        }
        // fall through: the name is complete, finish the entity now

        case SearchSemicolon:
            //kdDebug( 6036 ) << "ENTITY " << EntityUnicodeValue << ", " << res << endl;

            // Accept only real Unicode scalar values: non-zero, not a surrogate
            // code point, and no larger than U+10FFFF. Anything above U+10FFFF
            // cannot be represented by the UTF-16 conversion below (the lead
            // unit would spill into the trail-surrogate range).
            if ((EntityUnicodeValue > 0 && EntityUnicodeValue < 0xD800)
                    || (EntityUnicodeValue >= 0xE000 && EntityUnicodeValue <= 0x10FFFF)) {

                // Swallow the optional terminating ';' if it is available.
                if (!src.isEmpty() && *src == ';')
                    ++src;

                if (EntityUnicodeValue <= 0xFFFF) {
                    QChar c(EntityUnicodeValue);
                    if (c.unicode() >= 0x0080)
                        fixUpChar(c);
                    RETURN_IF_OOM( checkBuffer() );
                    src.push(c);
                } else {
                    // Convert to UTF-16, using surrogate code points.
                    QChar c1(0xD800 | (((EntityUnicodeValue >> 16) - 1) << 6) | ((EntityUnicodeValue >> 10) & 0x3F));
                    QChar c2(0xDC00 | (EntityUnicodeValue & 0x3FF));
                    RETURN_IF_OOM( checkBuffer(2) );
                    src.push(c1);
                    src.push(c2);
                }
            } else {
#ifdef TOKEN_DEBUG
                kdDebug( 6036 ) << "unknown entity!" << endl;
#endif
                // NOTE(review): cBufferPos can reach 10 on the hexadecimal path
                // ('#', 'x' and 8 digits), so up to 11 characters are written
                // below - confirm that checkBuffer(10) leaves enough room.
                RETURN_IF_OOM( checkBuffer(10) );
                // ignore the sequence, add it to the buffer as plaintext
                *dest++ = '&';
                for(unsigned int i = 0; i < cBufferPos; i++)
                    dest[i] = cBuffer[i];
                dest += cBufferPos;
            }

            Entity = NoEntity;
            return;
        }
    }
}
void HTMLTokenizer::parseTag(TokenizerString &src)
{
assert(!Entity );
#ifdef NOKIA_CHANGES
slashAtEndOfTag = false;
#endif
while ( !src.isEmpty() )
{
RETURN_IF_OOM( checkBuffer() );
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
uint l = 0;
while(l < src.length() && (*(src.current()+l)).latin1() != '>')
l++;
qDebug("src is now: *%s*, tquote: %d",
QConstString((QChar*)src.current(), l).string().latin1(), tquote);
#endif
switch(tag) {
case NoTag:
{
return;
}
case TagName:
{
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
qDebug("TagName");
#endif
if (searchCount > 0)
{
if (*src == commentStart[searchCount])
{
searchCount++;
if (searchCount == 4)
{
#ifdef TOKEN_DEBUG
kdDebug( 6036 ) << "Found comment" << endl;
#endif
// Found '<!--' sequence
++src;
dest = buffer; // ignore the previous part of this tag
comment = true;
tag = NoTag;
// Fix bug 34302 at kde.bugs.org. Go ahead and treat
// <!--> as a valid comment, since both mozilla and IE on windows
// can handle this case. Only do this in quirks mode. -dwh
if (!src.isEmpty() && *src == '>' && parser->doc()->inCompatMode()) {
comment = false;
++src;
if (!src.isEmpty())
cBuffer[cBufferPos++] = src->cell();
}
else
parseComment(src);
return; // Finished parsing tag!
}
// cuts of high part, is okay
cBuffer[cBufferPos++] = src->cell();
++src;
break;
}
else
searchCount = 0; // Stop looking for '<!--' sequence
}
bool finish = false;
unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
while(ll--) {
ushort curchar = *src;
if(curchar <= ' ' || curchar == '>' ) {
finish = true;
break;
}
// Use tolower() instead of | 0x20 to lowercase the char because there is no
// performance gain in using | 0x20 since tolower() is optimized and
// | 0x20 turns characters such as '_' into junk.
cBuffer[cBufferPos++] = tolower(curchar);
++src;
}
// Disadvantage: we add the possible rest of the tag
// as attribute names. ### judge if this causes problems
if(finish || CBUFLEN == cBufferPos) {
bool beginTag;
char* ptr = cBuffer;
unsigned int len = cBufferPos;
cBuffer[cBufferPos] = '\0';
if ((cBufferPos > 0) && (*ptr == '/'))
{
// End Tag
beginTag = false;
ptr++;
len--;
}
else
// Start Tag
beginTag = true;
// Accept empty xml tags like <br/>. We trim off the "/" so that when we call
// getTagID, we'll look up "br" as the tag name and not "br/". This will also
// handle a case when some wrong tags such as <title/> is found and ther is no
// end tag.
if(len > 1 && ptr[len-1] == '/' )
{
#ifdef NOKIA_CHANGES
slashAtEndOfTag = true;
#endif
ptr[--len] = '\0';
}
// Look up the tagID for the specified tag name (now that we've shaved off any
// invalid / that might have followed the name).
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -