📄 htmltokenizer.cpp

📁 最新Nokia手机浏览器全套源代码完美版。
💻 CPP
📖 第 1 页 / 共 5 页
字号:
                }
                else if (scriptCodeSize > 3 && scriptCode[scriptCodeSize-4] == '-' && scriptCode[scriptCodeSize-3] == '-' &&
                    scriptCode[scriptCodeSize-2] == '!') {
                    // Other browsers will accept --!> as a close comment, even though it's
                    // not technically valid.
                    endCharsCount = 4;
                }
            }
            if (canClose || handleBrokenComments || endCharsCount > 1) {
                ++src;
                if (!( script || xmp || textarea || style)) {
                    if (includesCommentsInDOM) {
                        RETURN_IF_OOM( checkScriptBuffer() );
                        scriptCode[ scriptCodeSize ] = 0;
                        scriptCode[ scriptCodeSize + 1 ] = 0;
                        currToken.id = ID_COMMENT;
                        processListing(TokenizerString(scriptCode, scriptCodeSize - endCharsCount));
                        processToken();
                        currToken.id = ID_COMMENT + ID_CLOSE_TAG;
                        processToken();
                    }
                    scriptCodeSize = 0;
                }
                comment = false;
                return; // Finished parsing comment
            }
        }
        ++src;
    }
}
#endif

void HTMLTokenizer::parseServer(TokenizerString &src)
{
    RETURN_IF_OOM( checkScriptBuffer(src.length()) );
    while ( !src.isEmpty() ) {
        scriptCode[ scriptCodeSize++ ] = *src;
        if (src->unicode() == '>' &&
            scriptCodeSize > 1 && scriptCode[scriptCodeSize-2] == '%') {
            ++src;
            server = false;
            scriptCodeSize = 0;
            return; // Finished parsing server include
        }
        ++src;
    }
}

void HTMLTokenizer::parseProcessingInstruction(TokenizerString &src)
{
    char oldchar = 0;
    while ( !src.isEmpty() )
    {
        unsigned char chbegin = src->latin1();
        if(chbegin == '\'') {
            tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
        }
        else if(chbegin == '\"') {
            tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
        }
        // Look for '?>'
        // some crappy sites omit the "?" before it, so
        // we look for an unquoted '>' instead. (IE compatible)
        else if ( chbegin == '>' && ( !tquote || oldchar == '?' ) )
        {
            // We got a '?>' sequence
            processingInstruction = false;
            ++src;
            discard=LFDiscard;
            return; // Finished parsing comment!
        }
        ++src;
        oldchar = chbegin;
    }
}

void HTMLTokenizer::parseText(TokenizerString &src)
{
    while ( !src.isEmpty() )
    {
        // do we need to enlarge the buffer?
        RETURN_IF_OOM( checkBuffer() );

        // ascii is okay because we only do ascii comparisons
        unsigned char chbegin = src->latin1();

        if (skipLF && ( chbegin != '\n' ))
        {
            skipLF = false;
        }

        if (skipLF)
        {
            skipLF = false;
            ++src;
        }
        else if (( chbegin == '\n' ) || ( chbegin == '\r' ))
        {
            if (chbegin == '\r')
                skipLF = true;

            *dest++ = '\n';
            ++src;
        }
        else {
            *dest = *src;
            if (src->unicode() >= 0x0080)
                fixUpChar(*dest);
            ++dest;
            ++src;
        }
    }
}


void HTMLTokenizer::parseEntity(TokenizerString &src, QChar *&dest, bool start)
{
    if( start )
    {
        cBufferPos = 0;
        Entity = SearchEntity;
        EntityUnicodeValue = 0;
    }

    while( !src.isEmpty() )
    {
        ushort cc = src->unicode();
        switch(Entity) {
        case NoEntity:
            assert(Entity != NoEntity);
            return;

        case SearchEntity:
            if(cc == '#') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = NumericSearch;
            }
            else
                Entity = EntityName;

            break;

        case NumericSearch:
            if(cc == 'x' || cc == 'X') {
                cBuffer[cBufferPos++] = cc;
                ++src;
                Entity = Hexadecimal;
            }
            else if(cc >= '0' && cc <= '9')
                Entity = Decimal;
            else
                Entity = SearchSemicolon;

            break;

        case Hexadecimal:
        {
            int ll = kMin(src.length(), 8);
            while(ll--) {
                QChar csrc(src->lower());
                cc = csrc.cell();

                if(csrc.row() || !((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f'))) {
                    break;
                }
                EntityUnicodeValue = EntityUnicodeValue*16 + (cc - ( cc < 'a' ? '0' : 'a' - 10));
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            Entity = SearchSemicolon;
            break;
        }
        case Decimal:
        {
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                cc = src->cell();

                if(src->row() || !(cc >= '0' && cc <= '9')) {
                    Entity = SearchSemicolon;
                    break;
                }

                EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            if(cBufferPos == 9)  Entity = SearchSemicolon;
            break;
        }
        case EntityName:
        {
            int ll = kMin(src.length(), 9-cBufferPos);
            while(ll--) {
                QChar csrc = *src;
                cc = csrc.cell();

                if(csrc.row() || !((cc >= 'a' && cc <= 'z') ||
                                   (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                    Entity = SearchSemicolon;
                    break;
                }

                cBuffer[cBufferPos++] = cc;
                ++src;
            }
            if(cBufferPos == 9) Entity = SearchSemicolon;
            if(Entity == SearchSemicolon) {
                if(cBufferPos > 1) {
                    const entity *e = findEntity(cBuffer, cBufferPos);
                    if(e)
                        EntityUnicodeValue = e->code;

                    // be IE compatible
                    if(tag && EntityUnicodeValue > 255 && *src != ';')
                        EntityUnicodeValue = 0;
                }
            }
            else
                break;
        }
        case SearchSemicolon:

            //kdDebug( 6036 ) << "ENTITY " << EntityUnicodeValue << ", " << res << endl;

            // Don't allow surrogate code points, or values that are more than 21 bits.
            if ((EntityUnicodeValue > 0 && EntityUnicodeValue < 0xD800)
                    || (EntityUnicodeValue >= 0xE000 && EntityUnicodeValue <= 0x1FFFFF)) {

                if (*src == ';')
                    ++src;

                if (EntityUnicodeValue <= 0xFFFF) {
                    QChar c(EntityUnicodeValue);
                    if (c.unicode() >= 0x0080)
                        fixUpChar(c);
                    RETURN_IF_OOM( checkBuffer() );
                    src.push(c);
                } else {
                    // Convert to UTF-16, using surrogate code points.
                    QChar c1(0xD800 | (((EntityUnicodeValue >> 16) - 1) << 6) | ((EntityUnicodeValue >> 10) & 0x3F));
                    QChar c2(0xDC00 | (EntityUnicodeValue & 0x3FF));
                    RETURN_IF_OOM( checkBuffer(2) );
                    src.push(c1);
                    src.push(c2);
                }

            } else {
#ifdef TOKEN_DEBUG
                kdDebug( 6036 ) << "unknown entity!" << endl;
#endif
                RETURN_IF_OOM( checkBuffer(10) );
                // ignore the sequence, add it to the buffer as plaintext
                *dest++ = '&';
                for(unsigned int i = 0; i < cBufferPos; i++)
                    dest[i] = cBuffer[i];
                dest += cBufferPos;
            }

            Entity = NoEntity;
            return;
        }
    }
}

void HTMLTokenizer::parseTag(TokenizerString &src)
{
    assert(!Entity );
#ifdef NOKIA_CHANGES
    slashAtEndOfTag = false;
#endif

    while ( !src.isEmpty() )
    {
        RETURN_IF_OOM( checkBuffer() );
#if defined(TOKEN_DEBUG) && TOKEN_DEBUG > 1
        uint l = 0;
        while(l < src.length() && (*(src.current()+l)).latin1() != '>')
            l++;
        qDebug("src is now: *%s*, tquote: %d",
               QConstString((QChar*)src.current(), l).string().latin1(), tquote);
#endif
        switch(tag) {
        case NoTag:
        {
            return;
        }
        case TagName:
        {
#if defined(TOKEN_DEBUG) &&  TOKEN_DEBUG > 1
            qDebug("TagName");
#endif
            if (searchCount > 0)
            {
                if (*src == commentStart[searchCount])
                {
                    searchCount++;
                    if (searchCount == 4)
                    {
#ifdef TOKEN_DEBUG
                        kdDebug( 6036 ) << "Found comment" << endl;
#endif
                        // Found '<!--' sequence
                        ++src;
                        dest = buffer; // ignore the previous part of this tag
                        comment = true;
                        tag = NoTag;

                        // Fix bug 34302 at kde.bugs.org.  Go ahead and treat
                        // <!--> as a valid comment, since both mozilla and IE on windows
                        // can handle this case.  Only do this in quirks mode. -dwh
                        if (!src.isEmpty() && *src == '>' && parser->doc()->inCompatMode()) {
                          comment = false;
                          ++src;
                          if (!src.isEmpty())
                              cBuffer[cBufferPos++] = src->cell();
                        }
            else
                          parseComment(src);

                        return; // Finished parsing tag!
                    }
                    // cuts of high part, is okay
                    cBuffer[cBufferPos++] = src->cell();
                    ++src;
                    break;
                }
                else
                    searchCount = 0; // Stop looking for '<!--' sequence
            }

            bool finish = false;
            unsigned int ll = kMin(src.length(), CBUFLEN-cBufferPos);
            while(ll--) {
                ushort curchar = *src;
                if(curchar <= ' ' || curchar == '>' ) {
                    finish = true;
                    break;
                }
                // Use tolower() instead of | 0x20 to lowercase the char because there is no
                // performance gain in using | 0x20 since tolower() is optimized and
                // | 0x20 turns characters such as '_' into junk.
                cBuffer[cBufferPos++] = tolower(curchar);
                ++src;
            }

            // Disadvantage: we add the possible rest of the tag
            // as attribute names. ### judge if this causes problems
            if(finish || CBUFLEN == cBufferPos) {
                bool beginTag;
                char* ptr = cBuffer;
                unsigned int len = cBufferPos;
                cBuffer[cBufferPos] = '\0';
                if ((cBufferPos > 0) && (*ptr == '/'))
                {
                    // End Tag
                    beginTag = false;
                    ptr++;
                    len--;
                }
                else
                    // Start Tag
                    beginTag = true;

                // Accept empty xml tags like <br/>.  We trim off the "/" so that when we call
                // getTagID, we'll look up "br" as the tag name and not "br/". This will also
                // handle a case when some wrong tags such as <title/> is found and ther is no
                // end tag.

                if(len > 1 && ptr[len-1] == '/' )
                {
#ifdef NOKIA_CHANGES
                 slashAtEndOfTag = true;
#endif
                 ptr[--len] = '\0';
                }

                // Look up the tagID for the specified tag name (now that we've shaved off any
                // invalid / that might have followed the name).
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -