📄 decoder.cpp

📁 khtml在gtk上的移植版本
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
                // This is the incorrect end of comment that other browsers allow, "--!>".                if (p[1] == '-' && p[2] == '!' && p[3] == '>') {                    p += 4;                    break;                }            }            p++;        }    }    ptr = p;}// Returns the position of the encoding string.static int findXMLEncoding(const QCString &str, int &encodingLength){    int len = str.length();    int pos = str.find("encoding");    if (pos == -1)        return -1;    pos += 8;        // Skip spaces and stray control characters.    while (str[pos] <= ' ' && pos != len)        ++pos;    // Skip equals sign.    if (str[pos] != '=')        return -1;    ++pos;    // Skip spaces and stray control characters.    while (str[pos] <= ' ' && pos != len)        ++pos;    // Skip quotation mark.    char quoteMark = str[pos];    if (quoteMark != '"' && quoteMark != '\'')        return -1;    ++pos;    // Find the trailing quotation mark.    int end = pos;    while (str[end] != quoteMark)        ++end;    if (end == len)        return -1;        encodingLength = end - pos;    return pos;}QString Decoder::decode(const char *data, int len){    // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.    int bufferLength = buffer.length();    const int maximumBOMLength = 3;    if (beginning && bufferLength + len >= maximumBOMLength) {        if (m_type != UserChosenEncoding) {            // Extract the first three bytes.            // Handle the case where some of bytes are already in the buffer.            // The last byte is always guaranteed to not be in the buffer.            const uchar *udata = (const uchar *)data;            uchar c1 = bufferLength >= 1 ? (uchar)buffer[0] : *udata++;            uchar c2 = bufferLength >= 2 ? (uchar)buffer[1] : *udata++;            assert(bufferLength < 3);            uchar c3 = *udata;            // Check for the BOM.            const char *autoDetectedEncoding;            if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {                autoDetectedEncoding = "ISO-10646-UCS-2";            } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {                autoDetectedEncoding = "UTF-8";            } else {                autoDetectedEncoding = 0;            }            // If we found a BOM, use the encoding it implies.            if (autoDetectedEncoding != 0) {                m_type = AutoDetectedEncoding;                m_codec = QTextCodec::codecForName(autoDetectedEncoding);                assert(m_codec);                enc = m_codec->name();                delete m_decoder;                m_decoder = m_codec->makeDecoder();            }        }        beginning = false;    }        // this is not completely efficient, since the function might go    // through the html head several times...    bool lookForMetaTag = m_type == DefaultEncoding && !body;        if (lookForMetaTag) {#ifdef DECODE_DEBUG        kdDebug(6005) << "looking for charset definition" << endl;#endif        { // extra level of braces to keep indenting matching original for better diff'ing#if APPLE_CHANGES            buffer.append(data, len);#else            if(m_codec->mibEnum() != 1000) // utf16            {                // ### hack for a bug in QTextCodec. It cut's the input stream                // in case there are \0 in it. ZDNET has them inside... :-(                char *d = const_cast<char *>(data);                int i = len - 1;                while(i >= 0) {                    if(*(d+i) == 0) *(d+i) = ' ';                    i--;                }            }            buffer += QCString(data, len+1);#endif            // we still don't have an encoding, and are in the head            // the following tags are allowed in <head>:            // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE#if APPLE_CHANGES            const char *ptr = buffer.latin1();            const char *pEnd = ptr + buffer.length();#else            const char *ptr = buffer.data();            const char *pEnd = buffer.length();#endif            while(ptr != pEnd)            {                if(*ptr == '<') {                    bool end = false;                    ptr++;                    // Handle comments.                    if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {                        ptr += 3;                        skipComment(ptr, pEnd);                        continue;                    }                                        // Handle XML header, which can have encoding in it.                    if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {                        const char *end = ptr;                        while (*end != '>' && *end != '\0') end++;                        if (*end == '\0')                            break;                        QCString str(ptr, end - ptr);                        int len;                        int pos = findXMLEncoding(str, len);                        if (pos != -1) {                            setEncoding(str.mid(pos, len), EncodingFromXMLHeader);                            if (m_type == EncodingFromXMLHeader)                                goto found;                        }                    }                    if(*ptr == '/') ptr++, end=true;                    char tmp[20];                    int len = 0;                    while (                        ((*ptr >= 'a') && (*ptr <= 'z') ||                         (*ptr >= 'A') && (*ptr <= 'Z') ||                         (*ptr >= '0') && (*ptr <= '9'))                        && len < 19 )                    {                        tmp[len] = tolower( *ptr );                        ptr++;                        len++;                    }		    tmp[len] = 0;                    int id = khtml::getTagID(tmp, len);                    if(end) id += ID_CLOSE_TAG;                    switch( id ) {                    case ID_META:                    {                        // found a meta tag...                        //ptr += 5;                        const char * end = ptr;                        while(*end != '>' && *end != '\0') end++;                        if ( *end == '\0' ) break;                        QCString str( ptr, (end-ptr)+1);                        str = str.lower();                        int pos = 0;                        //if( (pos = str.find("http-equiv", pos)) == -1) break;                        //if( (pos = str.find("content-type", pos)) == -1) break;			while( pos < ( int ) str.length() ) {			    if( (pos = str.find("charset", pos, false)) == -1) break;			    pos += 7;                            // skip whitespace..			    while(  pos < (int)str.length() && str[pos] <= ' ' ) pos++;                            if ( pos == ( int )str.length()) break;                            if ( str[pos++] != '=' ) continue;                            while ( pos < ( int )str.length() &&                                    ( str[pos] <= ' ' ) || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'')				pos++;                            // end ?                            if ( pos == ( int )str.length() ) break;			    uint endpos = pos;			    while( endpos < str.length() &&                                   (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''                                    && str[endpos] != ';' && str[endpos] != '>') )				endpos++;#ifdef DECODE_DEBUG			    kdDebug( 6005 ) << "Decoder: found charset: " << str.mid(pos, endpos-pos) << endl;#endif			    setEncoding(str.mid(pos, endpos-pos), EncodingFromMetaTag);			    if( m_type == EncodingFromMetaTag ) goto found;                            if ( endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>' ) break;			    pos = endpos + 1;			}		    }                    case ID_SCRIPT:                    case (ID_SCRIPT+ID_CLOSE_TAG):                    case ID_NOSCRIPT:                    case (ID_NOSCRIPT+ID_CLOSE_TAG):                    case ID_STYLE:                    case (ID_STYLE+ID_CLOSE_TAG):                    case ID_LINK:                    case (ID_LINK+ID_CLOSE_TAG):                    case ID_OBJECT:                    case (ID_OBJECT+ID_CLOSE_TAG):                    case ID_TITLE:                    case (ID_TITLE+ID_CLOSE_TAG):                    case ID_BASE:                    case (ID_BASE+ID_CLOSE_TAG):                    case ID_HTML:                    case ID_HEAD:                    case 0:                    case (0 + ID_CLOSE_TAG ):                        break;                    default:                        body = true;#ifdef DECODE_DEBUG			kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl;#endif                        goto found;                    }                }                else                    ptr++;            }            return QString::null;        }    } found:#if APPLE_CHANGES    // Do the auto-detect if our default encoding is one of the Japanese ones.    if (m_type != UserChosenEncoding && m_type != AutoDetectedEncoding && m_codec && m_codec->isJapanese())#else    if (m_type == DefaultEncoding && KGlobal::locale()->languageList()[0] == "ja")#endif    {#ifdef DECODE_DEBUG	kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl;#endif	const char *autoDetectedEncoding;        switch ( KanjiCode::judge( data, len ) ) {	case KanjiCode::JIS:	    autoDetectedEncoding = "jis7";	    break;	case KanjiCode::EUC:	    autoDetectedEncoding = "eucjp";	    break;	case KanjiCode::SJIS:	    autoDetectedEncoding = "sjis";	    break;	default:	    autoDetectedEncoding = NULL;	    break;	}#ifdef DECODE_DEBUG	kdDebug( 6005 ) << "Decoder: auto detect encoding is "            << (autoDetectedEncoding ? autoDetectedEncoding : "NULL") << endl;#endif	if (autoDetectedEncoding != 0) {	    setEncoding(autoDetectedEncoding, AutoDetectedEncoding);	}    }    // if we still haven't found an encoding latin1 will be used...    // this is according to HTML4.0 specs    if (!m_codec)    {        if(enc.isEmpty()) enc = "iso8859-1";        m_codec = QTextCodec::codecForName(enc);        // be sure not to crash        if(!m_codec) {            enc = "iso8859-1";            m_codec = QTextCodec::codecForName(enc);        }        delete m_decoder;        m_decoder = m_codec->makeDecoder();    }    QString out;#if APPLE_CHANGES    if (!buffer.isEmpty()) {        if (!lookForMetaTag)            buffer.append(data, len);        out = m_decoder->toUnicode(buffer.latin1(), buffer.length());        buffer.truncate(0);    } else {        out = m_decoder->toUnicode(data, len);    }#else    if(!buffer.isEmpty() && enc != "ISO-10646-UCS-2") {        out = m_decoder->toUnicode(buffer.latin1(), buffer.length());        buffer = "";    } else {        if(m_codec->mibEnum() != 1000) // utf16        {            // ### hack for a bug in QTextCodec. It cut's the input stream            // in case there are \0 in it. ZDNET has them inside... :-(            char *d = const_cast<char *>(data);            int i = len - 1;            while(i >= 0) {                if(*(d+i) == 0) *(d+i) = ' ';                i--;            }        }        out = m_decoder->toUnicode(data, len);    }    if (out.isNull()) {        fprintf(stderr, "ERROR:  decoded string is null\n");    } else if (out.length() == 0) {        fprintf(stderr, "ERROR:  decoded string length == 0\n");    }    // the hell knows, why the output does sometimes have a QChar::null at    // the end...    else if(out[out.length()-1] == QChar::null)        out.truncate(out.length() - 1);#endif    return out;}QString Decoder::flush() const{#if APPLE_CHANGES    return m_decoder->toUnicode(buffer.latin1(), buffer.length(), true);#else    return m_decoder->toUnicode(buffer.latin1(), buffer.length());#endif}// -----------------------------------------------------------------------------#undef DECODE_DEBUG
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -