📄 decoder.cpp

📁 konqueror3 embedded版本, KDE环境下的当家浏览器的嵌入式版本源码包.
💻 CPP
📖 第 1 页 / 共 2 页
字号:
上一页 12
                    case ID_META:                    {                        // found a meta tag...                        //ptr += 5;                        const char * end = ptr;                        while(*end != '>' && *end != '\0') end++;                        if ( *end == '\0' ) break;                        QCString str( ptr, (end-ptr)+1);                        str = str.lower();                        int pos = 0;                        //if( (pos = str.find("http-equiv", pos)) == -1) break;                        //if( (pos = str.find("content-type", pos)) == -1) break;			while( pos < ( int ) str.length() ) {			    if( (pos = str.find("charset", pos)) == -1) break;			    pos += 7;                            // skip whitespace..			    while(  pos < (int)str.length() && str[pos] <= ' ' ) pos++;                            if ( pos == ( int )str.length()) break;                            if ( str[pos++] != '=' ) continue;                            while ( pos < ( int )str.length() &&                                    ( str[pos] <= ' ' ) || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'')				pos++;                            // end ?                            if ( pos == ( int )str.length() ) break;			    uint endpos = pos;			    while( endpos < str.length() &&                                   (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''                                    && str[endpos] != ';' && str[endpos] != '>') )				endpos++;			    enc = str.mid(pos, endpos-pos);#ifdef DECODE_DEBUG			    kdDebug( 6005 ) << "Decoder: found charset: " << enc.data() << endl;#endif			    setEncoding(enc, EncodingFromMetaTag);			    if( m_type == EncodingFromMetaTag ) goto found;                            if ( endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>' ) break;			    pos = endpos + 1;			}		    }                    case ID_SCRIPT:                    case (ID_SCRIPT+ID_CLOSE_TAG):                    case ID_NOSCRIPT:                    case (ID_NOSCRIPT+ID_CLOSE_TAG):                    case ID_STYLE:                    case (ID_STYLE+ID_CLOSE_TAG):                    case ID_LINK:                    case (ID_LINK+ID_CLOSE_TAG):                    case ID_OBJECT:                    case (ID_OBJECT+ID_CLOSE_TAG):                    case ID_TITLE:                    case (ID_TITLE+ID_CLOSE_TAG):                    case ID_BASE:                    case (ID_BASE+ID_CLOSE_TAG):                    case ID_HTML:                    case ID_HEAD:                    case 0:                    case (0 + ID_CLOSE_TAG ):                        break;                    case ID_BODY:                    case (ID_HEAD+ID_CLOSE_TAG):                        body = true;#ifdef DECODE_DEBUG			kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl;#endif                        goto found;                    default:                        // Invalid tag in head. Let's be a little tolerant                        invalid++;                        if (invalid > 2)  {                            body = true;#ifdef DECODE_DEBUG                            kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl;#endif                            goto found;                        }                    }                }                else                    ptr++;            }            if (invalid > 0) {                body = true;                goto found;            }            return QString::null;        }    } found:    if (m_type == DefaultEncoding)    {#ifdef DECODE_DEBUG	kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl;#endif        switch ( m_autoDetectLanguage) {        case Decoder::Arabic:            enc = automaticDetectionForArabic( (const unsigned char*) data, len );            break;        case Decoder::Baltic:            enc = automaticDetectionForBaltic( (const unsigned char*) data, len );            break;        case Decoder::CentralEuropean:            enc = automaticDetectionForCentralEuropean( (const unsigned char*) data, len );            break;        case Decoder::Russian:        case Decoder::Ukrainian:            enc = automaticDetectionForCyrillic( (const unsigned char*) data, len, m_autoDetectLanguage );            break;        case Decoder::Greek:            enc = automaticDetectionForGreek( (const unsigned char*) data, len );            break;        case Decoder::Hebrew:            enc = automaticDetectionForHebrew( (const unsigned char*) data, len );            break;        case Decoder::Japanese:            enc = automaticDetectionForJapanese( (const unsigned char*) data, len );            break;        case Decoder::Turkish:            enc = automaticDetectionForTurkish( (const unsigned char*) data, len );            break;        case Decoder::WesternEuropean:            enc = automaticDetectionForWesternEuropean( (const unsigned char*) data, len );            break;        case Decoder::SemiautomaticDetection:        case Decoder::Chinese:        case Decoder::Korean:        case Decoder::Thai:        case Decoder::Unicode:            // huh. somethings broken in this code ### FIXME            enc = 0; //Reset invalid codec we tried, so we get back to latin1 fallback.            break;        }#ifdef DECODE_DEBUG        kdDebug( 6005 ) << "Decoder: auto detect encoding is " << enc.data() << endl;#endif        if ( !enc.isEmpty() )            setEncoding( enc.data(), AutoDetectedEncoding);    }    // if we still haven't found an encoding latin1 will be used...    // this is according to HTML4.0 specs    if (!m_codec)    {        if(enc.isEmpty()) enc = "iso8859-1";        m_codec = QTextCodec::codecForName(enc);        // be sure not to crash        if(!m_codec) {            m_codec = QTextCodec::codecForMib(4);            enc = "iso8859-1";        }        delete m_decoder;        m_decoder = m_codec->makeDecoder();    }    QString out;    if(!buffer.isEmpty() && enc != "ISO-10646-UCS-2") {        out = m_decoder->toUnicode(buffer, buffer.length());        buffer = "";    } else {        if(m_codec->mibEnum() != 1000) // utf16        {            // ### hack for a bug in QTextCodec. It cut's the input stream            // in case there are \0 in it. ZDNET has them inside... :-(            char *d = const_cast<char *>(data);            int i = len - 1;            while(i >= 0) {                if(*(d+i) == 0) *(d+i) = ' ';                i--;            }        }        out = m_decoder->toUnicode(data, len);    }    return out;}QString Decoder::flush() const{    return m_decoder->toUnicode(buffer, buffer.length());}QCString Decoder::automaticDetectionForArabic( const unsigned char* ptr, int size ){    for ( int i = 0; i < size; ++i ) {        if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) || ptr[ i ] == 0xA1 || ptr[ i ] == 0xA2 || ptr[ i ] == 0xA3             || ( ptr[ i ] >= 0xA5 && ptr[ i ] <= 0xAB ) || ( ptr[ i ] >= 0xAE && ptr[ i ] <= 0xBA )             || ptr[ i ] == 0xBC || ptr[ i ] == 0xBD || ptr[ i ] == 0xBE || ptr[ i ] == 0xC0             || ( ptr[ i ] >= 0xDB && ptr[ i ] <= 0xDF ) || ( ptr[ i ] >= 0xF3 ) ) {            return "cp1256";        }    }    return "iso-8859-6";}QCString Decoder::automaticDetectionForBaltic( const unsigned char* ptr, int size ){    for ( int i = 0; i < size; ++i ) {        if ( ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9E ) )             return "cp1257";        if ( ptr[ i ] == 0xA1 || ptr[ i ] == 0xA5 )            return "iso-8859-13";    }    return "iso-8859-13";}QCString Decoder::automaticDetectionForCentralEuropean(const unsigned char* ptr, int size ){    QCString charset = QCString();    for ( int i = 0; i < size; ++i ) {        if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {            if ( ptr[ i ] == 0x81 || ptr[ i ] == 0x83 || ptr[ i ] == 0x90 || ptr[ i ] == 0x98 )                return "ibm852";            if ( i + 1 > size )                return "cp1250";            else { // maybe ibm852 ?                charset = "cp1250";                continue;            }        }        if ( ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE || ptr[ i ] == 0xBE || ptr[ i ] == 0xC3 || ptr[ i ] == 0xD0 || ptr[ i ] == 0xE3 || ptr[ i ] == 0xF0 ) {            if ( i + 1 > size )                return "iso-8859-2";            else {  // maybe ibm852 ?                if ( charset.isNull() )                    charset = "iso-8859-2";                continue;            }        }    }    if ( charset.isNull() )        charset = "iso-8859-3";    return charset.data();}QCString Decoder::automaticDetectionForCyrillic( const unsigned char* ptr, int size, AutoDetectLanguage _language ){    QCString charset = QCString();    for ( int i = 0; i < size; ++i ) {        if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F ) {            if ( ptr[ i ] == 0x98 ) {                if ( _language == Russian )                    return "koi8-r";                else if ( _language == Ukrainian )                    return "koi8-u";            }            if ( i + 1 > size )                return "cp1251";            else { // maybe koi8-r or koi8-u ?                charset = "cp1251";                continue;            }        }        else {            if ( i + 1 > size )                return "iso-8859-5";            else {  // maybe koi8-r (koi8-u) or cp1251 ?                if ( charset.isNull() )                    charset = "iso-8859-5";                continue;            }        }    }    if ( charset.isNull() )        charset = "iso-8859-5";    return charset.data();}QCString Decoder::automaticDetectionForGreek( const unsigned char* ptr, int size ){    for ( int i = 0; i < size; ++i ) {        if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x87 ) || ptr[ i ] == 0x89 || ptr[ i ] == 0x8B             || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x97 ) || ptr[ i ] == 0x99 || ptr[ i ] == 0x9B || ptr[ i ] == 0xA4             || ptr[ i ] == 0xA5 || ptr[ i ] == 0xAE ) {            return "cp1253";        }    }    return "iso-8859-7";}QCString Decoder::automaticDetectionForHebrew( const unsigned char* ptr, int size ){    for ( int i = 0; i < size; ++i ) {        if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x89 ) || ptr[ i ] == 0x8B             || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x99 ) || ptr[ i ] == 0x9B || ptr[ i ] == 0xA1 || ( ptr[ i ] >= 0xBF && ptr[ i ] <= 0xC9 )             || ( ptr[ i ] >= 0xCB && ptr[ i ] <= 0xD8 ) ) {            return "cp1255";        }        if ( ptr[ i ] == 0xDF )            return "iso-8859-8-i";    }    return "iso-8859-8-i";}QCString Decoder::automaticDetectionForJapanese( const unsigned char* ptr, int size ){    if (!kc)        kc = new JapaneseCode();    switch ( kc->guess_jp( (const char*)ptr, size ) ) {    case JapaneseCode::JIS:        return "jis7";    case JapaneseCode::EUC:        return "eucjp";    case JapaneseCode::SJIS:        return "sjis";     case JapaneseCode::UTF8:        return "utf8";    default:        break;    }    return "";}QCString Decoder::automaticDetectionForTurkish( const unsigned char* ptr, int size ){    for ( int i = 0; i < size; ++i ) {        if ( ptr[ i ] == 0x80 || ( ptr[ i ] >= 0x82 && ptr[ i ] <= 0x8C ) || ( ptr[ i ] >= 0x91 && ptr[ i ] <= 0x9C ) || ptr[ i ] == 0x9F ) {            return "cp1254";        }    }    return "iso-8859-9";}QCString Decoder::automaticDetectionForWesternEuropean( const unsigned char* ptr, int size ){    for ( int i = 0; i < size; ++i ) {        if ( ptr[ i ] >= 0x80 && ptr[ i ] <= 0x9F )            return "cp1252";    }    return "iso-8859-1"; //"iso-8859-15"; Which better at default ?}// -----------------------------------------------------------------------------#undef DECODE_DEBUG
上一页 12
💿 文件大小 3938 K
👤 上传用户 zaindyclg
📂 所属分类嵌入式/单片机编程
🏷️ 相关标签

#konqueror3 #embedded #KDE #版本
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -