decoder.cpp

来自「konqueror3 embedded版本, KDE环境下的当家浏览器的嵌入式版」· C++ 代码 · 共 714 行 · 第 1/2 页
CPP
714 行
/*    This file is part of the KDE libraries    Copyright (C) 1999 Lars Knoll (knoll@kde.org)    Copyright (C) 2003 Dirk Mueller (mueller@kde.org)    Copyright (C) 2003 Apple Computer, Inc.    This library is free software; you can redistribute it and/or    modify it under the terms of the GNU Library General Public    License as published by the Free Software Foundation; either    version 2 of the License, or (at your option) any later version.    This library is distributed in the hope that it will be useful,    but WITHOUT ANY WARRANTY; without even the implied warranty of    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    Library General Public License for more details.    You should have received a copy of the GNU Library General Public License    along with this library; see the file COPYING.LIB.  If not, write to    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,    Boston, MA 02110-1301, USA.*///----------------------------------------------------------------------------//// KDE HTML Widget -- decoder for input stream#undef DECODE_DEBUG//#define DECODE_DEBUG#include <assert.h>#include "decoder.h"#include "guess_ja.h"using namespace khtml;#include "htmlhashes.h"#include <qregexp.h>#include <qtextcodec.h>#include <kglobal.h>#include <kcharsets.h>#include <ctype.h>#include <kdebug.h>#include <klocale.h>Decoder::Decoder(){    // latin1    m_codec = QTextCodec::codecForMib(4);    m_decoder = m_codec->makeDecoder();    enc = 0;    m_type = DefaultEncoding;    body = false;    beginning = true;    visualRTL = false;    m_autoDetectLanguage = SemiautomaticDetection;    kc = NULL;}Decoder::~Decoder(){    delete m_decoder;    if (kc)        delete kc;}void Decoder::setEncoding(const char *_encoding, EncodingType type){#ifdef DECODE_DEBUG    kdDebug(6005) << "setEncoding " << _encoding << " " << type << endl;#endif    enc = _encoding;#ifdef DECODE_DEBUG    kdDebug(6005) << "old encoding is:" << m_codec->name() << endl;#endif   
 enc = enc.lower();#ifdef DECODE_DEBUG    kdDebug(6005) << "requesting:" << enc << endl;#endif    if(enc.isNull() || enc.isEmpty())        return;#ifdef APPLE_CHANGES    QTextCodec *codec = (type == EncodingFromMetaTag || type == EncodingFromXMLHeader)        ? QTextCodec::codecForNameEightBitOnly(enc)        : QTextCodec::codecForName(enc);    if (codec) {        enc = codec->name();        visualRTL = codec->usesVisualOrdering();    }#else    if(enc == "visual") // hebrew visually ordered        enc = "iso8859-8";    bool b;    QTextCodec *codec = KGlobal::charsets()->codecForName(enc, b);    if (!b)        codec = 0;    if (type == EncodingFromMetaTag || type  == EncodingFromXMLHeader) {        //Sometimes the codec specified is absurd, i.e. UTF-16 despite        //us decoding a meta tag as ASCII. In that case, ignore it.        if (codec &&            (codec->mibEnum() == 1000)) //UTF16 or similar.                codec = 0;    }    if (codec && codec->mibEnum() == 11)  {        //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself.        codec = QTextCodec::codecForName("iso8859-8-i");	        // visually ordered unless one of the following        if( !(enc == "iso-8859-8-i" || enc == "iso_8859-8-i"                || enc == "csiso88598i" || enc == "logical") )        visualRTL = true;    }#endif    if( codec ) { // in case the codec didn't exist, we keep the old one (fixes some sites specifying invalid codecs)        m_codec = codec;        m_type = type;        delete m_decoder;        m_decoder = m_codec->makeDecoder();    }#ifdef DECODE_DEBUG    kdDebug(6005) << "Decoder::encoding used is" << m_codec->name() << endl;#endif}const char *Decoder::encoding() const{    return enc;}// Other browsers allow comments in the head section, so we need to also.// It's important not to look for tags inside the comments.static void skipComment(const char *&ptr, const char *pEnd){    const char *p = ptr;    // Allow <!-->; other browsers do.    
if (*p == '>') {        p++;    } else {        while (p != pEnd) {            if (*p == '-') {                // This is the real end of comment, "-->".                if (p[1] == '-' && p[2] == '>') {                    p += 3;                    break;                }                // This is the incorrect end of comment that other browsers allow, "--!>".                if (p[1] == '-' && p[2] == '!' && p[3] == '>') {                    p += 4;                    break;                }            }            p++;        }    }    ptr = p;}// Returns the position of the encoding string.static int findXMLEncoding(const QCString &str, int &encodingLength){    int len = str.length();    int pos = str.find("encoding");    if (pos == -1)        return -1;    pos += 8;    // Skip spaces and stray control characters.    while (pos < len && str[pos] <= ' ')        ++pos;    //Bail out if nothing after    if (pos >= len)        return -1;    // Skip equals sign.    if (str[pos] != '=')        return -1;    ++pos;    // Skip spaces and stray control characters.    while (pos < len && str[pos] <= ' ')        ++pos;    //Bail out if nothing after    if (pos >= len)        return -1;    // Skip quotation mark.    char quoteMark = str[pos];    if (quoteMark != '"' && quoteMark != '\'')        return -1;    ++pos;    // Find the trailing quotation mark.    int end = pos;    while (end < len && str[end] != quoteMark)        ++end;    if (end >= len)        return -1;    encodingLength = end - pos;    return pos;}QString Decoder::decode(const char *data, int len){    // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.    
int bufferLength = buffer.length();    const int maximumBOMLength = 10;    if (beginning && bufferLength + len >= maximumBOMLength) {        // If the user has chosen utf16 we still need to auto-detect the endianness        if ((m_type != UserChosenEncoding) || (m_codec->mibEnum() == 1000)) {            // Extract the first three bytes.            // Handle the case where some of bytes are already in the buffer.            const uchar *udata = (const uchar *)data;            uchar c1 = bufferLength >= 1 ? (uchar)buffer[0] : *udata++;            uchar c2 = bufferLength >= 2 ? (uchar)buffer[1] : *udata++;            uchar c3 = bufferLength >= 3 ? (uchar)buffer[2] : *udata++;            // Check for the BOM            const char *autoDetectedEncoding;            if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {                autoDetectedEncoding = "ISO-10646-UCS-2";            } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {                autoDetectedEncoding = "UTF-8";            } else if (c1 == 0x00 || c2 == 0x00) {                uchar c4 = bufferLength >= 4 ? (uchar)buffer[3] : *udata++;                uchar c5 = bufferLength >= 5 ? (uchar)buffer[4] : *udata++;                uchar c6 = bufferLength >= 6 ? (uchar)buffer[5] : *udata++;                uchar c7 = bufferLength >= 7 ? (uchar)buffer[6] : *udata++;                uchar c8 = bufferLength >= 8 ? (uchar)buffer[7] : *udata++;                uchar c9 = bufferLength >= 9 ? (uchar)buffer[8] : *udata++;                uchar c10 = bufferLength >= 10 ? 
(uchar)buffer[9] : *udata++;                int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0);                int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0);                if ((nul_count_even == 0 && nul_count_odd == 5) ||                    (nul_count_even == 5 && nul_count_odd == 0))                    autoDetectedEncoding = "ISO-10646-UCS-2";                else                    autoDetectedEncoding = 0;            } else {                autoDetectedEncoding = 0;            }            // If we found a BOM, use the encoding it implies.            if (autoDetectedEncoding != 0) {                m_type = AutoDetectedEncoding;                m_codec = QTextCodec::codecForName(autoDetectedEncoding);                assert(m_codec);                enc = m_codec->name();                delete m_decoder;                m_decoder = m_codec->makeDecoder();                if (m_codec->mibEnum() == 1000 && c2 == 0x00)                {                  // utf16LE, we need to put the decoder in LE mode                  char reverseUtf16[3] = {0xFF, 0xFE, 0x00};                  m_decoder->toUnicode(reverseUtf16, 2);                }            }        }        beginning = false;    }    // this is not completely efficient, since the function might go    // through the html head several times...    
bool lookForMetaTag = m_type == DefaultEncoding && !body;    if (lookForMetaTag) {#ifdef DECODE_DEBUG        kdDebug(6005) << "looking for charset definition" << endl;#endif        { // extra level of braces to keep indenting matching original for better diff'ing#ifdef APPLE_CHANGES            buffer.append(data, len);#else            if(m_codec->mibEnum() != 1000) {  // utf16                // replace '\0' by spaces, for buggy pages                char *d = const_cast<char *>(data);                int i = len - 1;                while(i >= 0) {                    if(d[i] == 0) d[i] = ' ';                    i--;                }            }            buffer += QCString(data, len+1);#endif            // we still don't have an encoding, and are in the head            // the following tags are allowed in <head>:            // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE            int invalid = 0; // invalid head tag count#ifdef APPLE_CHANGES            const char *ptr = buffer.latin1();            const char *pEnd = ptr + buffer.length();#else            const char *ptr = buffer.data();            const char *pEnd = ptr + buffer.length();#endif            while(ptr != pEnd)            {                if(*ptr == '<') {                    bool end = false;                    ptr++;                    // Handle comments.                    if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {                        ptr += 3;                        skipComment(ptr, pEnd);                        continue;                    }                    // Handle XML header, which can have encoding in it.                    if (ptr[0] == '?' 
&& ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {                        const char *end = ptr;                        while (*end != '>' && *end != '\0') end++;                        if (*end == '\0')                            break;                        QCString str(ptr, end - ptr + 1); //+1 as it must include the \0 terminator                        int len;                        int pos = findXMLEncoding(str, len);                        if (pos != -1) {                            setEncoding(str.mid(pos, len), EncodingFromXMLHeader);                            if (m_type == EncodingFromXMLHeader)                                goto found;                        }                    }                    if(*ptr == '/') ptr++, end=true;                    char tmp[20];                    int len = 0;                    while (                        ((*ptr >= 'a') && (*ptr <= 'z') ||                         (*ptr >= 'A') && (*ptr <= 'Z') ||                         (*ptr >= '0') && (*ptr <= '9'))                        && len < 19 )                    {                        tmp[len] = tolower( *ptr );                        ptr++;                        len++;                    }		    tmp[len] = 0;                    int id = khtml::getTagID(tmp, len);                    if(end) id += ID_CLOSE_TAG;                    switch( id ) {
decoder.cpp - 源码说明

本页面展示了「konqueror3 embedded版本，KDE环境下的当家浏览器的嵌入式版本源码包」中的 decoder.cpp 源码文件，采用 C++ 编程语言编写，共 714 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与konqueror3相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?