📄 decoder.cpp
字号:
/* This file is part of the KDE libraries Copyright (C) 1999 Lars Knoll (knoll@kde.org) Copyright (C) 2003 Dirk Mueller (mueller@kde.org) Copyright (C) 2003 Apple Computer, Inc. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.*///----------------------------------------------------------------------------//// KDE HTML Widget -- decoder for input stream#undef DECODE_DEBUG//#define DECODE_DEBUG#include <assert.h>#include "decoder.h"#include "guess_ja.h"using namespace khtml;#include "htmlhashes.h"#include <qregexp.h>#include <qtextcodec.h>#include <kglobal.h>#include <kcharsets.h>#include <ctype.h>#include <kdebug.h>#include <klocale.h>Decoder::Decoder(){ // latin1 m_codec = QTextCodec::codecForMib(4); m_decoder = m_codec->makeDecoder(); enc = 0; m_type = DefaultEncoding; body = false; beginning = true; visualRTL = false; m_autoDetectLanguage = SemiautomaticDetection; kc = NULL;}Decoder::~Decoder(){ delete m_decoder; if (kc) delete kc;}void Decoder::setEncoding(const char *_encoding, EncodingType type){#ifdef DECODE_DEBUG kdDebug(6005) << "setEncoding " << _encoding << " " << type << endl;#endif enc = _encoding;#ifdef DECODE_DEBUG kdDebug(6005) << "old encoding is:" << m_codec->name() << endl;#endif enc = enc.lower();#ifdef DECODE_DEBUG kdDebug(6005) << "requesting:" << enc << endl;#endif if(enc.isNull() || enc.isEmpty()) return;#ifdef APPLE_CHANGES QTextCodec *codec = (type == EncodingFromMetaTag || type == EncodingFromXMLHeader) ? QTextCodec::codecForNameEightBitOnly(enc) : QTextCodec::codecForName(enc); if (codec) { enc = codec->name(); visualRTL = codec->usesVisualOrdering(); }#else if(enc == "visual") // hebrew visually ordered enc = "iso8859-8"; bool b; QTextCodec *codec = KGlobal::charsets()->codecForName(enc, b); if (!b) codec = 0; if (type == EncodingFromMetaTag || type == EncodingFromXMLHeader) { //Sometimes the codec specified is absurd, i.e. UTF-16 despite //us decoding a meta tag as ASCII. In that case, ignore it. if (codec && (codec->mibEnum() == 1000)) //UTF16 or similar. codec = 0; } if (codec && codec->mibEnum() == 11) { //We do NOT want to use Qt's QHebrewCodec, since it tries to reorder itself. codec = QTextCodec::codecForName("iso8859-8-i"); // visually ordered unless one of the following if( !(enc == "iso-8859-8-i" || enc == "iso_8859-8-i" || enc == "csiso88598i" || enc == "logical") ) visualRTL = true; }#endif if( codec ) { // in case the codec didn't exist, we keep the old one (fixes some sites specifying invalid codecs) m_codec = codec; m_type = type; delete m_decoder; m_decoder = m_codec->makeDecoder(); }#ifdef DECODE_DEBUG kdDebug(6005) << "Decoder::encoding used is" << m_codec->name() << endl;#endif}const char *Decoder::encoding() const{ return enc;}// Other browsers allow comments in the head section, so we need to also.// It's important not to look for tags inside the comments.static void skipComment(const char *&ptr, const char *pEnd){ const char *p = ptr; // Allow <!-->; other browsers do. if (*p == '>') { p++; } else { while (p != pEnd) { if (*p == '-') { // This is the real end of comment, "-->". if (p[1] == '-' && p[2] == '>') { p += 3; break; } // This is the incorrect end of comment that other browsers allow, "--!>". if (p[1] == '-' && p[2] == '!' && p[3] == '>') { p += 4; break; } } p++; } } ptr = p;}// Returns the position of the encoding string.static int findXMLEncoding(const QCString &str, int &encodingLength){ int len = str.length(); int pos = str.find("encoding"); if (pos == -1) return -1; pos += 8; // Skip spaces and stray control characters. while (pos < len && str[pos] <= ' ') ++pos; //Bail out if nothing after if (pos >= len) return -1; // Skip equals sign. if (str[pos] != '=') return -1; ++pos; // Skip spaces and stray control characters. while (pos < len && str[pos] <= ' ') ++pos; //Bail out if nothing after if (pos >= len) return -1; // Skip quotation mark. char quoteMark = str[pos]; if (quoteMark != '"' && quoteMark != '\'') return -1; ++pos; // Find the trailing quotation mark. int end = pos; while (end < len && str[end] != quoteMark) ++end; if (end >= len) return -1; encodingLength = end - pos; return pos;}QString Decoder::decode(const char *data, int len){ // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. int bufferLength = buffer.length(); const int maximumBOMLength = 10; if (beginning && bufferLength + len >= maximumBOMLength) { // If the user has chosen utf16 we still need to auto-detect the endianness if ((m_type != UserChosenEncoding) || (m_codec->mibEnum() == 1000)) { // Extract the first three bytes. // Handle the case where some of bytes are already in the buffer. const uchar *udata = (const uchar *)data; uchar c1 = bufferLength >= 1 ? (uchar)buffer[0] : *udata++; uchar c2 = bufferLength >= 2 ? (uchar)buffer[1] : *udata++; uchar c3 = bufferLength >= 3 ? (uchar)buffer[2] : *udata++; // Check for the BOM const char *autoDetectedEncoding; if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) { autoDetectedEncoding = "ISO-10646-UCS-2"; } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { autoDetectedEncoding = "UTF-8"; } else if (c1 == 0x00 || c2 == 0x00) { uchar c4 = bufferLength >= 4 ? (uchar)buffer[3] : *udata++; uchar c5 = bufferLength >= 5 ? (uchar)buffer[4] : *udata++; uchar c6 = bufferLength >= 6 ? (uchar)buffer[5] : *udata++; uchar c7 = bufferLength >= 7 ? (uchar)buffer[6] : *udata++; uchar c8 = bufferLength >= 8 ? (uchar)buffer[7] : *udata++; uchar c9 = bufferLength >= 9 ? (uchar)buffer[8] : *udata++; uchar c10 = bufferLength >= 10 ? (uchar)buffer[9] : *udata++; int nul_count_even = (c2 != 0) + (c4 != 0) + (c6 != 0) + (c8 != 0) + (c10 != 0); int nul_count_odd = (c1 != 0) + (c3 != 0) + (c5 != 0) + (c7 != 0) + (c9 != 0); if ((nul_count_even == 0 && nul_count_odd == 5) || (nul_count_even == 5 && nul_count_odd == 0)) autoDetectedEncoding = "ISO-10646-UCS-2"; else autoDetectedEncoding = 0; } else { autoDetectedEncoding = 0; } // If we found a BOM, use the encoding it implies. if (autoDetectedEncoding != 0) { m_type = AutoDetectedEncoding; m_codec = QTextCodec::codecForName(autoDetectedEncoding); assert(m_codec); enc = m_codec->name(); delete m_decoder; m_decoder = m_codec->makeDecoder(); if (m_codec->mibEnum() == 1000 && c2 == 0x00) { // utf16LE, we need to put the decoder in LE mode char reverseUtf16[3] = {0xFF, 0xFE, 0x00}; m_decoder->toUnicode(reverseUtf16, 2); } } } beginning = false; } // this is not completely efficient, since the function might go // through the html head several times... bool lookForMetaTag = m_type == DefaultEncoding && !body; if (lookForMetaTag) {#ifdef DECODE_DEBUG kdDebug(6005) << "looking for charset definition" << endl;#endif { // extra level of braces to keep indenting matching original for better diff'ing#ifdef APPLE_CHANGES buffer.append(data, len);#else if(m_codec->mibEnum() != 1000) { // utf16 // replace '\0' by spaces, for buggy pages char *d = const_cast<char *>(data); int i = len - 1; while(i >= 0) { if(d[i] == 0) d[i] = ' '; i--; } } buffer += QCString(data, len+1);#endif // we still don't have an encoding, and are in the head // the following tags are allowed in <head>: // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE int invalid = 0; // invalid head tag count#ifdef APPLE_CHANGES const char *ptr = buffer.latin1(); const char *pEnd = ptr + buffer.length();#else const char *ptr = buffer.data(); const char *pEnd = ptr + buffer.length();#endif while(ptr != pEnd) { if(*ptr == '<') { bool end = false; ptr++; // Handle comments. if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') { ptr += 3; skipComment(ptr, pEnd); continue; } // Handle XML header, which can have encoding in it. if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') { const char *end = ptr; while (*end != '>' && *end != '\0') end++; if (*end == '\0') break; QCString str(ptr, end - ptr + 1); //+1 as it must include the \0 terminator int len; int pos = findXMLEncoding(str, len); if (pos != -1) { setEncoding(str.mid(pos, len), EncodingFromXMLHeader); if (m_type == EncodingFromXMLHeader) goto found; } } if(*ptr == '/') ptr++, end=true; char tmp[20]; int len = 0; while ( ((*ptr >= 'a') && (*ptr <= 'z') || (*ptr >= 'A') && (*ptr <= 'Z') || (*ptr >= '0') && (*ptr <= '9')) && len < 19 ) { tmp[len] = tolower( *ptr ); ptr++; len++; } tmp[len] = 0; int id = khtml::getTagID(tmp, len); if(end) id += ID_CLOSE_TAG; switch( id ) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -