📄 decoder.cpp
字号:
/*
This file is part of the KDE libraries
Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
Copyright (C) 2003 Apple Computer, Inc.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; either
version 2 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public License
along with this library; see the file COPYING.LIB. If not, write to
the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA 02111-1307, USA.
*/
//----------------------------------------------------------------------------
//
// KDE HTML Widget -- decoder for input stream
#undef DECODE_DEBUG
//#define DECODE_DEBUG
#include "decoder.h"
using namespace khtml;
#include "htmlhashes.h"
#include <qregexp.h>
#include <qtextcodec.h>
#include <kglobal.h>
#include <kcharsets.h>
#include <ctype.h>
#include <kdebug.h>
#include <klocale.h>
/*
 * Static helpers for guessing which Japanese encoding a byte buffer uses.
 * All members are static; the class only serves as a namespace.
 * OOM_MODIFIED is a macro supplied by the surrounding project.
 */
class KanjiCode
OOM_MODIFIED
{
public:
    /* Encodings that judge() can report. */
    enum Type {ASCII, JIS, EUC, SJIS, UNICODE, UTF8 };

    /* Inspect `length` bytes starting at `str` and guess their encoding. */
    static enum Type judge(const char *str, int length);

    static const int ESC;
    static const int _SS2_;

    /* Per-byte classification flags, looked up by ISkanji() and ISkana(). */
    static const unsigned char kanji_map_sjis[];

    /*
     * Non-zero when the table entry for `code` has flag value 1 set.
     * Any code of 0x100 or above yields 0; smaller codes are masked to
     * their low eight bits before the lookup.
     */
    static int ISkanji(int code)
    {
        return (code < 0x100) ? (kanji_map_sjis[code & 0xff] & 1) : 0;
    }

    /*
     * Non-zero when the table entry for `code` has flag value 2 set.
     * Any code of 0x100 or above yields 0; smaller codes are masked to
     * their low eight bits before the lookup.
     */
    static int ISkana(int code)
    {
        return (code < 0x100) ? (kanji_map_sjis[code & 0xff] & 2) : 0;
    }
};
/* Escape byte that introduces the three-byte sequences checked in judge(). */
const int KanjiCode::ESC = 0x1b;
/* Same value as the 0x8e byte tested in judge(); see the EUC-JP (SS2) note below. */
const int KanjiCode::_SS2_ = 0x8e;
/*
 * Per-byte classification table with 256 entries, indexed by byte value.
 * Flag value 1 is tested by ISkanji(), flag value 2 by ISkana().
 * Entries 0x81-0x9f and 0xe0-0xfc hold 1, entries 0xa1-0xdf hold 2,
 * and every other entry is 0. Each row below covers 16 byte values.
 */
const unsigned char KanjiCode::kanji_map_sjis[] =
{
/* 0x00 - 0x7f: no flags */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* 0x80 - 0x9f: flag 1 except 0x80 */
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 0xa0 - 0xdf: flag 2 except 0xa0 */
0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
/* 0xe0 - 0xff: flag 1 except 0xfd - 0xff */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};
/*
* EUC-JP is
* [0xa1 - 0xfe][0xa1 - 0xfe]
* 0x8e[0xa1 - 0xfe](SS2)
* 0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
*
* Shift_Jis is
* [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
*
* Shift_Jis Hankaku Kana is
* [0xa1 - 0xdf]
*/
/*
* KanjiCode::judge() is based on judge_jcode() from jvim
* http://hp.vector.co.jp/authors/VA003457/vim/
*
* Special Thanks to Kenichi Tsuchida
*/
/*
* QTextCodec::heuristicContentMatch() would be the obvious alternative,
* but it fails to detect the encoding, so it is not useful here.
*/
/*
 * Guess which Japanese encoding a byte buffer uses.
 *
 * str  - bytes to inspect; they are examined as unsigned char values.
 * size - number of valid bytes in str. No byte at or beyond str[size]
 *        is read, so the buffer does not have to be NUL-terminated.
 *
 * Returns JIS, SJIS or EUC as soon as a decisive byte sequence is found.
 * While scanning, weaker hints are accumulated in two scores (sjis / euc).
 * If the scan ends with the result still ASCII, the higher score decides
 * between SJIS and EUC; when the scores tie, ASCII is returned.
 * UNICODE and UTF8 are never returned by this function.
 */
enum KanjiCode::Type KanjiCode::judge(const char *str, int size)
{
    const unsigned char *ptr = reinterpret_cast<const unsigned char *>(str);
    enum Type code = ASCII;
    bool bfr = false;   /* Kana Moji: previous byte was an ambiguous high byte */
    int bfk = 0;        /* EUC Kana: count of consecutive 0x8e + kana pairs */
    int sjis = 0;       /* accumulated evidence for Shift_JIS */
    int euc = 0;        /* accumulated evidence for EUC-JP */
    int i = 0;

    while (i < size) {
        if (ptr[i] == ESC && (size - i >= 3)) {
            /* Three-byte escape sequence: the two bytes after ESC are available. */
            if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
                || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
                return JIS;
            } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
                || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
                return JIS;
            } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
                /* Remember JIS but keep scanning past the sequence. */
                code = JIS;
                i += 3;
            } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
                code = JIS;
                i += 3;
            } else {
                /* Unrecognised escape: skip only the ESC byte. */
                i++;
            }
            bfr = false;
            bfk = 0;
        } else {
            if (ptr[i] < 0x20) {
                bfr = false;
                bfk = 0;
                /* Check for kudokuten (punctuation) or hiragana in the two
                 * bytes that immediately precede this control character. */
                if ((i >= 2) && (ptr[i - 2] == 0x81)
                    && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
                    code = SJIS;
                    sjis += 100;        /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
                    && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
                    code = EUC;
                    euc += 100;         /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
                    sjis += 40;         /* hiragana */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
                    euc += 40;          /* hiragana */
                }
            } else {
                /* Check for a hiragana or katakana pair starting at this byte. */
                if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
                    sjis++;             /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0x83)
                    && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
                    sjis++;             /* katakana */
                } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
                    euc++;              /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
                    euc++;              /* katakana */
                }

                if (bfr) {
                    /* The previous byte was ambiguous; judge it together with this one. */
                    if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
                        return SJIS;
                    } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f)
                        && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
                        return SJIS;
                    } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe)
                        && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
                        return EUC;
                    } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe)
                        && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
                        return EUC;
                    } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
                        return SJIS;
                    } else if (ptr[i] <= 0x7f) {
                        return SJIS;
                    } else {
                        if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
                            euc++;      /* sjis hankaku kana kigo */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
                            ;           /* sjis hankaku kana */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
                            euc++;
                        } else if (0x8e == ptr[i]) {
                            euc++;
                        } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
                            sjis++;
                        }
                        bfr = false;
                        bfk = 0;
                    }
                } else if (0x8e == ptr[i]) {
                    if (size - i <= 1) {
                        ;               /* 0x8e is the last byte: nothing to pair it with */
                    } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
                        /* EUC KANA or SJIS KANJI */
                        if (bfk == 1) {
                            euc += 100;
                        }
                        bfk++;
                        i++;            /* consume the second byte of the pair */
                    } else {
                        /* SJIS only */
                        return SJIS;
                    }
                } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
                    /* SJIS only */
                    code = SJIS;
                    /* Look at the next byte only when one exists; a lead byte
                     * at the very end of the buffer has nothing to look at. */
                    if ((size - i > 1)
                        && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
                            || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
                        return code;
                    }
                } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
                    /* EUC only */
                    code = EUC;
                    /* Same end-of-buffer guard as for the SJIS lead byte above. */
                    if ((size - i > 1)
                        && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
                        return code;
                    }
                } else if (ptr[i] <= 0x7f) {
                    ;                   /* plain 7-bit byte: no evidence either way */
                } else {
                    /* Ambiguous high byte: decide when the next byte is seen. */
                    bfr = true;
                    bfk = 0;
                }
            }
            i++;
        }
    }

    /* Nothing decisive was seen: fall back to the accumulated scores. */
    if (code == ASCII) {
        if (sjis > euc) {
            code = SJIS;
        } else if (sjis < euc) {
            code = EUC;
        }
    }
    return code;
}
// Builds a decoder in its initial state: one reference held, no explicit
// encoding name, and a Latin-1 codec with a decoder obtained from it.
Decoder::Decoder()
{
    _refCount = 1;

    // No encoding has been requested yet.
    enc = 0;
    m_type = DefaultEncoding;

    // Scanning state.
    beginning = true;
    body = false;
    visualRTL = false;

    // latin1; the decoder must be created after the codec it comes from.
    m_codec = QTextCodec::codecForName("iso8859-1");
    m_decoder = m_codec->makeDecoder();
}
// Destroys the decoder. The reference count is asserted to be zero at this
// point, and the current codec is deleted.
// NOTE(review): m_decoder (obtained from m_codec->makeDecoder()) is not
// released here -- confirm whether the codec owns it; otherwise it leaks.
Decoder::~Decoder()
{
assert(_refCount == 0);
delete m_codec;
}
// Switches the decoder to the named encoding.
//
// _encoding - encoding name; it is stored in `enc` and lower-cased before
//             the codec lookup. A null or empty name leaves the codec
//             untouched (but `enc` has already been overwritten).
// type      - where the encoding request came from; stored in m_type only
//             when a matching codec was found.
//
// If no codec exists for the name, the previously installed codec and
// decoder stay in use.
void Decoder::setEncoding(const char *_encoding, EncodingType type)
{
#ifdef DECODE_DEBUG
    // Log the request together with its source type.
    kdDebug(6005) << "setEncoding " << _encoding << " " << static_cast<int>(type) << endl;
#endif
    enc = _encoding;
#ifdef DECODE_DEBUG
    kdDebug(6005) << "old encoding is:" << m_codec->name() << endl;
#endif
    enc = enc.lower();
#ifdef DECODE_DEBUG
    kdDebug(6005) << "requesting:" << enc << endl;
#endif
    if(enc.isNull() || enc.isEmpty())
        return;
#if APPLE_CHANGES
    // Encodings announced by a meta tag or XML header use the
    // eight-bit-only lookup; every other source uses the general lookup.
    QTextCodec *codec = (type == EncodingFromMetaTag || type == EncodingFromXMLHeader)
        ? QTextCodec::codecForNameEightBitOnly(enc)
        : QTextCodec::codecForName(enc);
    if (codec) {
        // Replace the requested name with the codec's own name.
        enc = codec->name();
        visualRTL = codec->usesVisualOrdering();
    }
#else
    if(enc == "visual") // hebrew visually ordered
        enc = "iso8859-8";
    bool b;
    QTextCodec *codec = KGlobal::charsets()->codecForName(enc, b);
    if (!b)
        codec = 0;
    if (codec && codec->mibEnum() == 11) {
        // visually ordered unless one of the following
        if( !(enc == "iso-8859-8-i" || enc == "iso_8859-8-i"
                || enc == "csiso88598i" || enc == "logical") )
            visualRTL = true;
    }
#endif
    if( codec ) { // in case the codec didn't exist, we keep the old one (fixes some sites specifying invalid codecs)
        delete m_codec;
        m_codec = codec;
        m_type = type;
        // NOTE(review): the previous m_decoder is overwritten without being
        // released -- confirm who owns decoders returned by makeDecoder().
        m_decoder = m_codec->makeDecoder();
    }
#ifdef DECODE_DEBUG
    kdDebug(6005) << "Decoder::encoding used is" << m_codec->name() << endl;
#endif
}
// Returns the encoding name currently stored in `enc`.
// NOTE(review): the pointer presumably refers to storage held by `enc`,
// so it may stop being valid after the next setEncoding() call -- confirm.
const char *Decoder::encoding() const
{
    const char *currentName = enc;
    return currentName;
}
// Other browsers allow comments in the head section, so we need to also.
// It's important not to look for tags inside the comments.
static void skipComment(const char *&ptr, const char *pEnd)
{
const char *p = ptr;
// Allow <!-->; other browsers do.
if (*p == '>') {
p++;
} else {
while (p != pEnd) {
if (*p == '-') {
// This is the real end of comment, "-->".
if (p[1] == '-' && p[2] == '>') {
p += 3;
break;
}
// This is the incorrect end of comment that other browsers allow, "--!>".
if (p[1] == '-' && p[2] == '!' && p[3] == '>') {
p += 4;
break;
}
}
p++;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -