📄 decoder.cpp
字号:
}
}
ptr = p;
}
// Locates the value of the encoding="..." pseudo-attribute inside the text
// of an XML declaration.
//
// str            - the text to search.
// encodingLength - out: number of characters in the encoding name. It is
//                  written only when the function succeeds.
//
// Returns the index in str of the first character of the encoding name, or
// -1 if "encoding" is not present, is not followed by '=', or its value is
// not enclosed in a matching pair of single or double quotes.
static int findXMLEncoding(const QCString &str, int &encodingLength)
{
    int len = str.length();

    int pos = str.find("encoding");
    if (pos == -1)
        return -1;
    pos += 8; // length of "encoding"

    // Skip spaces and stray control characters.
    // The index is tested before the character so we never look past len.
    while (pos != len && str[pos] <= ' ')
        ++pos;

    // Skip equals sign.
    if (pos == len || str[pos] != '=')
        return -1;
    ++pos;

    // Skip spaces and stray control characters.
    while (pos != len && str[pos] <= ' ')
        ++pos;

    // Skip quotation mark.
    if (pos == len)
        return -1;
    char quoteMark = str[pos];
    if (quoteMark != '"' && quoteMark != '\'')
        return -1;
    ++pos;

    // Find the trailing quotation mark. The scan is bounded by len so that
    // an unterminated value is reported as "not found" instead of running
    // off the end of the string.
    int end = pos;
    while (end != len && str[end] != quoteMark)
        ++end;
    if (end == len)
        return -1;

    encodingLength = end - pos;
    return pos;
}
// Decodes one chunk of raw document bytes into Unicode.
//
// data - pointer to the incoming bytes.
// len  - number of bytes at data.
//
// The function works in stages:
//   1. On the first call that makes at least three bytes available (counting
//      bytes already held in `buffer`), it checks for a UTF-16 or UTF-8 BOM
//      and, unless the encoding was chosen by the user, switches to the
//      encoding the BOM implies.
//   2. While the encoding is still the default one and no body-level tag has
//      been seen, the data is appended to `buffer` and the buffered head is
//      scanned for an XML declaration encoding or a <meta ... charset=...>.
//      If the scan reaches the end of the buffer without a decision, the
//      function returns QString::null and keeps the bytes buffered.
//   3. Optionally runs Japanese encoding auto-detection on this chunk.
//   4. Falls back to "iso8859-1" if there is still no codec.
//   5. Decodes the buffered bytes (if any) and/or this chunk and returns the
//      resulting string.
QString Decoder::decode(const char *data, int len)
{
    // Check for UTF-16 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
    int bufferLength = buffer.length();
    const int maximumBOMLength = 3;
    if (beginning && bufferLength + len >= maximumBOMLength) {
        if (m_type != UserChosenEncoding) {
            // Extract the first three bytes.
            // Handle the case where some of bytes are already in the buffer.
            // The last byte is always guaranteed to not be in the buffer.
            const uchar *udata = (const uchar *)data;
            uchar c1 = bufferLength >= 1 ? (uchar)buffer[0] : *udata++;
            uchar c2 = bufferLength >= 2 ? (uchar)buffer[1] : *udata++;
            assert(bufferLength < 3);
            uchar c3 = *udata;

            // Check for the BOM.
            const char *autoDetectedEncoding;
            if ((c1 == 0xFE && c2 == 0xFF) || (c1 == 0xFF && c2 == 0xFE)) {
                autoDetectedEncoding = "ISO-10646-UCS-2";
            } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
                autoDetectedEncoding = "UTF-8";
            } else {
                autoDetectedEncoding = 0;
            }

            // If we found a BOM, use the encoding it implies.
            if (autoDetectedEncoding != 0) {
                m_type = AutoDetectedEncoding;
                m_codec = QTextCodec::codecForName(autoDetectedEncoding);
                assert(m_codec);
                enc = m_codec->name();
                // The codec returned by codecForName() is not ours to delete;
                // what must be released is the decoder we are about to replace.
                delete m_decoder;
                m_decoder = m_codec->makeDecoder();
            }
        }
        beginning = false;
    }

    // this is not completely efficient, since the function might go
    // through the html head several times...
    bool lookForMetaTag = m_type == DefaultEncoding && !body;

    if (lookForMetaTag) {
#ifdef DECODE_DEBUG
        kdDebug(6005) << "looking for charset definition" << endl;
#endif
        { // extra level of braces to keep indenting matching original for better diff'ing
#if APPLE_CHANGES
            buffer.append(data, len);
#else
            if(m_codec->mibEnum() != 1000) // utf16
            {
                // ### hack for a bug in QTextCodec. It cut's the input stream
                // in case there are \0 in it. ZDNET has them inside... :-(
                char *d = const_cast<char *>(data);
                int i = len - 1;
                while(i >= 0) {
                    if(*(d+i) == 0) *(d+i) = ' ';
                    i--;
                }
            }
            buffer += QCString(data, len+1);
#endif
            // we still don't have an encoding, and are in the head
            // the following tags are allowed in <head>:
            // SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE

#if APPLE_CHANGES
            const char *ptr = buffer.latin1();
            const char *pEnd = ptr + buffer.length();
#else
            const char *ptr = buffer.data();
            // One past the last buffered byte, same as in the branch above.
            const char *pEnd = ptr + buffer.length();
#endif
            while(ptr != pEnd)
            {
                if(*ptr == '<') {
                    bool isCloseTag = false;
                    ptr++;

                    // Handle comments.
                    if (ptr[0] == '!' && ptr[1] == '-' && ptr[2] == '-') {
                        ptr += 3;
                        skipComment(ptr, pEnd);
                        continue;
                    }

                    // Handle XML header, which can have encoding in it.
                    if (ptr[0] == '?' && ptr[1] == 'x' && ptr[2] == 'm' && ptr[3] == 'l') {
                        const char *declEnd = ptr;
                        while (*declEnd != '>' && *declEnd != '\0') declEnd++;
                        // Declaration is not complete yet; stop scanning this round.
                        if (*declEnd == '\0')
                            break;
                        QCString str(ptr, declEnd - ptr);
                        int encodingLength;
                        int pos = findXMLEncoding(str, encodingLength);
                        if (pos != -1) {
                            setEncoding(str.mid(pos, encodingLength), EncodingFromXMLHeader);
                            // m_type is re-checked here: scanning stops only if it
                            // now reads EncodingFromXMLHeader.
                            if (m_type == EncodingFromXMLHeader)
                                goto found;
                        }
                    }

                    if(*ptr == '/') {
                        ptr++;
                        isCloseTag = true;
                    }

                    // Collect the lower-cased alphanumeric tag name (at most 19 chars).
                    char tmp[20];
                    int tagLength = 0;
                    while (
                        ((*ptr >= 'a') && (*ptr <= 'z') ||
                         (*ptr >= 'A') && (*ptr <= 'Z') ||
                         (*ptr >= '0') && (*ptr <= '9'))
                        && tagLength < 19 )
                    {
                        tmp[tagLength] = tolower( *ptr );
                        ptr++;
                        tagLength++;
                    }
                    tmp[tagLength] = 0;
                    int id = khtml::getTagID(tmp, tagLength);
                    if(isCloseTag) id += ID_CLOSE_TAG;

                    switch( id ) {
                    case ID_META:
                    {
                        // found a meta tag...
                        //ptr += 5;
                        const char * tagEnd = ptr;
                        while(*tagEnd != '>' && *tagEnd != '\0') tagEnd++;
                        if ( *tagEnd == '\0' ) break;
                        QCString str( ptr, (tagEnd-ptr)+1);
                        str = str.lower();
                        int pos = 0;
                        //if( (pos = str.find("http-equiv", pos)) == -1) break;
                        //if( (pos = str.find("content-type", pos)) == -1) break;
                        while( pos < ( int ) str.length() ) {
                            if( (pos = str.find("charset", pos, false)) == -1) break;
                            pos += 7;
                            // skip whitespace..
                            while( pos < (int)str.length() && str[pos] <= ' ' ) pos++;
                            if ( pos == ( int )str.length()) break;
                            if ( str[pos++] != '=' ) continue;
                            // Skip whitespace, extra '=' and opening quotes. The
                            // bounds check guards every comparison in the group.
                            while ( pos < ( int )str.length() &&
                                    ( str[pos] <= ' ' || str[pos] == '=' || str[pos] == '"' || str[pos] == '\'' ) )
                                pos++;

                            // end ?
                            if ( pos == ( int )str.length() ) break;
                            uint endpos = pos;
                            while( endpos < str.length() &&
                                   (str[endpos] != ' ' && str[endpos] != '"' && str[endpos] != '\''
                                    && str[endpos] != ';' && str[endpos] != '>') )
                                endpos++;
#ifdef DECODE_DEBUG
                            kdDebug( 6005 ) << "Decoder: found charset: " << str.mid(pos, endpos-pos) << endl;
#endif
                            setEncoding(str.mid(pos, endpos-pos), EncodingFromMetaTag);
                            if( m_type == EncodingFromMetaTag ) goto found;

                            if ( endpos >= str.length() || str[endpos] == '/' || str[endpos] == '>' ) break;

                            pos = endpos + 1;
                        }
                    }
                    // Intentional fall-through: META shares the plain "break"
                    // of the other tags that are allowed before the body.
                    case (ID_META+ID_CLOSE_TAG):
                    case ID_SCRIPT:
                    case (ID_SCRIPT+ID_CLOSE_TAG):
                    case ID_NOSCRIPT:
                    case (ID_NOSCRIPT+ID_CLOSE_TAG):
                    case ID_STYLE:
                    case (ID_STYLE+ID_CLOSE_TAG):
                    case ID_LINK:
                    case (ID_LINK+ID_CLOSE_TAG):
                    case ID_OBJECT:
                    case (ID_OBJECT+ID_CLOSE_TAG):
                    case ID_TITLE:
                    case (ID_TITLE+ID_CLOSE_TAG):
                    case ID_BASE:
                    case (ID_BASE+ID_CLOSE_TAG):
                    case ID_HTML:
                    case ID_HEAD:
#if NOKIA_CHANGES
                    case ID_IMG:
#endif
                    case 0:
                    case (0 + ID_CLOSE_TAG ):
                        break;
                    default:
                        // Any other tag ends the search for a charset declaration.
                        body = true;
#ifdef DECODE_DEBUG
                        kdDebug( 6005 ) << "Decoder: no charset found. Id=" << id << endl;
#endif
                        goto found;
                    }
                }
                else
                    ptr++;
            }
            // Reached the end of the buffered data without a decision:
            // keep the bytes buffered and produce no output for now.
            return QString::null;
        }
    }

 found:
#if APPLE_CHANGES
    // Do the auto-detect if our default encoding is one of the Japanese ones.
    if (m_type != UserChosenEncoding && m_type != AutoDetectedEncoding && m_codec && m_codec->isJapanese())
#else
    if (m_type == DefaultEncoding && KGlobal::locale()->languageList()[0] == "ja")
#endif
    {
#ifdef DECODE_DEBUG
        kdDebug( 6005 ) << "Decoder: use auto-detect (" << strlen(data) << ")" << endl;
#endif
        const char *autoDetectedEncoding;
        switch ( KanjiCode::judge( data, len ) ) {
        case KanjiCode::JIS:
#if NOKIA_CHANGES
            autoDetectedEncoding = "jis_x0201-1997";
#else
            autoDetectedEncoding = "jis7";
#endif
            break;
        case KanjiCode::EUC:
#if NOKIA_CHANGES
            autoDetectedEncoding = "euc-jp" ;
#else
            autoDetectedEncoding = "eucjp" ;
#endif
            break;
        case KanjiCode::SJIS:
#if NOKIA_CHANGES
            autoDetectedEncoding = "shift_jis";
#else
            autoDetectedEncoding = "sjis";
#endif
            break;
        default:
            autoDetectedEncoding = NULL;
            break;
        }
#ifdef DECODE_DEBUG
        kdDebug( 6005 ) << "Decoder: auto detect encoding is "
            << (autoDetectedEncoding ? autoDetectedEncoding : "NULL") << endl;
#endif
        if (autoDetectedEncoding != 0) {
            setEncoding(autoDetectedEncoding, AutoDetectedEncoding);
        }
    }

    // if we still haven't found an encoding latin1 will be used...
    // this is according to HTML4.0 specs
    if (!m_codec)
    {
        if(enc.isEmpty()) enc = "iso8859-1";
        m_codec = QTextCodec::codecForName(enc);
        // be sure not to crash
        if(!m_codec) {
            enc = "iso8859-1";
            m_codec = QTextCodec::codecForName(enc);
        }
        m_decoder = m_codec->makeDecoder();
    }

    QString out;

#if APPLE_CHANGES
    if (!buffer.isEmpty()) {
        // When the head scan ran, this chunk was already appended above.
        if (!lookForMetaTag)
            buffer.append(data, len);
        out = m_decoder->toUnicode(buffer.latin1(), buffer.length());
        buffer.truncate(0);
    } else {
        out = m_decoder->toUnicode(data, len);
    }
#else
    if(!buffer.isEmpty() && enc != "ISO-10646-UCS-2") {
        out = m_decoder->toUnicode(buffer.latin1(), buffer.length());
        buffer = "";
    } else {
        if(m_codec->mibEnum() != 1000) // utf16
        {
            // ### hack for a bug in QTextCodec. It cut's the input stream
            // in case there are \0 in it. ZDNET has them inside... :-(
            char *d = const_cast<char *>(data);
            int i = len - 1;
            while(i >= 0) {
                if(*(d+i) == 0) *(d+i) = ' ';
                i--;
            }
        }
        out = m_decoder->toUnicode(data, len);
    }

    if (out.isNull()) {
        fprintf(stderr, "ERROR: decoded string is null\n");
    } else if (out.length() == 0) {
        fprintf(stderr, "ERROR: decoded string length == 0\n");
    }
    // the hell knows, why the output does sometimes have a QChar::null at
    // the end...
    else if(out[out.length()-1] == QChar::null)
        out.truncate(out.length() - 1);
#endif

    return out;
}
// Runs the bytes currently held in `buffer` through the active decoder and
// returns the decoded text. The buffer is left as it is; this method is
// const and does not clear it.
QString Decoder::flush() const
{
    const char *pendingBytes = buffer.latin1();
    const int pendingLength = buffer.length();

#if APPLE_CHANGES
    // NOTE(review): the extra `true` argument is presumably a "flush"
    // flag for the decoder; confirm against its declaration.
    return m_decoder->toUnicode(pendingBytes, pendingLength, true);
#else
    return m_decoder->toUnicode(pendingBytes, pendingLength);
#endif
}
// -----------------------------------------------------------------------------
#undef DECODE_DEBUG
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -