textresourcedecoder.cpp

来自「linux下开源浏览器WebKit的源码,市面上的很多商用浏览器都是移植自WebKit」· C++ 代码 · 共 802 行 · 第 1/2 页
CPP
802 行
/*
    Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Apple Inc. All rights reserved.
    Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public License
    along with this library; see the file COPYING.LIB.  If not, write to
    the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
    Boston, MA 02110-1301, USA.
*/

#include "config.h"
#include "TextResourceDecoder.h"

#include "DOMImplementation.h"
#include "HTMLNames.h"
#include "TextCodec.h"
#include <wtf/ASCIICType.h>
#include <wtf/StringExtras.h>

using namespace WTF;

namespace WebCore {

using namespace HTMLNames;

// You might think we should put these find functions elsewhere, perhaps with the
// similar functions that operate on UChar, but arguably only the decoder has
// a reason to process strings of char rather than UChar.

// Returns the index of the first byte-exact occurrence of the NUL-terminated
// |target| within the first |subjectLength| bytes of |subject|, or -1 if there
// is none. An empty |target| matches at index 0.
static int find(const char* subject, size_t subjectLength, const char* target)
{
    size_t targetLength = strlen(target);
    // The guard also keeps the unsigned subtraction below from wrapping around.
    if (targetLength > subjectLength)
        return -1;
    for (size_t i = 0; i <= subjectLength - targetLength; ++i) {
        bool match = true;
        for (size_t j = 0; j < targetLength; ++j) {
            if (subject[i + j] != target[j]) {
                match = false;
                break;
            }
        }
        if (match)
            return i;
    }
    return -1;
}

// Same as find(), except that each byte of |subject| is passed through
// toASCIILower() before it is compared. |target| is compared as-is, so it must
// already be lower case; debug builds assert this.
static int findIgnoringCase(const char* subject, size_t subjectLength, const char* target)
{
    size_t targetLength = strlen(target);
    // The guard also keeps the unsigned subtraction below from wrapping around.
    if (targetLength > subjectLength)
        return -1;
#ifndef NDEBUG
    for (size_t i = 0; i < targetLength; ++i)
        ASSERT(isASCIILower(target[i]));
#endif
    for (size_t i = 0; i <= subjectLength - targetLength; ++i) {
        bool match = true;
        for (size_t j = 0; j < targetLength; ++j) {
            if (toASCIILower(subject[i + j]) != target[j]) {
                match = false;
                break;
            }
        }
        if (match)
            return i;
    }
    return -1;
}

// Builds a TextEncoding from an encoding name that is not NUL-terminated.
// The first |length| bytes of |encodingName| are copied into a temporary,
// NUL-terminated buffer and the result is constructed from that C string.
static TextEncoding findTextEncoding(const char* encodingName, int length)
{
    Vector<char, 64> buffer(length + 1);
    memcpy(buffer.data(), encodingName, length);
    buffer[length] = '\0';
    return buffer.data();
}

// Heuristic classifier for the byte encoding of Japanese text; see judge().
class KanjiCode {
public:
    enum Type { ASCII, JIS, EUC, SJIS, UTF16, UTF8 };
    static enum Type judge(const char* str, int length);
    static const int ESC = 0x1b;
    // Per-byte flags: bit 0 (value 1) is set for 0x81-0x9f and 0xe0-0xfc,
    // bit 1 (value 2) is set for 0xa1-0xdf. All other entries are 0.
    static const unsigned char sjisMap[256];
    // Non-zero if |code| is a byte value whose sjisMap entry has bit 0 set.
    static int ISkanji(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 1;
    }
    // Non-zero if |code| is a byte value whose sjisMap entry has bit 1 set.
    static int ISkana(int code)
    {
        if (code >= 0x100)
            return 0;
        return sjisMap[code & 0xff] & 2;
    }
};

const unsigned char KanjiCode::sjisMap[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0
};

/*
 * EUC-JP is
 *     [0xa1 - 0xfe][0xa1 - 0xfe]
 *     0x8e[0xa1 - 0xfe](SS2)
 *     0x8f[0xa1 - 0xfe][0xa1 - 0xfe](SS3)
 *
 * Shift_Jis is
 *     [0x81 - 0x9f, 0xe0 - 0xef(0xfe?)][0x40 - 0x7e, 0x80 - 0xfc]
 *
 * Shift_Jis Hankaku Kana is
 *     [0xa1 - 0xdf]
 */

/*
 * KanjiCode::judge() is based on judge_jcode() from jvim
 *     http://hp.vector.co.jp/authors/VA003457/vim/
 *
 * Special Thanks to Kenichi Tsuchida
 */

// Scans the first |size| bytes of |str| and guesses which encoding they use.
//
// The scan stops and returns immediately (via breakBreak) as soon as it sees a
// byte sequence it treats as decisive: certain ESC sequences give JIS, and
// byte pairs taken to be possible in only one of the two encodings give SJIS
// or EUC. Otherwise it keeps two scores, |sjis| and |euc|, that are bumped for
// byte pairs that look like kana or punctuation in the respective encoding.
// If the loop ends with |code| still ASCII, the higher score wins; on a tie
// the result stays ASCII. This function never returns UTF16 or UTF8.
enum KanjiCode::Type KanjiCode::judge(const char* str, int size)
{
    enum Type code;
    int i;
    int bfr = false;            /* Kana Moji */
    int bfk = 0;                /* EUC Kana */
    int sjis = 0;
    int euc = 0;

    // Work on unsigned bytes so the range comparisons below behave the same
    // whether or not plain char is signed.
    const unsigned char* ptr = reinterpret_cast<const unsigned char*>(str);

    code = ASCII;

    i = 0;
    while (i < size) {
        if (ptr[i] == ESC && (size - i >= 3)) {
            if ((ptr[i + 1] == '$' && ptr[i + 2] == 'B')
                || (ptr[i + 1] == '(' && ptr[i + 2] == 'B')) {
                code = JIS;
                goto breakBreak;
            } else if ((ptr[i + 1] == '$' && ptr[i + 2] == '@')
                || (ptr[i + 1] == '(' && ptr[i + 2] == 'J')) {
                code = JIS;
                goto breakBreak;
            } else if (ptr[i + 1] == '(' && ptr[i + 2] == 'I') {
                code = JIS;
                i += 3;
            } else if (ptr[i + 1] == ')' && ptr[i + 2] == 'I') {
                code = JIS;
                i += 3;
            } else {
                i++;
            }
            bfr = false;
            bfk = 0;
        } else {
            if (ptr[i] < 0x20) {
                bfr = false;
                bfk = 0;
                /* ?? check kudokuten ?? && ?? hiragana ?? */
                // Look back at the two bytes that precede this control character.
                if ((i >= 2) && (ptr[i - 2] == 0x81)
                    && (0x41 <= ptr[i - 1] && ptr[i - 1] <= 0x49)) {
                    code = SJIS;
                    sjis += 100;        /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa1)
                    && (0xa2 <= ptr[i - 1] && ptr[i - 1] <= 0xaa)) {
                    code = EUC;
                    euc += 100;         /* kudokuten */
                } else if ((i >= 2) && (ptr[i - 2] == 0x82) && (0xa0 <= ptr[i - 1])) {
                    sjis += 40;         /* hiragana */
                } else if ((i >= 2) && (ptr[i - 2] == 0xa4) && (0xa0 <= ptr[i - 1])) {
                    euc += 40;          /* hiragana */
                }
            } else {
                /* ?? check hiragana or katana ?? */
                if ((size - i > 1) && (ptr[i] == 0x82) && (0xa0 <= ptr[i + 1])) {
                    sjis++;     /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0x83)
                    && (0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x9f)) {
                    sjis++;     /* katakana */
                } else if ((size - i > 1) && (ptr[i] == 0xa4) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* hiragana */
                } else if ((size - i > 1) && (ptr[i] == 0xa5) && (0xa0 <= ptr[i + 1])) {
                    euc++;      /* katakana */
                }
                if (bfr) {
                    // |bfr| was set by the previous byte, so ptr[i - 1] exists
                    // and is examined together with the current byte.
                    if ((i >= 1) && (0x40 <= ptr[i] && ptr[i] <= 0xa0) && ISkanji(ptr[i - 1])) {
                        code = SJIS;
                        goto breakBreak;
                    } else if ((i >= 1) && (0x81 <= ptr[i - 1] && ptr[i - 1] <= 0x9f)
                        && ((0x40 <= ptr[i] && ptr[i] < 0x7e) || (0x7e < ptr[i] && ptr[i] <= 0xfc))) {
                        code = SJIS;
                        goto breakBreak;
                    } else if ((i >= 1) && (0xfd <= ptr[i] && ptr[i] <= 0xfe)
                        && (0xa1 <= ptr[i - 1] && ptr[i - 1] <= 0xfe)) {
                        code = EUC;
                        goto breakBreak;
                    } else if ((i >= 1) && (0xfd <= ptr[i - 1] && ptr[i - 1] <= 0xfe)
                        && (0xa1 <= ptr[i] && ptr[i] <= 0xfe)) {
                        code = EUC;
                        goto breakBreak;
                    } else if ((i >= 1) && (ptr[i] < 0xa0 || 0xdf < ptr[i]) && (0x8e == ptr[i - 1])) {
                        code = SJIS;
                        goto breakBreak;
                    } else if (ptr[i] <= 0x7f) {
                        code = SJIS;
                        goto breakBreak;
                    } else {
                        if (0xa1 <= ptr[i] && ptr[i] <= 0xa6) {
                            euc++;      /* sjis hankaku kana kigo */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xdf) {
                            ;           /* sjis hankaku kana */
                        } else if (0xa1 <= ptr[i] && ptr[i] <= 0xfe) {
                            euc++;
                        } else if (0x8e == ptr[i]) {
                            euc++;
                        } else if (0x20 <= ptr[i] && ptr[i] <= 0x7f) {
                            sjis++;
                        }
                        bfr = false;
                        bfk = 0;
                    }
                } else if (0x8e == ptr[i]) {
                    if (size - i <= 1) {
                        ;
                    } else if (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xdf) {
                        /* EUC KANA or SJIS KANJI */
                        if (bfk == 1) {
                            euc += 100;
                        }
                        bfk++;
                        i++;
                    } else {
                        /* SJIS only */
                        code = SJIS;
                        goto breakBreak;
                    }
                } else if (0x81 <= ptr[i] && ptr[i] <= 0x9f) {
                    /* SJIS only */
                    code = SJIS;
                    // Only peek at ptr[i + 1] when it is inside the buffer. The
                    // previous guard (size - i >= 1) is always true within this
                    // loop, so a trailing lead byte caused a read one past the
                    // end. The result is unaffected: |code| is already SJIS.
                    if ((size - i > 1)
                        && ((0x40 <= ptr[i + 1] && ptr[i + 1] <= 0x7e)
                        || (0x80 <= ptr[i + 1] && ptr[i + 1] <= 0xfc))) {
                        goto breakBreak;
                    }
                } else if (0xfd <= ptr[i] && ptr[i] <= 0xfe) {
                    /* EUC only */
                    code = EUC;
                    // Same bounds fix as above: never read past the last byte.
                    if ((size - i > 1)
                        && (0xa1 <= ptr[i + 1] && ptr[i + 1] <= 0xfe)) {
                        goto breakBreak;
                    }
                } else if (ptr[i] <= 0x7f) {
                    ;
                } else {
                    bfr = true;
                    bfk = 0;
                }
            }
            i++;
        }
    }
    // Nothing decisive was seen; fall back to whichever score is higher.
    if (code == ASCII) {
        if (sjis > euc) {
            code = SJIS;
        } else if (sjis < euc) {
            code = EUC;
        }
    }
breakBreak:
    return (code);
}

// Maps a MIME type to the decoder's content type: "text/css" and "text/html"
// (compared ignoring case) map to CSS and HTML, anything accepted by
// DOMImplementation::isXMLMIMEType() maps to XML, and everything else is
// PlainText.
TextResourceDecoder::ContentType TextResourceDecoder::determineContentType(const String& mimeType)
{
    if (equalIgnoringCase(mimeType, "text/css"))
        return CSS;
    if (equalIgnoringCase(mimeType, "text/html"))
        return HTML;
    if (DOMImplementation::isXMLMIMEType(mimeType))
        return XML;
    return PlainText;
}

// Chooses the encoding to start with: UTF-8 for XML, otherwise the specified
// default if it is valid, otherwise Latin-1.
const TextEncoding& TextResourceDecoder::defaultEncoding(ContentType contentType, const TextEncoding& specifiedDefaultEncoding)
{
    // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
    // for text/xml. This matches Firefox.
    if (contentType == XML)
        return UTF8Encoding();
    if (!specifiedDefaultEncoding.isValid())
        return Latin1Encoding();
    return specifiedDefaultEncoding;
}

TextResourceDecoder::TextResourceDecoder(const String& mimeType, const TextEncoding& specifiedDefaultEncoding)
    : m_contentType(determineContentType(mimeType))
    , m_decoder(defaultEncoding(m_contentType, specifiedDefaultEncoding))
    , m_source(DefaultEncoding)
    , m_checkedForBOM(false)
    , m_checkedForCSSCharset(false)
    , m_checkedForHeadCharset(false)
    , m_useLenientXMLDecoding(false)
    , m_sawError(false)
{
}

TextResourceDecoder::~TextResourceDecoder()
{
}

// Switches the decoder to |encoding| and records |source| as where the
// encoding came from. An invalid |encoding| is ignored entirely: neither the
// decoder nor m_source changes.
void TextResourceDecoder::setEncoding(const TextEncoding& encoding, EncodingSource source)
{
    // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
    if (!encoding.isValid())
        return;

    // When encoding comes from meta tag (i.e. it cannot be XML files sent via XHR),
    // treat x-user-defined as windows-1252 (bug 18270)
    if (source == EncodingFromMetaTag && strcasecmp(encoding.name(), "x-user-defined") == 0)
        m_decoder.reset("windows-1252");
    else if (source == EncodingFromMetaTag || source == EncodingFromXMLHeader || source == EncodingFromCSSCharset)
        // Encodings declared inside the document itself are replaced by their
        // closest byte-based equivalent.
        m_decoder.reset(encoding.closestByteBasedEquivalent());
    else
        m_decoder.reset(encoding);

    m_source = source;
}

// Returns the position of the encoding string.
//
// Looks for `encoding = "name"` (single or double quotes) in the first |len|
// bytes of |str|. On success, returns the index of the first byte of the name
// and stores the name's length in |encodingLength|. Returns -1, leaving
// |encodingLength| untouched, if the keyword, the equals sign, or a properly
// closed quoted value is missing.
static int findXMLEncoding(const char* str, int len, int& encodingLength)
{
    int pos = find(str, len, "encoding");
    if (pos == -1)
        return -1;
    pos += 8; // strlen("encoding")

    // Skip spaces and stray control characters.
    // NOTE(review): where plain char is signed, bytes >= 0x80 also compare
    // <= ' ' and are skipped here - confirm that this is intended.
    while (pos < len && str[pos] <= ' ')
        ++pos;

    // Skip equals sign.
    if (pos >= len || str[pos] != '=')
        return -1;
    ++pos;

    // Skip spaces and stray control characters.
    while (pos < len && str[pos] <= ' ')
        ++pos;

    // Skip quotation mark.
    if (pos >= len)
        return -1;
    char quoteMark = str[pos];
    if (quoteMark != '"' && quoteMark != '\'')
        return -1;
    ++pos;

    // Find the trailing quotation mark.
    int end = pos;
    while (end < len && str[end] != quoteMark)
        ++end;
    if (end >= len)
        return -1;

    encodingLength = end - pos;
    return pos;
}

// true if there is more to parse
static inline bool skipWhitespace(const char*& pos, const char* dataEnd)
{
    while (pos < dataEnd && (*pos == '\t' || *pos == ' '))
        ++pos;
textresourcedecoder.cpp - 源码说明

本页面展示了「linux下开源浏览器WebKit的源码,市面上的很多商用浏览器都是移植自WebKit」中的 textresourcedecoder.cpp 源码文件，采用 C++ 编程语言编写，共 802 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与WebKit相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?