xmlutf8transcoder.cpp

来自「IBM的解析xml的工具Xerces的源代码」· C++ 代码 · 共 555 行 · 第 1/2 页

CPP
555
字号
/* * Copyright 1999-2004 The Apache Software Foundation. *  * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at *  *      http://www.apache.org/licenses/LICENSE-2.0 *  * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *//** * $Id: XMLUTF8Transcoder.cpp,v 1.10 2004/09/08 13:56:24 peiyongz Exp $ */// ---------------------------------------------------------------------------//  Includes// ---------------------------------------------------------------------------#include <xercesc/util/TranscodingException.hpp>#include <xercesc/util/XMLString.hpp>#include <xercesc/util/XMLUniDefs.hpp>#include <xercesc/util/XMLUTF8Transcoder.hpp>XERCES_CPP_NAMESPACE_BEGIN// ---------------------------------------------------------------------------//  Local static data////  gUTFBytes//      A list of counts of trailing bytes for each initial byte in the input.////  gUTFByteIndicator//      For a UTF8 sequence of n bytes, n>=2, the first byte of the//      sequence must contain n 1's followed by precisely 1 0 with the//      rest of the byte containing arbitrary bits.  This array stores//      the required bit pattern for validity checking.//  gUTFByteIndicatorTest//      When bitwise and'd with the observed value, if the observed//      value is correct then a result matching gUTFByteIndicator will//      be produced.////  gUTFOffsets//      A list of values to offset each result char type, according to how//      many source bytes when into making it.////  gFirstByteMark//      A list of values to mask onto the first byte of an encoded sequence,//      indexed by the number of bytes used to create the sequence.// ---------------------------------------------------------------------------static const XMLByte gUTFBytes[256] ={        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0    ,   0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1    ,   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1    ,   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2    ,   3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5};static const XMLByte gUTFByteIndicator[6] ={    0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};static const XMLByte gUTFByteIndicatorTest[6] ={    0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE};static const XMLUInt32 gUTFOffsets[6] ={    0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080};static const XMLByte gFirstByteMark[7] ={    0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};// ---------------------------------------------------------------------------//  XMLUTF8Transcoder: Constructors and Destructor// ---------------------------------------------------------------------------XMLUTF8Transcoder::XMLUTF8Transcoder(const  XMLCh* const    encodingName                                    , const unsigned int    blockSize                                    , MemoryManager* const  manager):XMLTranscoder(encodingName, blockSize, manager){}XMLUTF8Transcoder::~XMLUTF8Transcoder(){}// ---------------------------------------------------------------------------//  XMLUTF8Transcoder: Implementation of the transcoder API// ---------------------------------------------------------------------------unsigned intXMLUTF8Transcoder::transcodeFrom(const  XMLByte* const          srcData                                , const unsigned int            srcCount                                ,       XMLCh* const            toFill                                , const unsigned int            maxChars                                ,       unsigned int&           bytesEaten                                ,       unsigned char* const    charSizes){    // Watch for pathological scenario. Shouldn't happen, but...    if (!srcCount || !maxChars)        return 0;    // If debugging, make sure that the block size is legal    #if defined(XERCES_DEBUG)    checkBlockSize(maxChars);    #endif    //    //  Get pointers to our start and end points of the input and output    //  buffers.    //    const XMLByte*  srcPtr = srcData;    const XMLByte*  srcEnd = srcPtr + srcCount;    XMLCh*          outPtr = toFill;    XMLCh*          outEnd = outPtr + maxChars;    unsigned char*  sizePtr = charSizes;    //    //  We now loop until we either run out of input data, or room to store    //  output chars.    //    while ((srcPtr < srcEnd) && (outPtr < outEnd))    {        // Special-case ASCII, which is a leading byte value of <= 127        if (*srcPtr <= 127)        {            *outPtr++ = XMLCh(*srcPtr++);            *sizePtr++ = 1;            continue;        }        // See how many trailing src bytes this sequence is going to require        const unsigned int trailingBytes = gUTFBytes[*srcPtr];        //        //  If there are not enough source bytes to do this one, then we        //  are done. Note that we done >= here because we are implicitly        //  counting the 1 byte we get no matter what.        //        //  If we break out here, then there is nothing to undo since we        //  haven't updated any pointers yet.        //        if (srcPtr + trailingBytes >= srcEnd)            break;        // Looks ok, so lets build up the value        // or at least let's try to do so--remembering that        // we cannot assume the encoding to be valid:        // first, test first byte        if((gUTFByteIndicatorTest[trailingBytes] & *srcPtr) != gUTFByteIndicator[trailingBytes]) {            char pos[2] = {(char)0x31, 0};             char len[2] = {(char)(trailingBytes+0x31), 0};            char byte[2] = {*srcPtr,0};            ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager());        }        /***         * http://www.unicode.org/reports/tr27/         *         * Table 3.1B. lists all of the byte sequences that are legal in UTF-8.          * A range of byte values such as A0..BF indicates that any byte from A0 to BF (inclusive)          * is legal in that position.          * Any byte value outside of the ranges listed is illegal.          * For example,          * the byte sequence <C0 AF> is illegal  since C0 is not legal in the 1st Byte column.          * The byte sequence <E0 9F 80> is illegal since in the row          *    where E0 is legal as a first byte,          *    9F is not legal as a second byte.          * The byte sequence <F4 80 83 92> is legal, since every byte in that sequence matches          * a byte range in a row of the table (the last row).          *         *         * Table 3.1B. Legal UTF-8 Byte Sequences           * Code Points              1st Byte    2nd Byte    3rd Byte    4th Byte          * =========================================================================         * U+0000..U+007F            00..7F                * -------------------------------------------------------------------------         * U+0080..U+07FF            C2..DF      80..BF               *         * -------------------------------------------------------------------------         * U+0800..U+0FFF            E0          A0..BF     80..BF            *                                       --          *                                   * U+1000..U+FFFF            E1..EF      80..BF     80..BF             *         * --------------------------------------------------------------------------         * U+10000..U+3FFFF          F0          90..BF     80..BF       80..BF          *                                       --         * U+40000..U+FFFFF          F1..F3      80..BF     80..BF       80..BF          * U+100000..U+10FFFF        F4          80..8F     80..BF       80..BF          *                                           --         * ==========================================================================         *         *  Cases where a trailing byte range is not 80..BF are underlined in the table to          *  draw attention to them. These occur only in the second byte of a sequence.         *         ***/        XMLUInt32 tmpVal = 0;        switch(trailingBytes)        {            case 1 :                // UTF-8:   [110y yyyy] [10xx xxxx]                // Unicode: [0000 0yyy] [yyxx xxxx]                //                // 0xC0, 0xC1 has been filtered out                             checkTrailingBytes(*(srcPtr+1), 1, 1);                tmpVal = *srcPtr++;                tmpVal <<= 6;                tmpVal += *srcPtr++;                break;            case 2 :                // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]                // Unicode: [zzzz yyyy] [yyxx xxxx]                //                if (( *srcPtr == 0xE0) && ( *(srcPtr+1) < 0xA0))                 {                    char byte0[2] = {*srcPtr    ,0};                    char byte1[2] = {*(srcPtr+1),0};                    ThrowXMLwithMemMgr2(UTFDataFormatException                                      , XMLExcepts::UTF8_Invalid_3BytesSeq                                      , byte0                                      , byte1                                      , getMemoryManager());                }                checkTrailingBytes(*(srcPtr+1), 2, 1);                checkTrailingBytes(*(srcPtr+2), 2, 2);                //                // D36 (a) UTF-8 is the Unicode Transformation Format that serializes                 //         a Unicode code point as a sequence of one to four bytes,                 //         as specified in Table 3.1, UTF-8 Bit Distribution.                //     (b) An illegal UTF-8 code unit sequence is any byte sequence that                 //         does not match the patterns listed in Table 3.1B, Legal UTF-8                 //         Byte Sequences.                //     (c) An irregular UTF-8 code unit sequence is a six-byte sequence                 //         where the first three bytes correspond to a high surrogate,                 //         and the next three bytes correspond to a low surrogate.                 //         As a consequence of C12, these irregular UTF-8 sequences shall                 //         not be generated by a conformant process.                 //                //irregular three bytes sequence                // that is zzzzyy matches leading surrogate tag 110110 or                 //                       trailing surrogate tag 110111                // *srcPtr=1110 1101                 // *(srcPtr+1)=1010 yyyy or                 // *(srcPtr+1)=1011 yyyy

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?