xmlutf8transcoder390.cpp

来自「IBM的解析xml的工具Xerces的源代码」· C++ 代码 · 共 632 行 · 第 1/2 页

CPP
632
字号
                break;            case 2 :                // UTF-8:   [1110 zzzz] [10yy yyyy] [10xx xxxx]                // Unicode: [zzzz yyyy] [yyxx xxxx]                //                if (( *srcPtr == 0xE0) && ( *(srcPtr+1) < 0xA0))                 {                    char byte0[2] = {*srcPtr    ,0};                    char byte1[2] = {*(srcPtr+1),0};                    ThrowXMLwithMemMgr2(UTFDataFormatException                                      , XMLExcepts::UTF8_Invalid_3BytesSeq                                      , byte0                                      , byte1                                      , getMemoryManager());                }                checkTrailingBytes(*(srcPtr+1), 2, 1);                checkTrailingBytes(*(srcPtr+2), 2, 2);                //                // D36 (a) UTF-8 is the Unicode Transformation Format that serializes                 //         a Unicode code point as a sequence of one to four bytes,                 //         as specified in Table 3.1, UTF-8 Bit Distribution.                //     (b) An illegal UTF-8 code unit sequence is any byte sequence that                 //         does not match the patterns listed in Table 3.1B, Legal UTF-8                 //         Byte Sequences.                //     (c) An irregular UTF-8 code unit sequence is a six-byte sequence                 //         where the first three bytes correspond to a high surrogate,                 //         and the next three bytes correspond to a low surrogate.                 //         As a consequence of C12, these irregular UTF-8 sequences shall                 //         not be generated by a conformant process.                 //                //irregular three bytes sequence                // that is zzzzyy matches leading surrogate tag 110110 or                 //                       trailing surrogate tag 110111                // *srcPtr=1110 1101                 // *(srcPtr+1)=1010 yyyy or                 // *(srcPtr+1)=1011 yyyy                //                // 0xED 1110 1101                // 0xA0 1010 0000                if ((*srcPtr == 0xED) && (*(srcPtr+1) >= 0xA0))                {                    char byte0[2] = {*srcPtr,    0};                    char byte1[2] = {*(srcPtr+1),0};                     ThrowXMLwithMemMgr2(UTFDataFormatException                              , XMLExcepts::UTF8_Irregular_3BytesSeq                              , byte0                              , byte1                              , getMemoryManager());                }                tmpVal = *srcPtr++;                tmpVal <<= 6;                tmpVal += *srcPtr++;                tmpVal <<= 6;                tmpVal += *srcPtr++;                break;            case 3 :                 // UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*                // Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)                //          [1101 11yy] [yyxx xxxx] (low surrogate)                //          * uuuuu = wwww + 1                //                if (((*srcPtr == 0xF0) && (*(srcPtr+1) < 0x90)) ||                    ((*srcPtr == 0xF4) && (*(srcPtr+1) > 0x8F))  )                {                    char byte0[2] = {*srcPtr    ,0};                    char byte1[2] = {*(srcPtr+1),0};                    ThrowXMLwithMemMgr2(UTFDataFormatException                                      , XMLExcepts::UTF8_Invalid_4BytesSeq                                      , byte0                                      , byte1                                      , getMemoryManager());                }                checkTrailingBytes(*(srcPtr+1), 3, 1);                checkTrailingBytes(*(srcPtr+2), 3, 2);                checkTrailingBytes(*(srcPtr+3), 3, 3);                                tmpVal = *srcPtr++;                tmpVal <<= 6;                tmpVal += *srcPtr++;                tmpVal <<= 6;                tmpVal += *srcPtr++;                tmpVal <<= 6;                tmpVal += *srcPtr++;                break;            default: // trailingBytes > 3                /***                 * The definition of UTF-8 in Annex D of ISO/IEC 10646-1:2000 also allows                  * for the use of five- and six-byte sequences to encode characters that                  * are outside the range of the Unicode character set; those five- and                  * six-byte sequences are illegal for the use of UTF-8 as a transformation                  * of Unicode characters. ISO/IEC 10646 does not allow mapping of unpaired                  * surrogates, nor U+FFFE and U+FFFF (but it does allow other noncharacters).                 ***/                char len[2]  = {(char)(trailingBytes+0x31), 0};                char byte[2] = {*srcPtr,0};                ThrowXMLwithMemMgr2(UTFDataFormatException                                  , XMLExcepts::UTF8_Exceede_BytesLimit                                  , byte                                  , len                                  , getMemoryManager());                break;        }        // since trailingBytes comes from an array, this logic is redundant        //  default :        //      ThrowXML(TranscodingException, XMLExcepts::Trans_BadSrcSeq);        //}        tmpVal -= gUTFOffsets[trailingBytes];        //        //  If it will fit into a single char, then put it in. Otherwise        //  encode it as a surrogate pair. If its not valid, use the        //  replacement char.        //        if (!(tmpVal & 0xFFFF0000))        {            *sizePtr++ = trailingBytes + 1;            *outPtr++ = XMLCh(tmpVal);        }         else if (tmpVal > 0x10FFFF)        {            //            //  If we've gotten more than 32 chars so far, then just break            //  out for now and lets process those. When we come back in            //  here again, we'll get no chars and throw an exception. This            //  way, the error will have a line and col number closer to            //  the real problem area.            //            if ((outPtr - toFill) > 32)                break;            ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Trans_BadSrcSeq, getMemoryManager());        }         else        {            //            //  If we have enough room to store the leading and trailing            //  chars, then lets do it. Else, pretend this one never            //  happened, and leave it for the next time. Since we don't            //  update the bytes read until the bottom of the loop, by            //  breaking out here its like it never happened.            //            if (outPtr + 1 >= outEnd)                break;            // Store the leading surrogate char            tmpVal -= 0x10000;            *sizePtr++ = trailingBytes + 1;            *outPtr++ = XMLCh((tmpVal >> 10) + 0xD800);            //            //  And then the trailing char. This one accounts for no            //  bytes eaten from the source, so set the char size for this            //  one to be zero.            //            *sizePtr++ = 0;            *outPtr++ = XMLCh((tmpVal & 0x3FF) + 0xDC00);        }    }    // Update the bytes eaten    bytesEaten = srcPtr - srcData;    // Return the characters read    return outPtr - toFill;}unsigned intXMLUTF8Transcoder390::transcodeTo( const   XMLCh* const    srcData                                , const unsigned int    srcCount                                ,       XMLByte* const  toFill                                , const unsigned int    maxBytes                                ,       unsigned int&   charsEaten                                , const UnRepOpts       options){    // Watch for pathological scenario. Shouldn't happen, but...    if (!srcCount || !maxBytes)        return 0;    //    //  Get pointers to our start and end points of the input and output    //  buffers.    //    const XMLCh*    srcPtr = srcData;    const XMLCh*    srcEnd = srcPtr + srcCount;    XMLByte*        outPtr = toFill;    XMLByte*        outEnd = toFill + maxBytes;    while (srcPtr < srcEnd)    {        //        //  Tentatively get the next char out. We have to get it into a        //  32 bit value, because it could be a surrogate pair.        //        XMLUInt32 curVal = *srcPtr;        //        //  If its a leading surrogate, then lets see if we have the trailing        //  available. If not, then give up now and leave it for next time.        //        unsigned int srcUsed = 1;        if ((curVal >= 0xD800) && (curVal <= 0xDBFF))        {            if (srcPtr + 1 >= srcEnd)                break;            // Create the composite surrogate pair            curVal = ((curVal - 0xD800) << 10)                    + ((*(srcPtr + 1) - 0xDC00) + 0x10000);            // And indicate that we ate another one            srcUsed++;        }        // Figure out how many bytes we need        unsigned int encodedBytes;        if (curVal < 0x80)            encodedBytes = 1;        else if (curVal < 0x800)            encodedBytes = 2;        else if (curVal < 0x10000)            encodedBytes = 3;        else if (curVal < 0x110000)            encodedBytes = 4;        else        {            // If the options say to throw, then throw            if (options == UnRep_Throw)            {                XMLCh tmpBuf[17];                XMLString::binToText(curVal, tmpBuf, 16, 16, getMemoryManager());                ThrowXMLwithMemMgr2                (                    TranscodingException                    , XMLExcepts::Trans_Unrepresentable                    , tmpBuf                    , getEncodingName()                    , getMemoryManager()                );            }            // Else, use the replacement character            *outPtr++ = chSpace;            srcPtr += srcUsed;            continue;        }        //        //  If we cannot fully get this char into the output buffer,        //  then leave it for the next time.        //        if (outPtr + encodedBytes > outEnd)            break;        // We can do it, so update the source index        srcPtr += srcUsed;        //        //  And spit out the bytes. We spit them out in reverse order        //  here, so bump up the output pointer and work down as we go.        //        outPtr += encodedBytes;        switch(encodedBytes)        {            case 6 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);                     curVal >>= 6;            case 5 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);                     curVal >>= 6;            case 4 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);                     curVal >>= 6;            case 3 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);                     curVal >>= 6;            case 2 : *--outPtr = XMLByte((curVal | 0x80UL) & 0xBFUL);                     curVal >>= 6;            case 1 : *--outPtr = XMLByte                     (                        curVal | gFirstByteMark[encodedBytes]                     );        }        // Add the encoded bytes back in again to indicate we've eaten them        outPtr += encodedBytes;    }    // Fill in the chars we ate    charsEaten = (srcPtr - srcData);    // And return the bytes we filled in    return (outPtr - toFill);}bool XMLUTF8Transcoder390::canTranscodeTo(const unsigned int toCheck) const{    // We can represent anything in the Unicode (with surrogates) range    return (toCheck <= 0x10FFFF);}XERCES_CPP_NAMESPACE_END

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?