xmlutf8transcoder.cpp
来自「IBM的解析xml的工具Xerces的源代码」· C++ 代码 · 共 555 行 · 第 1/2 页
CPP
555 行
/* * Copyright 1999-2004 The Apache Software Foundation. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *//** * $Id: XMLUTF8Transcoder.cpp,v 1.10 2004/09/08 13:56:24 peiyongz Exp $ */// ---------------------------------------------------------------------------// Includes// ---------------------------------------------------------------------------#include <xercesc/util/TranscodingException.hpp>#include <xercesc/util/XMLString.hpp>#include <xercesc/util/XMLUniDefs.hpp>#include <xercesc/util/XMLUTF8Transcoder.hpp>XERCES_CPP_NAMESPACE_BEGIN// ---------------------------------------------------------------------------// Local static data//// gUTFBytes// A list of counts of trailing bytes for each initial byte in the input.//// gUTFByteIndicator// For a UTF8 sequence of n bytes, n>=2, the first byte of the// sequence must contain n 1's followed by precisely 1 0 with the// rest of the byte containing arbitrary bits. This array stores// the required bit pattern for validity checking.// gUTFByteIndicatorTest// When bitwise and'd with the observed value, if the observed// value is correct then a result matching gUTFByteIndicator will// be produced.//// gUTFOffsets// A list of values to offset each result char type, according to how// many source bytes when into making it.//// gFirstByteMark// A list of values to mask onto the first byte of an encoded sequence,// indexed by the number of bytes used to create the sequence.// ---------------------------------------------------------------------------static const XMLByte gUTFBytes[256] ={ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 , 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 , 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 , 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5};static const XMLByte gUTFByteIndicator[6] ={ 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};static const XMLByte gUTFByteIndicatorTest[6] ={ 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE};static const XMLUInt32 gUTFOffsets[6] ={ 0, 0x3080, 0xE2080, 0x3C82080, 0xFA082080, 0x82082080};static const XMLByte gFirstByteMark[7] ={ 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};// ---------------------------------------------------------------------------// XMLUTF8Transcoder: Constructors and Destructor// ---------------------------------------------------------------------------XMLUTF8Transcoder::XMLUTF8Transcoder(const XMLCh* const encodingName , const unsigned int blockSize , MemoryManager* const manager):XMLTranscoder(encodingName, blockSize, manager){}XMLUTF8Transcoder::~XMLUTF8Transcoder(){}// ---------------------------------------------------------------------------// XMLUTF8Transcoder: Implementation of the transcoder API// ---------------------------------------------------------------------------unsigned intXMLUTF8Transcoder::transcodeFrom(const XMLByte* const srcData , const unsigned int srcCount , XMLCh* const toFill , const unsigned int maxChars , unsigned int& bytesEaten , unsigned char* const charSizes){ // Watch for pathological scenario. Shouldn't happen, but... if (!srcCount || !maxChars) return 0; // If debugging, make sure that the block size is legal #if defined(XERCES_DEBUG) checkBlockSize(maxChars); #endif // // Get pointers to our start and end points of the input and output // buffers. // const XMLByte* srcPtr = srcData; const XMLByte* srcEnd = srcPtr + srcCount; XMLCh* outPtr = toFill; XMLCh* outEnd = outPtr + maxChars; unsigned char* sizePtr = charSizes; // // We now loop until we either run out of input data, or room to store // output chars. // while ((srcPtr < srcEnd) && (outPtr < outEnd)) { // Special-case ASCII, which is a leading byte value of <= 127 if (*srcPtr <= 127) { *outPtr++ = XMLCh(*srcPtr++); *sizePtr++ = 1; continue; } // See how many trailing src bytes this sequence is going to require const unsigned int trailingBytes = gUTFBytes[*srcPtr]; // // If there are not enough source bytes to do this one, then we // are done. Note that we done >= here because we are implicitly // counting the 1 byte we get no matter what. // // If we break out here, then there is nothing to undo since we // haven't updated any pointers yet. // if (srcPtr + trailingBytes >= srcEnd) break; // Looks ok, so lets build up the value // or at least let's try to do so--remembering that // we cannot assume the encoding to be valid: // first, test first byte if((gUTFByteIndicatorTest[trailingBytes] & *srcPtr) != gUTFByteIndicator[trailingBytes]) { char pos[2] = {(char)0x31, 0}; char len[2] = {(char)(trailingBytes+0x31), 0}; char byte[2] = {*srcPtr,0}; ThrowXMLwithMemMgr3(UTFDataFormatException, XMLExcepts::UTF8_FormatError, pos, byte, len, getMemoryManager()); } /*** * http://www.unicode.org/reports/tr27/ * * Table 3.1B. lists all of the byte sequences that are legal in UTF-8. * A range of byte values such as A0..BF indicates that any byte from A0 to BF (inclusive) * is legal in that position. * Any byte value outside of the ranges listed is illegal. * For example, * the byte sequence <C0 AF> is illegal since C0 is not legal in the 1st Byte column. * The byte sequence <E0 9F 80> is illegal since in the row * where E0 is legal as a first byte, * 9F is not legal as a second byte. * The byte sequence <F4 80 83 92> is legal, since every byte in that sequence matches * a byte range in a row of the table (the last row). * * * Table 3.1B. Legal UTF-8 Byte Sequences * Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte * ========================================================================= * U+0000..U+007F 00..7F * ------------------------------------------------------------------------- * U+0080..U+07FF C2..DF 80..BF * * ------------------------------------------------------------------------- * U+0800..U+0FFF E0 A0..BF 80..BF * -- * * U+1000..U+FFFF E1..EF 80..BF 80..BF * * -------------------------------------------------------------------------- * U+10000..U+3FFFF F0 90..BF 80..BF 80..BF * -- * U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF * U+100000..U+10FFFF F4 80..8F 80..BF 80..BF * -- * ========================================================================== * * Cases where a trailing byte range is not 80..BF are underlined in the table to * draw attention to them. These occur only in the second byte of a sequence. * ***/ XMLUInt32 tmpVal = 0; switch(trailingBytes) { case 1 : // UTF-8: [110y yyyy] [10xx xxxx] // Unicode: [0000 0yyy] [yyxx xxxx] // // 0xC0, 0xC1 has been filtered out checkTrailingBytes(*(srcPtr+1), 1, 1); tmpVal = *srcPtr++; tmpVal <<= 6; tmpVal += *srcPtr++; break; case 2 : // UTF-8: [1110 zzzz] [10yy yyyy] [10xx xxxx] // Unicode: [zzzz yyyy] [yyxx xxxx] // if (( *srcPtr == 0xE0) && ( *(srcPtr+1) < 0xA0)) { char byte0[2] = {*srcPtr ,0}; char byte1[2] = {*(srcPtr+1),0}; ThrowXMLwithMemMgr2(UTFDataFormatException , XMLExcepts::UTF8_Invalid_3BytesSeq , byte0 , byte1 , getMemoryManager()); } checkTrailingBytes(*(srcPtr+1), 2, 1); checkTrailingBytes(*(srcPtr+2), 2, 2); // // D36 (a) UTF-8 is the Unicode Transformation Format that serializes // a Unicode code point as a sequence of one to four bytes, // as specified in Table 3.1, UTF-8 Bit Distribution. // (b) An illegal UTF-8 code unit sequence is any byte sequence that // does not match the patterns listed in Table 3.1B, Legal UTF-8 // Byte Sequences. // (c) An irregular UTF-8 code unit sequence is a six-byte sequence // where the first three bytes correspond to a high surrogate, // and the next three bytes correspond to a low surrogate. // As a consequence of C12, these irregular UTF-8 sequences shall // not be generated by a conformant process. // //irregular three bytes sequence // that is zzzzyy matches leading surrogate tag 110110 or // trailing surrogate tag 110111 // *srcPtr=1110 1101 // *(srcPtr+1)=1010 yyyy or // *(srcPtr+1)=1011 yyyy
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?