xmlreader.cpp

来自「IBM的解析xml的工具Xerces的源代码」· C++ 代码 · 共 1,501 行 · 第 1/4 页

CPP
1,501
字号
        }    #elif defined(ENDIANMODE_BIG)        if ((fEncoding == XMLRecognizer::UTF_16L)        ||  (fEncoding == XMLRecognizer::UCS_4L))        {            fSwapped = true;        }    #endif}////  This is called from the constructor when the encoding is not forced.//  We assume that the encoding has been auto-sensed at this point and that//  fSwapped is set correctly.////  In the case of UCS-4 and EBCDIC, we don't have to check for a decl.//  The fact that we got here, means that there is one, because that's the//  only way we can autosense those.//void XMLReader::doInitDecode(){    switch(fEncoding)    {        case XMLRecognizer::UCS_4B :        case XMLRecognizer::UCS_4L :        {            // Remove bom if any            if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) ||                ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00))  )            {                for (unsigned int i = 0; i < fRawBytesAvail; i++)                    fRawByteBuf[i] = fRawByteBuf[i+4];                fRawBytesAvail -=4;            }            // Look at the raw buffer as UCS4 chars            const UCS4Ch* asUCS = (const UCS4Ch*)fRawByteBuf;            while (fRawBufIndex < fRawBytesAvail)            {                // Get out the current 4 byte value and inc our raw buf index                UCS4Ch curVal = *asUCS++;                fRawBufIndex += sizeof(UCS4Ch);                // Swap if that is required for this machine                if (fSwapped)                    curVal = BitOps::swapBytes(curVal);                // Make sure its at least semi legal. If not, undo and throw                if (curVal > 0xFFFF)                {                    fCharsAvail = 0;                    fRawBufIndex = 0;                    fMemoryManager->deallocate(fPublicId);                    fMemoryManager->deallocate(fEncodingStr);                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);                    ThrowXMLwithMemMgr1                    (                        TranscodingException                        , XMLExcepts::Reader_CouldNotDecodeFirstLine                        , fSystemId                        , fMemoryManager                    );                }                // Convert the value to an XML char and store it                fCharSizeBuf[fCharsAvail] = 4;                fCharBuf[fCharsAvail++] = XMLCh(curVal);                // Break out on the > character                if (curVal == chCloseAngle)                    break;            }            break;        }        case XMLRecognizer::UTF_8 :        {            // If there's a utf-8 BOM  (0xEF 0xBB 0xBF), skip past it.            //   Don't move to char buf - no one wants to see it.            //   Note: this causes any encoding= declaration to override            //         the BOM's attempt to say that the encoding is utf-8.            // Look at the raw buffer as short chars            const char* asChars = (const char*)fRawByteBuf;            if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen &&                XMLString::compareNString(  asChars                                            , XMLRecognizer::fgUTF8BOM                                            , XMLRecognizer::fgUTF8BOMLen) == 0)            {                fRawBufIndex += XMLRecognizer::fgUTF8BOMLen;                asChars      += XMLRecognizer::fgUTF8BOMLen;            }            //            //  First check that there are enough bytes to even see the            //  decl indentifier. If not, get out now with no action since            //  there is no decl.            //            if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen)                break;            // Check for the opening sequence. If not, then no decl            if (XMLString::compareNString(  asChars                                            , XMLRecognizer::fgASCIIPre                                            , XMLRecognizer::fgASCIIPreLen))            {                break;            }            while (fRawBufIndex < fRawBytesAvail)            {                const char curCh = *asChars++;                fRawBufIndex++;                // Looks ok, so store it                fCharSizeBuf[fCharsAvail] = 1;                fCharBuf[fCharsAvail++] = XMLCh(curCh);                // Break out on a > character                if (curCh == chCloseAngle)                    break;                //                //  A char greater than 0x7F is not allowed in this case. If                //  so, undo and throw.                //                if (curCh & 0x80)                {                    fCharsAvail = 0;                    fRawBufIndex = 0;                    fMemoryManager->deallocate(fPublicId);                    fMemoryManager->deallocate(fEncodingStr);                    ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager);                    ThrowXMLwithMemMgr1                    (                        TranscodingException                        , XMLExcepts::Reader_CouldNotDecodeFirstLine                        , fSystemId                        , fMemoryManager                    );                }            }            break;        }        case XMLRecognizer::UTF_16B :        case XMLRecognizer::UTF_16L :        {            //            //  If there is a decl here, we just truncate back the characters            //  as we go. No surrogate creation would be allowed here in legal            //  XML, so we consider it a transoding error if we find one.            //            if (fRawBytesAvail < 2)                break;            unsigned int postBOMIndex = 0;            const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex];            if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker))            {                fRawBufIndex += sizeof(UTF16Ch);                asUTF16++;                postBOMIndex = fRawBufIndex;            }            //  First check that there are enough raw bytes for there to even            //  be a decl indentifier. If not, then nothing to do.            //            if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen)            {                fRawBufIndex = postBOMIndex;                break;            }            //            //  See we get a match on the prefix. If not, then reset and            //  break out.            //            if (fEncoding == XMLRecognizer::UTF_16B)            {                if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen))                {                    fRawBufIndex = postBOMIndex;                    break;                }            }             else            {                if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen))                {                    fRawBufIndex = postBOMIndex;                    break;                }            }            while (fRawBufIndex < fRawBytesAvail)            {                // Get out the current 2 byte value                UTF16Ch curVal = *asUTF16++;                fRawBufIndex += sizeof(UTF16Ch);                // Swap if that is required for this machine                if (fSwapped)                    curVal = BitOps::swapBytes(curVal);                //                //  Store it and bump the target index, implicitly converting                //  if UTF16Ch and XMLCh are not the same size.                //                fCharSizeBuf[fCharsAvail] = 2;                fCharBuf[fCharsAvail++] = curVal;                // Break out on a > char                if (curVal == chCloseAngle)                    break;            }            break;        }        case XMLRecognizer::EBCDIC :        {            //            //  We use special support in the intrinsic EBCDIC-US transcoder            //  to go through one char at a time.            //            const XMLByte* srcPtr = fRawByteBuf;            while (1)            {                // Transcode one char from the source                const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++);                fRawBufIndex++;                //                //  And put it into the character buffer. This stuff has to                //  look like it was normally transcoded.                //                fCharSizeBuf[fCharsAvail] = 1;                fCharBuf[fCharsAvail++] = chCur;                // If its a > char, then break out                if (chCur == chCloseAngle)                    break;                // Watch for using up all input and get out                if (fRawBufIndex == fRawBytesAvail)                    break;            }            break;        }        default :            // It should never be anything else here            fMemoryManager->deallocate(fPublicId);            fMemoryManager->deallocate(fEncodingStr);                                fMemoryManager->deallocate(fSystemId);            ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager);            break;    }    //    //  Ok, by the time we get here, if its a legal XML file we have eaten    //  the XML/TextDecl. So, if we are a PE and are being referenced from    //  outside a literal, then we need to throw in an arbitrary space that    //  is required by XML.    //    if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral))        fCharBuf[fCharsAvail++] = chSpace;        //  Calculate fCharOfsBuf buffer using the elements from fCharBufSize    if (fCalculateSrcOfs)    {        fCharOfsBuf[0] = 0;        for (unsigned int index = 1; index < fCharsAvail; ++index) {            fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1];        }    }}////  This method is called internally when we run out of bytes in the raw//  buffer. We just read as many bytes as we can into the raw buffer again//  and store the number of bytes we got.//void XMLReader::refreshRawBuffer(){    //    //  If there are any bytes left, move them down to the start. There    //  should only ever be (max bytes per char - 1) at the most.    //    const unsigned int bytesLeft = fRawBytesAvail - fRawBufIndex;    // Move the existing ones down    for (unsigned int index = 0; index < bytesLeft; index++)        fRawByteBuf[index] = fRawByteBuf[fRawBufIndex + index];    //    //  And then read into the buffer past the existing bytes. Add back in    //  that many to the bytes read, and subtract that many from the bytes    //  requested.    //    fRawBytesAvail = fStream->readBytes    (        &fRawByteBuf[bytesLeft], kRawBufSize - bytesLeft    ) + bytesLeft;    //    //  We need to reset the buffer index back to the start in all cases,    //  since any trailing data was copied down to the start.    //    fRawBufIndex = 0;}////  This method is called internally when we run out of characters in the//  trancoded character buffer. We transcode up to another maxChars chars//  from the//unsigned intXMLReader::xcodeMoreChars(          XMLCh* const            bufToFill                            ,       unsigned char* const    charSizes                            , const unsigned int            maxChars){    // If we are plain tuckered out, then return zero now    if (!fRawBytesAvail)        return 0;    //    //  If our raw buffer is low, then lets load up another batch of    //  raw bytes now.  We can't check for exactly zero bytes left because    //  transcoding of multi-byte encodings may have left a few bytes    //  representing a partial character in the buffer that can't be    //  used until the next buffer (and the rest of the character)    //  is read.    //    unsigned int bytesLeft = fRawBytesAvail - fRawBufIndex;    if (bytesLeft < 100)    {        refreshRawBuffer();        // If we didn't get anything more just return a zero now        if (!fRawBytesAvail)            return 0;    }    // Ask the transcoder to internalize another batch of chars    unsigned int bytesEaten;    const unsigned int charsDone = fTranscoder->transcodeFrom    (        &fRawByteBuf[fRawBufIndex]        , fRawBytesAvail - fRawBufIndex        , bufToFill        , maxChars        , bytesEaten        , charSizes    );    // Update the raw buffer index    fRawBufIndex += bytesEaten;    return charsDone;}XERCES_CPP_NAMESPACE_END

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?