xmlreader.cpp
来自「IBM的解析xml的工具Xerces的源代码」· C++ 代码 · 共 1,501 行 · 第 1/4 页
CPP
1,501 行
} #elif defined(ENDIANMODE_BIG) if ((fEncoding == XMLRecognizer::UTF_16L) || (fEncoding == XMLRecognizer::UCS_4L)) { fSwapped = true; } #endif}//// This is called from the constructor when the encoding is not forced.// We assume that the encoding has been auto-sensed at this point and that// fSwapped is set correctly.//// In the case of UCS-4 and EBCDIC, we don't have to check for a decl.// The fact that we got here, means that there is one, because that's the// only way we can autosense those.//void XMLReader::doInitDecode(){ switch(fEncoding) { case XMLRecognizer::UCS_4B : case XMLRecognizer::UCS_4L : { // Remove bom if any if (((fRawByteBuf[0] == 0x00) && (fRawByteBuf[1] == 0x00) && (fRawByteBuf[2] == 0xFE) && (fRawByteBuf[3] == 0xFF)) || ((fRawByteBuf[0] == 0xFF) && (fRawByteBuf[1] == 0xFE) && (fRawByteBuf[2] == 0x00) && (fRawByteBuf[3] == 0x00)) ) { for (unsigned int i = 0; i < fRawBytesAvail; i++) fRawByteBuf[i] = fRawByteBuf[i+4]; fRawBytesAvail -=4; } // Look at the raw buffer as UCS4 chars const UCS4Ch* asUCS = (const UCS4Ch*)fRawByteBuf; while (fRawBufIndex < fRawBytesAvail) { // Get out the current 4 byte value and inc our raw buf index UCS4Ch curVal = *asUCS++; fRawBufIndex += sizeof(UCS4Ch); // Swap if that is required for this machine if (fSwapped) curVal = BitOps::swapBytes(curVal); // Make sure its at least semi legal. If not, undo and throw if (curVal > 0xFFFF) { fCharsAvail = 0; fRawBufIndex = 0; fMemoryManager->deallocate(fPublicId); fMemoryManager->deallocate(fEncodingStr); ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager); ThrowXMLwithMemMgr1 ( TranscodingException , XMLExcepts::Reader_CouldNotDecodeFirstLine , fSystemId , fMemoryManager ); } // Convert the value to an XML char and store it fCharSizeBuf[fCharsAvail] = 4; fCharBuf[fCharsAvail++] = XMLCh(curVal); // Break out on the > character if (curVal == chCloseAngle) break; } break; } case XMLRecognizer::UTF_8 : { // If there's a utf-8 BOM (0xEF 0xBB 0xBF), skip past it. // Don't move to char buf - no one wants to see it. // Note: this causes any encoding= declaration to override // the BOM's attempt to say that the encoding is utf-8. // Look at the raw buffer as short chars const char* asChars = (const char*)fRawByteBuf; if (fRawBytesAvail > XMLRecognizer::fgUTF8BOMLen && XMLString::compareNString( asChars , XMLRecognizer::fgUTF8BOM , XMLRecognizer::fgUTF8BOMLen) == 0) { fRawBufIndex += XMLRecognizer::fgUTF8BOMLen; asChars += XMLRecognizer::fgUTF8BOMLen; } // // First check that there are enough bytes to even see the // decl indentifier. If not, get out now with no action since // there is no decl. // if (fRawBytesAvail < XMLRecognizer::fgASCIIPreLen) break; // Check for the opening sequence. If not, then no decl if (XMLString::compareNString( asChars , XMLRecognizer::fgASCIIPre , XMLRecognizer::fgASCIIPreLen)) { break; } while (fRawBufIndex < fRawBytesAvail) { const char curCh = *asChars++; fRawBufIndex++; // Looks ok, so store it fCharSizeBuf[fCharsAvail] = 1; fCharBuf[fCharsAvail++] = XMLCh(curCh); // Break out on a > character if (curCh == chCloseAngle) break; // // A char greater than 0x7F is not allowed in this case. If // so, undo and throw. // if (curCh & 0x80) { fCharsAvail = 0; fRawBufIndex = 0; fMemoryManager->deallocate(fPublicId); fMemoryManager->deallocate(fEncodingStr); ArrayJanitor<XMLCh> janValue(fSystemId, fMemoryManager); ThrowXMLwithMemMgr1 ( TranscodingException , XMLExcepts::Reader_CouldNotDecodeFirstLine , fSystemId , fMemoryManager ); } } break; } case XMLRecognizer::UTF_16B : case XMLRecognizer::UTF_16L : { // // If there is a decl here, we just truncate back the characters // as we go. No surrogate creation would be allowed here in legal // XML, so we consider it a transoding error if we find one. // if (fRawBytesAvail < 2) break; unsigned int postBOMIndex = 0; const UTF16Ch* asUTF16 = (const UTF16Ch*)&fRawByteBuf[fRawBufIndex]; if ((*asUTF16 == chUnicodeMarker) || (*asUTF16 == chSwappedUnicodeMarker)) { fRawBufIndex += sizeof(UTF16Ch); asUTF16++; postBOMIndex = fRawBufIndex; } // First check that there are enough raw bytes for there to even // be a decl indentifier. If not, then nothing to do. // if (fRawBytesAvail - fRawBufIndex < XMLRecognizer::fgUTF16PreLen) { fRawBufIndex = postBOMIndex; break; } // // See we get a match on the prefix. If not, then reset and // break out. // if (fEncoding == XMLRecognizer::UTF_16B) { if (memcmp(asUTF16, XMLRecognizer::fgUTF16BPre, XMLRecognizer::fgUTF16PreLen)) { fRawBufIndex = postBOMIndex; break; } } else { if (memcmp(asUTF16, XMLRecognizer::fgUTF16LPre, XMLRecognizer::fgUTF16PreLen)) { fRawBufIndex = postBOMIndex; break; } } while (fRawBufIndex < fRawBytesAvail) { // Get out the current 2 byte value UTF16Ch curVal = *asUTF16++; fRawBufIndex += sizeof(UTF16Ch); // Swap if that is required for this machine if (fSwapped) curVal = BitOps::swapBytes(curVal); // // Store it and bump the target index, implicitly converting // if UTF16Ch and XMLCh are not the same size. // fCharSizeBuf[fCharsAvail] = 2; fCharBuf[fCharsAvail++] = curVal; // Break out on a > char if (curVal == chCloseAngle) break; } break; } case XMLRecognizer::EBCDIC : { // // We use special support in the intrinsic EBCDIC-US transcoder // to go through one char at a time. // const XMLByte* srcPtr = fRawByteBuf; while (1) { // Transcode one char from the source const XMLCh chCur = XMLEBCDICTranscoder::xlatThisOne(*srcPtr++); fRawBufIndex++; // // And put it into the character buffer. This stuff has to // look like it was normally transcoded. // fCharSizeBuf[fCharsAvail] = 1; fCharBuf[fCharsAvail++] = chCur; // If its a > char, then break out if (chCur == chCloseAngle) break; // Watch for using up all input and get out if (fRawBufIndex == fRawBytesAvail) break; } break; } default : // It should never be anything else here fMemoryManager->deallocate(fPublicId); fMemoryManager->deallocate(fEncodingStr); fMemoryManager->deallocate(fSystemId); ThrowXMLwithMemMgr(TranscodingException, XMLExcepts::Reader_BadAutoEncoding, fMemoryManager); break; } // // Ok, by the time we get here, if its a legal XML file we have eaten // the XML/TextDecl. So, if we are a PE and are being referenced from // outside a literal, then we need to throw in an arbitrary space that // is required by XML. // if ((fType == Type_PE) && (fRefFrom == RefFrom_NonLiteral)) fCharBuf[fCharsAvail++] = chSpace; // Calculate fCharOfsBuf buffer using the elements from fCharBufSize if (fCalculateSrcOfs) { fCharOfsBuf[0] = 0; for (unsigned int index = 1; index < fCharsAvail; ++index) { fCharOfsBuf[index] = fCharOfsBuf[index-1]+fCharSizeBuf[index-1]; } }}//// This method is called internally when we run out of bytes in the raw// buffer. We just read as many bytes as we can into the raw buffer again// and store the number of bytes we got.//void XMLReader::refreshRawBuffer(){ // // If there are any bytes left, move them down to the start. There // should only ever be (max bytes per char - 1) at the most. // const unsigned int bytesLeft = fRawBytesAvail - fRawBufIndex; // Move the existing ones down for (unsigned int index = 0; index < bytesLeft; index++) fRawByteBuf[index] = fRawByteBuf[fRawBufIndex + index]; // // And then read into the buffer past the existing bytes. Add back in // that many to the bytes read, and subtract that many from the bytes // requested. // fRawBytesAvail = fStream->readBytes ( &fRawByteBuf[bytesLeft], kRawBufSize - bytesLeft ) + bytesLeft; // // We need to reset the buffer index back to the start in all cases, // since any trailing data was copied down to the start. // fRawBufIndex = 0;}//// This method is called internally when we run out of characters in the// trancoded character buffer. We transcode up to another maxChars chars// from the//unsigned intXMLReader::xcodeMoreChars( XMLCh* const bufToFill , unsigned char* const charSizes , const unsigned int maxChars){ // If we are plain tuckered out, then return zero now if (!fRawBytesAvail) return 0; // // If our raw buffer is low, then lets load up another batch of // raw bytes now. We can't check for exactly zero bytes left because // transcoding of multi-byte encodings may have left a few bytes // representing a partial character in the buffer that can't be // used until the next buffer (and the rest of the character) // is read. // unsigned int bytesLeft = fRawBytesAvail - fRawBufIndex; if (bytesLeft < 100) { refreshRawBuffer(); // If we didn't get anything more just return a zero now if (!fRawBytesAvail) return 0; } // Ask the transcoder to internalize another batch of chars unsigned int bytesEaten; const unsigned int charsDone = fTranscoder->transcodeFrom ( &fRawByteBuf[fRawBufIndex] , fRawBytesAvail - fRawBufIndex , bufToFill , maxChars , bytesEaten , charSizes ); // Update the raw buffer index fRawBufIndex += bytesEaten; return charsDone;}XERCES_CPP_NAMESPACE_END
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?