📄 txml_parser.cpp
字号:
const char* q = p;
if ( ignoreCase )
{
while ( *q && *tag && to_lower( *q, encoding_ ) == to_lower( *tag, encoding_ ) )
{
++q;
++tag;
}
if ( *tag == 0 )
return true;
}
else
{
while ( *q && *tag && *q == *tag )
{
++q;
++tag;
}
if ( *tag == 0 ) // Have we found the end of the tag, and everything equal?
return true;
}
return false;
}
const char* base::read_text( const char* p,
TIXML_STRING * text_,
bool trimWhiteSpace,
const char* endTag,
bool caseInsensitive,
encoding encoding_ )
{
*text_ = "";
if ( !trimWhiteSpace // certain tags always keep whitespace_
|| !condenseWhiteSpace ) // if true, whitespace_ is always kept
{
// Keep all the white_ space.
while ( p && *p
&& !string_equal( p, endTag, caseInsensitive, encoding_ )
)
{
int len;
char cArr[4] = { 0, 0, 0, 0 };
p = get_char( p, cArr, &len, encoding_ );
text_->append( cArr, len );
}
}
else
{
bool whitespace_ = false;
// remove leading white_ space:
p = skip_white_space( p, encoding_ );
while ( p && *p
&& !string_equal( p, endTag, caseInsensitive, encoding_ ) )
{
if ( *p == '\r' || *p == '\n' )
{
whitespace_ = true;
++p;
}
else if ( is_white_space( *p ) )
{
whitespace_ = true;
++p;
}
else
{
// If we've found whitespace_, add_ it_ before the
// new character. Any whitespace_ just becomes a space.
if ( whitespace_ )
{
(*text_) += ' ';
whitespace_ = false;
}
int len;
char cArr[4] = { 0, 0, 0, 0 };
p = get_char( p, cArr, &len, encoding_ );
if ( len == 1 )
(*text_) += cArr[0]; // more efficient
else
text_->append( cArr, len );
}
}
}
if ( p )
p += strlen( endTag );
return p;
}
#ifdef TIXML_USE_STL
// Pulls a document's worth of raw characters from `in` into `*tag`.
//
// A document does not know in advance what it is about to read, so each
// round accumulates what is presumed to be a tag (without consuming its
// closing '>'), asks identify() what kind of node that is, and lets a
// temporary node of that kind stream the rest of itself.  Streaming stops
// successfully once an element (the root) has been streamed; the actual
// parsing is left to the caller.
void document::stream_in( std::istream * in, TIXML_STRING * tag )
{
    // Nothing that looks like the start of a tag: report and give up.
    if ( !stream_to( in, '<', tag ) )
    {
        set_error( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
        return;
    }

    while ( in->good() )
    {
        // Offset in *tag at which the node being read now begins.
        const int nodeStart = static_cast<int>( tag->length() );

        // Accumulate up to, but not including, the closing '>' so the
        // sub-node can orient itself.
        while ( in->good() && in->peek() != '>' )
        {
            const int ch = in->get();
            if ( ch <= 0 )
            {
                set_error( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
                break;
            }
            (*tag) += static_cast<char>( ch );
        }

        // Stream went bad while reading: fall through to the error below.
        if ( !in->good() )
            break;

        // We now hold something presumed to be a node; find out which kind.
        node* streamed = identify( tag->c_str() + nodeStart, TIXML_DEFAULT_ENCODING );
        if ( !streamed )
        {
            set_error( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
            return;
        }

        streamed->stream_in( in, tag );
        const bool wasElement = ( streamed->to_element() != 0 );
        delete streamed;

        // The root element has been read, so streaming is complete.
        if ( wasElement )
            return;
    }

    // We should have returned sooner.
    set_error( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
}
#endif
const char* document::parse( const char* p, parsing_data* prevData, encoding encoding_ )
{
clear_error();
// parse away, at the document_ level. Since a document_
// contains nothing but other tags, most of what happens
// here is skipping white_ space.
if ( !p || !*p )
{
set_error( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
return 0;
}
// Note that, for a document_, this needs to come
// before the while space skip, so that parsing_
// starts from the pointer we are_ given.
location_.clear();
if ( prevData )
{
location_.row_ = prevData->cursor_.row_;
location_.col = prevData->cursor_.col;
}
else
{
location_.row_ = 0;
location_.col = 0;
}
parsing_data data( p, tab_size(), location_.row_, location_.col );
location_ = data.get_cursor();
if ( encoding_ == TIXML_ENCODING_UNKNOWN )
{
// Check for the Microsoft UTF-8 lead bytes.
const unsigned char* pU = (const unsigned char*)p;
if ( *(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
&& *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
&& *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
{
encoding_ = TIXML_ENCODING_UTF8;
useMicrosoftBOM = true;
}
}
p = skip_white_space( p, encoding_ );
if ( !p )
{
set_error( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
return 0;
}
while ( p && *p )
{
node* node_ = identify( p, encoding_ );
if ( node_ )
{
p = node_->parse( p, &data, encoding_ );
link_end_child( node_ );
}
else
{
break;
}
// Did we get encoding_ info?
if ( encoding_ == TIXML_ENCODING_UNKNOWN
&& node_->to_declaration() )
{
declaration* dec = node_->to_declaration();
const char* enc = dec->get_encoding();
assert( enc );
if ( *enc == 0 )
encoding_ = TIXML_ENCODING_UTF8;
else if ( string_equal( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
encoding_ = TIXML_ENCODING_UTF8;
else if ( string_equal( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
encoding_ = TIXML_ENCODING_UTF8; // incorrect, but be nice
else
encoding_ = TIXML_ENCODING_LEGACY;
}
p = skip_white_space( p, encoding_ );
}
// Was this empty?
if ( !firstChild ) {
set_error( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding_ );
return 0;
}
// All is well.
return p;
}
// Records an error on the document.
//
//   err       - error id; asserted to lie strictly between 0 and
//               TIXML_ERROR_STRING_COUNT, and used to look up the description.
//   pError    - position in the input where the error was detected; may be 0.
//   data      - parsing state used to turn `pError` into a cursor; may be 0.
//   encoding_ - forwarded to data->stamp().
//
// Only the first error of a chain is kept: once an error is set, later calls
// are ignored.
void document::set_error( int err, const char* pError, parsing_data* data, encoding encoding_ )
{
    if ( !error_ )
    {
        assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );

        error_    = true;
        errorId   = err;
        errorDesc = errorString[ errorId ];

        // Start from a cleared location; fill it in only when both the
        // position and the parsing state are available.
        errorLocation.clear();
        if ( pError && data )
        {
            data->stamp( pError, encoding_ );
            errorLocation = data->get_cursor();
        }
    }
}
// Inspects the markup at `p` and creates an empty node of the matching kind.
//
// Recognised forms, tested in this order:
//   "<?xml"      (case-insensitive) -> declaration
//   "<!--"                          -> comment
//   "<![CDATA["                     -> text flagged as CDATA
//   "<!"                            -> unknown
//   '<' followed by a letter or '_' -> element
//   anything else starting with '<' -> unknown
//
// The new node's parent_ is set to this node so that it can report errors.
// Returns 0 when, after skipping white space, `p` is null or does not point
// at '<'.
node* node::identify( const char* p, encoding encoding_ )
{
    p = skip_white_space( p, encoding_ );
    if ( !p || *p != '<' )
        return 0;

    document* owner = get_document();

    p = skip_white_space( p, encoding_ );
    if ( !p || !*p )
        return 0;

    node* created = 0;

    if ( string_equal( p, "<?xml", true, encoding_ ) )
    {
        #ifdef DEBUG_PARSER
            TIXML_LOG( "XML parsing Declaration\n" );
        #endif
        created = new declaration();
    }
    else if ( string_equal( p, "<!--", false, encoding_ ) )
    {
        #ifdef DEBUG_PARSER
            TIXML_LOG( "XML parsing Comment\n" );
        #endif
        created = new comment();
    }
    else if ( string_equal( p, "<![CDATA[", false, encoding_ ) )
    {
        #ifdef DEBUG_PARSER
            TIXML_LOG( "XML parsing CDATA\n" );
        #endif
        text* cdataNode = new text( "" );
        cdataNode->set_c_d_a_t_a( true );
        created = cdataNode;
    }
    else if ( string_equal( p, "<!", false, encoding_ ) )
    {
        // Must be tested after the comment and CDATA headers, which both
        // begin with "<!".
        #ifdef DEBUG_PARSER
            TIXML_LOG( "XML parsing Unknown(1)\n" );
        #endif
        created = new unknown();
    }
    else if ( is_alpha( p[1], encoding_ ) || p[1] == '_' )
    {
        #ifdef DEBUG_PARSER
            TIXML_LOG( "XML parsing Element\n" );
        #endif
        created = new element( "" );
    }
    else
    {
        #ifdef DEBUG_PARSER
            TIXML_LOG( "XML parsing Unknown(2)\n" );
        #endif
        created = new unknown();
    }

    if ( !created )
    {
        // Allocation produced no node: report it on the document, if any.
        if ( owner )
            owner->set_error( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
        return created;
    }

    created->parent_ = this;
    return created;
}
#ifdef TIXML_USE_STL
// Streams the remainder of this element from `in`, appending every character
// read to `*tag`.
//
// On entry part of the element's opening tag is already in `*tag`.  The rest
// of the opening tag is read first; unless that tag ends in "/>", the function
// then loops over the element's content: white space, text, child nodes, and
// finally a closing tag.  Child nodes are created only temporarily, to pick
// the right stream_in() implementation, and are deleted straight afterwards.
//
// A character value <= 0 from the stream is reported to the owning document
// (when there is one) as TIXML_ERROR_EMBEDDED_NULL and ends streaming.  The
// function also returns silently whenever the stream stops being good().
void element::stream_in (std::istream * in, TIXML_STRING * tag)
{
// We're called with some amount of pre-parsing done. That is, some of "this"
// element is already in "tag". Go ahead and stream to the closing ">".
while( in->good() )
{
int c = in->get();
if ( c <= 0 )
{
document* document_ = get_document();
if ( document_ )
document_->set_error( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
return;
}
(*tag) += (char) c ;
if ( c == '>' )
break;
}
// Too little accumulated to be a usable tag; this also keeps the
// at( length - 2 ) look-ups below in range.
if ( tag->length() < 3 ) return;
// Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
// If not, identify and stream.
if ( tag->at( tag->length() - 1 ) == '>'
&& tag->at( tag->length() - 2 ) == '/' )
{
// All good!
return;
}
else if ( tag->at( tag->length() - 1 ) == '>' )
{
// There is more. Could be:
// text
// cdata text (which looks like another node)
// closing tag
// another node.
for ( ;; )
{
stream_white_space( in, tag );
// Do we have text?
if ( in->good() && in->peek() != '<' )
{
// Yep, text. A temporary text object streams it into "tag".
text text_( "" );
text_.stream_in( in, tag );
// What follows text is a closing tag or another node.
// Go around again and figure it out.
continue;
}
// We now have either a closing tag...or another node.
// We should be at a "<", regardless.
if ( !in->good() ) return;
assert( in->peek() == '<' );
// Offset in "tag" where the upcoming tag starts; used later to identify it.
int tagIndex = (int) tag->length();
bool closingTag = false;
bool firstCharFound = false;
// Read up to, but not including, the next '>'. The character is peeked
// first so that the '>' stays in the stream for whoever handles the tag.
for( ;; )
{
if ( !in->good() )
return;
int c = in->peek();
if ( c <= 0 )
{
document* document_ = get_document();
if ( document_ )
document_->set_error( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
return;
}
if ( c == '>' )
break;
*tag += (char) c;
in->get();
// Early out if we find the CDATA id: the last 9 characters of "tag"
// are compared against "<![CDATA[".
if ( c == '[' && tag->size() >= 9 )
{
size_t len = tag->size();
const char* start_ = tag->c_str() + len - 9;
if ( strcmp( start_, "<![CDATA[" ) == 0 ) {
assert( !closingTag );
break;
}
}
// The first character that is neither '<' nor white space decides
// whether this is a closing tag ('/') or the start of a child node.
if ( !firstCharFound && c != '<' && !is_white_space( c ) )
{
firstCharFound = true;
if ( c == '/' )
closingTag = true;
}
}
// If it was a closing tag, then read in the closing '>' to clean up the input stream.
// If it was not, the streaming will be done by the tag.
if ( closingTag )
{
if ( !in->good() )
return;
int c = in->get();
if ( c <= 0 )
{
document* document_ = get_document();
if ( document_ )
document_->set_error( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
return;
}
assert( c == '>' );
*tag += (char) c;
// We are done, once we've found our closing tag.
return;
}
else
{
// If not a closing tag, identify it, and let a temporary node stream itself.
const char* tagloc = tag->c_str() + tagIndex;
node* node_ = identify( tagloc, TIXML_DEFAULT_ENCODING );
if ( !node_ )
return;
node_->stream_in( in, tag );
delete node_;
node_ = 0;
// No return: go around from the beginning: text, closing tag, or node.
}
}
}
}
#endif
const char* element::parse( const char* p, parsing_data* data, encoding encoding_ )
{
p = skip_white_space( p, encoding_ );
document* document_ = get_document();
if ( !p || !*p )
{
if ( document_ ) document_->set_error( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding_ );
return 0;
}
if ( data )
{
data->stamp( p, encoding_ );
location_ = data->get_cursor();
}
if ( *p != '<' )
{
if ( document_ ) document_->set_error( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding_ );
return 0;
}
p = skip_white_space( p+1, encoding_ );
// read the name_.
const char* pErr = p;
p = read_name( p, &value_, encoding_ );
if ( !p || !*p )
{
if ( document_ ) document_->set_error( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding_ );
return 0;
}
TIXML_STRING endTag ("</");
endTag += value_;
endTag += ">";
// Check for and read_ attributes_. Also look_ for an empty
// tag or an end tag.
while ( p && *p )
{
pErr = p;
p = skip_white_space( p, encoding_ );
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -