📄 txml_parser.cpp

📁 j2me is based on j2mepolish, client & server for mobile application.
💻 CPP
📖 第 1 页 / 共 3 页
字号:

	const char* q = p;

	if ( ignoreCase )
	{
		while ( *q && *tag && to_lower( *q, encoding_ ) == to_lower( *tag, encoding_ ) )
		{
			++q;
			++tag;
		}

		if ( *tag == 0 )
			return true;
	}
	else
	{
		while ( *q && *tag && *q == *tag )
		{
			++q;
			++tag;
		}

		if ( *tag == 0 )		// Have we found the end of the tag, and everything equal?
			return true;
	}
	return false;
}

const char* base::read_text(	const char* p, 
									TIXML_STRING * text_, 
									bool trimWhiteSpace, 
									const char* endTag, 
									bool caseInsensitive,
									encoding encoding_ )
{
    *text_ = "";
	if (    !trimWhiteSpace			// certain tags always keep whitespace_
		 || !condenseWhiteSpace )	// if true, whitespace_ is always kept
	{
		// Keep all the white_ space.
		while (	   p && *p
				&& !string_equal( p, endTag, caseInsensitive, encoding_ )
			  )
		{
			int len;
			char cArr[4] = { 0, 0, 0, 0 };
			p = get_char( p, cArr, &len, encoding_ );
			text_->append( cArr, len );
		}
	}
	else
	{
		bool whitespace_ = false;

		// remove leading white_ space:
		p = skip_white_space( p, encoding_ );
		while (	   p && *p
				&& !string_equal( p, endTag, caseInsensitive, encoding_ ) )
		{
			if ( *p == '\r' || *p == '\n' )
			{
				whitespace_ = true;
				++p;
			}
			else if ( is_white_space( *p ) )
			{
				whitespace_ = true;
				++p;
			}
			else
			{
				// If we've found whitespace_, add_ it_ before the
				// new character. Any whitespace_ just becomes a space.
				if ( whitespace_ )
				{
					(*text_) += ' ';
					whitespace_ = false;
				}
				int len;
				char cArr[4] = { 0, 0, 0, 0 };
				p = get_char( p, cArr, &len, encoding_ );
				if ( len == 1 )
					(*text_) += cArr[0];	// more efficient
				else
					text_->append( cArr, len );
			}
		}
	}
	if ( p ) 
		p += strlen( endTag );
	return p;
}

#ifdef TIXML_USE_STL

void document::stream_in( std::istream * in, TIXML_STRING * tag )
{
	// The basic_ issue with a document_ is that we don't know what we're
	// streaming_. read something presumed to be a tag (and hope), then
	// identify_ it_, and call the appropriate stream_ method on the tag.
	//
	// this "pre-streaming" will never read_ the closing ">" so the
	// sub-tag can orient itself.

	if ( !stream_to( in, '<', tag ) ) 
	{
		set_error( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
		return;
	}

	while ( in->good() )
	{
		int tagIndex = (int) tag->length();
		while ( in->good() && in->peek() != '>' )
		{
			int c = in->get();
			if ( c <= 0 )
			{
				set_error( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
				break;
			}
			(*tag) += (char) c;
		}

		if ( in->good() )
		{
			// We now have something we presume to be a node_ of 
			// some sort. identify it_, and call the node_ to
			// continue streaming_.
			node* node_ = identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );

			if ( node_ )
			{
				node_->stream_in( in, tag );
				bool isElement = node_->to_element() != 0;
				delete node_;
				node_ = 0;

				// If this is the root_ element_, we're done. parsing will be
				// done by the >> operator.
				if ( isElement )
				{
					return;
				}
			}
			else
			{
				set_error( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
				return;
			}
		}
	}
	// We should have returned sooner.
	set_error( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
}

#endif

const char* document::parse( const char* p, parsing_data* prevData, encoding encoding_ )
{
	clear_error();

	// parse away, at the document_ level. Since a document_
	// contains nothing but other tags, most of what happens
	// here is skipping white_ space.
	if ( !p || !*p )
	{
		set_error( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
		return 0;
	}

	// Note that, for a document_, this needs to come
	// before the while space skip, so that parsing_
	// starts from the pointer we are_ given.
	location_.clear();
	if ( prevData )
	{
		location_.row_ = prevData->cursor_.row_;
		location_.col = prevData->cursor_.col;
	}
	else
	{
		location_.row_ = 0;
		location_.col = 0;
	}
	parsing_data data( p, tab_size(), location_.row_, location_.col );
	location_ = data.get_cursor();

	if ( encoding_ == TIXML_ENCODING_UNKNOWN )
	{
		// Check for the Microsoft UTF-8 lead bytes.
		const unsigned char* pU = (const unsigned char*)p;
		if (	*(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
			 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
			 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
		{
			encoding_ = TIXML_ENCODING_UTF8;
			useMicrosoftBOM = true;
		}
	}

    p = skip_white_space( p, encoding_ );
	if ( !p )
	{
		set_error( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
		return 0;
	}

	while ( p && *p )
	{
		node* node_ = identify( p, encoding_ );
		if ( node_ )
		{
			p = node_->parse( p, &data, encoding_ );
			link_end_child( node_ );
		}
		else
		{
			break;
		}

		// Did we get encoding_ info?
		if (    encoding_ == TIXML_ENCODING_UNKNOWN
			 && node_->to_declaration() )
		{
			declaration* dec = node_->to_declaration();
			const char* enc = dec->get_encoding();
			assert( enc );

			if ( *enc == 0 )
				encoding_ = TIXML_ENCODING_UTF8;
			else if ( string_equal( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
				encoding_ = TIXML_ENCODING_UTF8;
			else if ( string_equal( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
				encoding_ = TIXML_ENCODING_UTF8;	// incorrect, but be nice
			else 
				encoding_ = TIXML_ENCODING_LEGACY;
		}

		p = skip_white_space( p, encoding_ );
	}

	// Was this empty?
	if ( !firstChild ) {
		set_error( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding_ );
		return 0;
	}

	// All is well.
	return p;
}

// Records an error on the document.
//
//   err       - error id; must lie strictly between 0 and
//               TIXML_ERROR_STRING_COUNT (checked by assert only).
//   pError    - position in the input where the problem was noticed, or 0.
//   data      - parsing state used to turn pError into a row/column, or 0.
//   encoding_ - encoding passed on when stamping the position.
//
// Only the first error of a chain is kept, since it is the most accurate;
// later calls are ignored until the error state is cleared elsewhere.
void document::set_error( int err, const char* pError, parsing_data* data, encoding encoding_ )
{
	if ( !error_ )
	{
		assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );

		errorId = err;
		errorDesc = errorString[ errorId ];
		error_   = true;

		// The location stays cleared unless both a position and the parsing
		// state needed to resolve it are available.
		errorLocation.clear();
		if ( data && pError )
		{
			data->stamp( pError, encoding_ );
			errorLocation = data->get_cursor();
		}
	}
}


// Looks at the markup starting at p and allocates an empty node of the
// matching kind, with this node set as its parent so it can report errors.
//
// Classification, tested in this order:
//   "<?xml" (any letter case)        -> declaration
//   "<!--"                           -> comment
//   "<![CDATA["                      -> text flagged as CDATA
//   "<!"                             -> unknown
//   '<' + letter or underscore       -> element
//   anything else starting with '<'  -> unknown
//
// Returns 0 when, after leading whitespace, the input does not start with
// '<'. The caller owns the returned node.
node* node::identify( const char* p, encoding encoding_ )
{
	p = skip_white_space( p, encoding_ );
	if ( !p || *p != '<' )
	{
		return 0;
	}

	document* owner = get_document();

	// p already sits on '<' here; the second skip and check are retained
	// unchanged from the original flow.
	p = skip_white_space( p, encoding_ );
	if ( !p || !*p )
	{
		return 0;
	}

	const char* const declarationPrefix = "<?xml";
	const char* const commentPrefix = "<!--";
	const char* const bangPrefix = "<!";
	const char* const cdataPrefix = "<![CDATA[";

	node* created = 0;

	if ( string_equal( p, declarationPrefix, true, encoding_ ) )
	{
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing Declaration\n" );
		#endif
		created = new declaration();
	}
	else if ( string_equal( p, commentPrefix, false, encoding_ ) )
	{
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing Comment\n" );
		#endif
		created = new comment();
	}
	else if ( string_equal( p, cdataPrefix, false, encoding_ ) )
	{
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing CDATA\n" );
		#endif
		text* cdataNode = new text( "" );
		cdataNode->set_c_d_a_t_a( true );
		created = cdataNode;
	}
	else if ( string_equal( p, bangPrefix, false, encoding_ ) )
	{
		// Must come after the comment and CDATA tests, which share "<!".
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing Unknown(1)\n" );
		#endif
		created = new unknown();
	}
	else if ( is_alpha( p[1], encoding_ ) || p[1] == '_' )
	{
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing Element\n" );
		#endif
		created = new element( "" );
	}
	else
	{
		#ifdef DEBUG_PARSER
			TIXML_LOG( "XML parsing Unknown(2)\n" );
		#endif
		created = new unknown();
	}

	if ( !created )
	{
		// Allocation produced nothing: tell the document, if there is one.
		if ( owner )
			owner->set_error( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
		return 0;
	}

	created->parent_ = this;
	return created;
}

#ifdef TIXML_USE_STL

// Streams the raw text of this element from `in`, appending every character
// consumed to *tag.
//
// On entry *tag already contains the beginning of this element's opening
// tag. The function then:
//   1. reads through the '>' that finishes the opening tag;
//   2. returns immediately if that tag was self-closing ("/>");
//   3. otherwise loops over the content - text, CDATA, nested nodes -
//      handing each piece to the matching stream_in(), and returns once a
//      closing tag has been consumed.
// Nodes created along the way are temporary and destroyed again; the only
// output is the text accumulated in *tag. A character value <= 0 read from
// the stream is reported to the owning document (if any) as
// TIXML_ERROR_EMBEDDED_NULL and ends the function. Running out of stream at
// other points ends the function without setting an error here.
void element::stream_in (std::istream * in, TIXML_STRING * tag)
{
	// Part of this element is already in *tag; read on up to and including
	// the '>' that closes the opening tag.
	while( in->good() )
	{
		int c = in->get();
		// get() yields a negative value at end of input; 0 is an embedded NUL.
		if ( c <= 0 )
		{
			document* document_ = get_document();
			if ( document_ )
				document_->set_error( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
			return;
		}
		(*tag) += (char) c ;
		
		if ( c == '>' )
			break;
	}

	// The checks below index length()-2, so very short buffers are left alone.
	if ( tag->length() < 3 ) return;

	// If the buffer ends in "/>" the tag was self-closing and is complete.
	// Otherwise the element has content that must be identified and streamed.

	if (    tag->at( tag->length() - 1 ) == '>' 
		 && tag->at( tag->length() - 2 ) == '/' )
	{
		// Self-closing tag: nothing more to read.
		return;
	}
	else if ( tag->at( tag->length() - 1 ) == '>' )
	{
		// There is more. What follows could be:
		//		text
		//		CDATA text (which looks like another node)
		//		a closing tag
		//		another node
		for ( ;; )
		{
			stream_white_space( in, tag );

			// Anything that does not start with '<' is plain text.
			if ( in->good() && in->peek() != '<' ) 
			{
				// Stream it through a temporary text node.
				text text_( "" );
				text_.stream_in( in, tag );

				// What follows text is a closing tag or another node.
				// Go around again and figure out which.
				continue;
			}

			// We now have either a closing tag or another node.
			// Either way the next character should be '<'.
			if ( !in->good() ) return;
			assert( in->peek() == '<' );
			// Offset in *tag where the upcoming tag begins; used to identify it.
			int tagIndex = (int) tag->length();

			bool closingTag = false;
			bool firstCharFound = false;

			// Buffer the tag up to, but not including, its '>' while working
			// out whether it is a closing tag.
			for( ;; )
			{
				if ( !in->good() )
					return;

				// Peek first so that a '>' stays in the stream for whoever
				// finishes reading this tag.
				int c = in->peek();
				if ( c <= 0 )
				{
					document* document_ = get_document();
					if ( document_ )
						document_->set_error( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
					return;
				}
				
				if ( c == '>' )
					break;

				*tag += (char) c;
				in->get();

				// Early out as soon as the buffer ends with the CDATA opener
				// (9 characters long); the CDATA node streams the rest itself.
				if ( c == '[' && tag->size() >= 9 )
				{
					size_t len = tag->size();
					const char* start_ = tag->c_str() + len - 9;
					if ( strcmp( start_, "<![CDATA[" ) == 0 ) {
						assert( !closingTag );
						break;
					}
				}

				// The first character that is neither '<' nor whitespace
				// decides the matter: '/' marks a closing tag.
				if ( !firstCharFound && c != '<' && !is_white_space( c ) )
				{
					firstCharFound = true;
					if ( c == '/' )
						closingTag = true;
				}
			}
			// For a closing tag, consume its '>' here to clean up the input stream.
			// For anything else, the remaining streaming is done by that node.
			if ( closingTag )
			{
				if ( !in->good() )
					return;

				int c = in->get();
				if ( c <= 0 )
				{
					document* document_ = get_document();
					if ( document_ )
						document_->set_error( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
					return;
				}
				assert( c == '>' );
				*tag += (char) c;

				// A closing tag ends this element: we are done.
				return;
			}
			else
			{
				// Not a closing tag: identify the node from the text just
				// buffered and let a temporary instance stream itself.
				const char* tagloc = tag->c_str() + tagIndex;
				node* node_ = identify( tagloc, TIXML_DEFAULT_ENCODING );
				if ( !node_ )
					return;
				node_->stream_in( in, tag );
				delete node_;
				node_ = 0;

				// No return: go around again for more text, a closing tag, or another node.
			}
		}
	}
}
#endif

const char* element::parse( const char* p, parsing_data* data, encoding encoding_ )
{
	p = skip_white_space( p, encoding_ );
	document* document_ = get_document();

	if ( !p || !*p )
	{
		if ( document_ ) document_->set_error( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding_ );
		return 0;
	}

	if ( data )
	{
		data->stamp( p, encoding_ );
		location_ = data->get_cursor();
	}

	if ( *p != '<' )
	{
		if ( document_ ) document_->set_error( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding_ );
		return 0;
	}

	p = skip_white_space( p+1, encoding_ );

	// read the name_.
	const char* pErr = p;

    p = read_name( p, &value_, encoding_ );
	if ( !p || !*p )
	{
		if ( document_ )	document_->set_error( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding_ );
		return 0;
	}

    TIXML_STRING endTag ("</");
	endTag += value_;
	endTag += ">";

	// Check for and read_ attributes_. Also look_ for an empty
	// tag or an end tag.
	while ( p && *p )
	{
		pErr = p;
		p = skip_white_space( p, encoding_ );
💿 文件大小 264 K
👤 上传用户 wilsonshaw
📂 所属分类 J2ME
🏷️ 相关标签

#application #mepolish #client #mobile
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -