⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 txml_parser.cpp

📁 j2me is based on j2mepolish, client & server for mobile application.
💻 CPP
📖 第 1 页 / 共 3 页
字号:
/*
www.sourceforge.net/projects/tinyxml
Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)

This software is provided 'as-is', without any express or implied 
warranty. In no event will the authors be held liable for any 
damages arising from the use of this software.

Permission is granted to anyone to use this software for any 
purpose, including commercial applications, and to alter it and 
redistribute it freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must 
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.

2. Altered source versions must be plainly marked as such, and 
must not be misrepresented as being the original software.

3. This notice may not be removed or altered from any source 
distribution.
*/

#include <ctype.h>
#include <stddef.h>

#include "txml.hpp"

namespace aux { namespace xml
{

// Optional parser tracing. Uncomment the define below to make TIXML_LOG
// available; while it stays commented out, TIXML_LOG is not defined at all.
//#define DEBUG_PARSER
#if defined( DEBUG_PARSER )
#	if defined( DEBUG ) && defined( _MSC_VER )
#		include <windows.h>
		// NOTE(review): "output_debug_string" is not declared anywhere in this
		// file; it looks like a renamed Win32 OutputDebugString -- confirm
		// before enabling DEBUG_PARSER on MSVC debug builds.
#		define TIXML_LOG output_debug_string
#	else
		// NOTE(review): printf requires <stdio.h>, which this file does not
		// include directly -- presumably supplied by txml.hpp; verify.
#		define TIXML_LOG printf
#	endif
#endif

// The five predefined XML entities. Each row is
// { entity text, length of that text, character it stands for }.
//
// Note that "PutString" hardcodes the same list, so this table is less
// flexible than it appears: changing the entries or their order will
// break PutString.
base::entity base::entity_[ NUM_ENTITY ] = 
{
	{ "&amp;",  5, '&' },
	{ "&lt;",   4, '<' },
	{ "&gt;",   4, '>' },
	{ "&quot;", 6, '\"' },
	{ "&apos;", 6, '\'' }
};

// Bunch of unicode info at:
//		http://www.unicode.org/faq/utf_bom.html
// Including the basis of the table below, which determines the number of
// bytes in a sequence from its lead byte. 1 is placed for invalid sequences --
// although the result will be junk, pass it through as much as possible.
// Beware of these special three-byte sequences in UTF-8:
//				ef bb bf (Microsoft "lead bytes", i.e. the byte order mark)
//				ef bf be
//				ef bf bf

// The three bytes ef bb bf, in order. TIXML_UTF_LEAD_0 (0xef) is also the
// first byte of the other two sequences listed above.
const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;

// Length in bytes of a UTF-8 sequence, indexed by the value of its lead byte.
// Bytes that cannot start a sequence (continuation bytes 0x80-0xbf, the
// invalid leads 0xc0/0xc1 and 0xf5-0xff) map to 1 so that a scanner using
// this table always advances by at least one byte.
const int base::utf8ByteTable[256] = 
{
	//	0	1	2	3	4	5	6	7	8	9	a	b	c	d	e	f
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x00
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x10
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x20
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x30
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x40
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x50
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x60
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x70	End of ASCII range
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x80 0x80 to 0xc1 invalid
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x90 
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xa0 
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xb0 
		1,	1,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xc0 0xc2 to 0xdf 2 byte
		2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xd0
		3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	// 0xe0 0xe0 to 0xef 3 byte
		4,	4,	4,	4,	4,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1	// 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
};


// Encodes one UTF-32 code point as UTF-8.
//
//   input    the code point to encode
//   output_  destination buffer; between 1 and 4 bytes are written, no
//            terminator is appended
//   length   receives the number of bytes written, or 0 (with nothing
//            written) when input is 0x200000 or larger
void base::convert_u_t_f32_to_u_t_f8( unsigned long input, char* output_, int* length )
{
	const unsigned long CONTINUATION_MASK = 0xBF;
	const unsigned long CONTINUATION_MARK = 0x80;
	// Lead-byte prefix bits, indexed by the total sequence length.
	const unsigned long LEAD_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

	int count;
	if ( input < 0x80 )
	{
		count = 1;
	}
	else if ( input < 0x800 )
	{
		count = 2;
	}
	else if ( input < 0x10000 )
	{
		count = 3;
	}
	else if ( input < 0x200000 )
	{
		count = 4;
	}
	else
	{
		// Out of range: this routine cannot encode it.
		*length = 0;
		return;
	}
	*length = count;

	// Fill the continuation bytes from the last one backwards, consuming six
	// payload bits per byte.
	for ( int index = count - 1; index > 0; --index )
	{
		output_[index] = (char)((input | CONTINUATION_MARK) & CONTINUATION_MASK);
		input >>= 6;
	}

	// Whatever bits remain go into the lead byte together with its length prefix.
	output_[0] = (char)(input | LEAD_MARK[count]);
}


// Reports whether a byte may be treated as a letter.
//
// Bytes below 127 are classified by isalpha(); every other byte is accepted
// (returns 1), because telling letters from non-letters across encodings is
// impractical here. The encoding argument is deliberately ignored.
//
// NOTE(review): the cut-off is 127, so byte 0x7f (DEL) falls on the
// "always accepted" side -- confirm whether 128 was intended.
/*static*/ int base::is_alpha( unsigned char anyByte, encoding /*encoding_*/ )
{
	return ( anyByte < 127 ) ? isalpha( anyByte ) : 1;
}


// Reports whether a byte may be treated as a letter or digit.
//
// Bytes below 127 are classified by isalnum(); every other byte is accepted
// (returns 1), because telling alphanumerics from other characters across
// encodings is impractical here. The encoding argument is deliberately ignored.
//
// NOTE(review): the cut-off is 127, so byte 0x7f (DEL) falls on the
// "always accepted" side -- confirm whether 128 was intended.
/*static*/ int base::is_alpha_num( unsigned char anyByte, encoding /*encoding_*/ )
{
	return ( anyByte < 127 ) ? isalnum( anyByte ) : 1;
}


// Keeps a row/column cursor in step with a position in the text being parsed.
// stamp() moves the cursor forward to a given pointer; get_cursor() reads it.
class parsing_data
{
	friend class document;

  public:
	// Advances the cursor from the last stamped position up to "now".
	void stamp( const char* now, encoding encoding_ );

	// Row/column computed by the most recent stamp().
	const cursor& get_cursor()
	{
		return cursor_;
	}

  private:
	// Private: only the friend class document constructs instances.
	//   start_    first character of the text (must not be null)
	//   _tabsize  tab width used for column counting
	//   row_/col  initial cursor position
	parsing_data( const char* start_, int _tabsize, int row_, int col )
		: stamp_( start_ ),
		  tabsize( _tabsize )
	{
		assert( start_ );
		cursor_.row_ = row_;
		cursor_.col = col;
	}

	cursor		cursor_;	// current row/column
	const char*		stamp_;		// text position the cursor corresponds to
	int				tabsize;	// tab width; stamp() does nothing when < 1
};


// Advances the row/column cursor from the previously stamped position up to
// "now", then remembers the new position.
//
//   now        position in the text to advance to (must not be null)
//   encoding_  when TIXML_ENCODING_UTF8, a multi-byte sequence counts as one
//              column and the three special "ef .. .." sequences count as zero
//
// Does nothing when tabsize is less than 1. If a NUL byte is met before
// reaching "now", the function returns without updating the cursor.
void parsing_data::stamp( const char* now, encoding encoding_ )
{
	assert( now );

	// Do nothing if the tabsize is 0.
	if ( tabsize < 1 )
	{
		return;
	}

	// Get the current row, column.
	int row_ = cursor_.row_;
	int col = cursor_.col;
	const char* p = stamp_;
	assert( p );

	while ( p < now )
	{
		// Treat p as unsigned, so we have a happy compiler.
		const unsigned char* pU = (const unsigned char*)p;

		// Code contributed by Fletcher Dunn: (modified by lee)
		switch (*pU) {
			case 0:
				// We *should* never get here, but in case we do, don't
				// advance past the terminating null character, ever
				return;

			case '\r':
				// bump down to the next line
				++row_;
				col = 0;				
				// Eat the character
				++p;

				// Check for \r\n sequence, and treat this as a single character
				if (*p == '\n') {
					++p;
				}
				break;

			case '\n':
				// bump down to the next line
				++row_;
				col = 0;

				// Eat the character
				++p;

				// Check for \n\r sequence, and treat this as a single
				// character.  (Yes, this bizarre thing does occur still
				// on some arcane platforms...)
				if (*p == '\r') {
					++p;
				}
				break;

			case '\t':
				// Eat the character
				++p;

				// Skip to next tab stop
				col = (col / tabsize + 1) * tabsize;
				break;

			case TIXML_UTF_LEAD_0:
				if ( encoding_ == TIXML_ENCODING_UTF8 )
				{
					if ( *(p+1) && *(p+2) )
					{
						// In these cases, don't advance the column. These are
						// 0-width spaces.
						if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
							p += 3;	
						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
							p += 3;	
						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
							p += 3;	
						else
							{ p +=3; ++col; }	// A normal character.
					}
					else
					{
						// Truncated 3-byte sequence: the text ends within the
						// next two bytes. Consume only the lead byte so the
						// loop always makes progress; without this branch p
						// never moved and the loop spun forever.
						++p;
						++col;
					}
				}
				else
				{
					++p;
					++col;
				}
				break;

			default:
				if ( encoding_ == TIXML_ENCODING_UTF8 )
				{
					// Eat the 1 to 4 byte utf8 character.
					int step = base::utf8ByteTable[*pU];
					if ( step == 0 )
						step = 1;		// error case from bad encoding, but handle gracefully.

					// Never step over the terminating null: a truncated
					// multi-byte sequence is consumed only up to the bytes
					// that are actually present.
					int taken = 1;
					while ( taken < step && p[taken] )
						++taken;
					p += taken;

					// Just advance one column, of course.
					++col;
				}
				else
				{
					++p;
					++col;
				}
				break;
		}
	}
	cursor_.row_ = row_;
	cursor_.col = col;
	assert( cursor_.row_ >= -1 );
	assert( cursor_.col >= -1 );
	stamp_ = p;
	assert( stamp_ );
}


// Returns a pointer to the first character at or after p that is not white
// space, or 0 when p is null or points at an empty string.
//
// Characters accepted by is_white_space(), plus '\n' and '\r', are skipped.
// For TIXML_ENCODING_UTF8 the three-byte sequences ef bb bf, ef bf be and
// ef bf bf are skipped as well.
const char* base::skip_white_space( const char* p, encoding encoding_ )
{
	if ( !p || !*p )
	{
		return 0;
	}

	if ( encoding_ != TIXML_ENCODING_UTF8 )
	{
		while ( ( *p && is_white_space( *p ) ) || *p == '\n' || *p == '\r' )
		{
			++p;
		}
		return p;
	}

	while ( *p )
	{
		const unsigned char* bytes = (const unsigned char*)p;

		if ( bytes[0] == TIXML_UTF_LEAD_0 )
		{
			// Short-circuit evaluation keeps bytes[2] from being read unless
			// bytes[1] already matched a non-zero value.
			const bool isByteOrderMark =
				   bytes[1] == TIXML_UTF_LEAD_1
				&& bytes[2] == TIXML_UTF_LEAD_2;
			const bool isOtherSpecial =
				   !isByteOrderMark
				&& bytes[1] == 0xbfU
				&& ( bytes[2] == 0xbeU || bytes[2] == 0xbfU );

			if ( isByteOrderMark || isOtherSpecial )
			{
				p += 3;
				continue;
			}
		}

		// Still using old rules for white space.
		if ( !( is_white_space( *p ) || *p == '\n' || *p == '\r' ) )
		{
			break;
		}
		++p;
	}

	return p;
}

#ifdef TIXML_USE_STL
// Moves leading white space from the stream onto the end of *tag.
//
// Returns true once the next character is not white space (or peek() yields
// a value <= 0); returns false if the stream stops being good() first.
/*static*/ bool base::stream_white_space( std::istream * in, TIXML_STRING * tag )
{
	while ( in->good() )
	{
		const int next = in->peek();

		// At this scope, we can't get to a document. So fail silently.
		if ( !is_white_space( next ) || next <= 0 )
		{
			return true;
		}

		*tag += (char) in->get();
	}
	return false;
}

// Copies characters from the stream onto the end of *tag until the next
// character to be read equals "character", which is left in the stream.
//
// Returns true when that character is found; false when peek() yields a
// value <= 0 or the stream stops being good() first.
/*static*/ bool base::stream_to( std::istream * in, int character, TIXML_STRING * tag )
{
	//assert( character > 0 && character < 128 );	// else it won't work in utf-8
	for ( ;; )
	{
		if ( !in->good() )
		{
			return false;
		}

		const int next = in->peek();
		if ( next == character )
		{
			return true;
		}
		if ( next <= 0 )
		{
			// Silent failure: can't get the document at this scope
			return false;
		}

		in->get();
		*tag += (char) next;
	}
}
#endif

// Reads an XML name starting at p into *name_.
//
// One of TinyXML's more performance demanding functions, so the name is
// copied with a single assign() call instead of being appended one character
// at a time.
//
// A name starts with a letter (as judged by is_alpha) or an underscore and
// continues with alphanumerics (is_alpha_num), '_', '-', '.' or ':'.
// Colons are valid only for namespaces, but no attempt is made to tell
// namespaces from names.
//
// *name_ is always emptied first. Returns the position just past the name,
// or 0 when p does not point at a valid first character.
const char* base::read_name( const char* p, TIXML_STRING * name_, encoding encoding_ )
{
	// clear() is not supported on some compilers, so assign an empty string.
	*name_ = "";
	assert( p );

	if ( !p || !*p )
	{
		return 0;
	}
	if ( !is_alpha( (unsigned char) *p, encoding_ ) && *p != '_' )
	{
		return 0;
	}

	const char* const first = p;
	const char* last = first;
	for ( ; *last; ++last )
	{
		const bool partOfName =
			   is_alpha_num( (unsigned char) *last, encoding_ )
			|| *last == '_'
			|| *last == '-'
			|| *last == '.'
			|| *last == ':';
		if ( !partOfName )
		{
			break;
		}
	}

	if ( last - first > 0 )
	{
		name_->assign( first, last - first );
	}
	return last;
}

// Decodes the entity or character reference that starts at p.
//
//   p          start of the reference (the caller presumes it is one)
//   value_     receives the decoded character; up to 4 bytes are written
//              for a numeric reference under TIXML_ENCODING_UTF8
//   length     receives the number of bytes stored in value_
//   encoding_  TIXML_ENCODING_UTF8 encodes numeric references as UTF-8;
//              any other encoding stores the value truncated to one char
//
// Returns the position just past the consumed text, or 0 when a numeric
// reference is malformed: no terminating ';', a non-digit character, or a
// value above U+10FFFF.
//
// If p matches neither a numeric reference nor one of the predefined
// entities, *value_ is set to *p, *length stays 0 and p+1 is returned.
const char* base::get_entity( const char* p, char* value_, int* length, encoding encoding_ )
{
	*length = 0;

	if ( *(p+1) && *(p+1) == '#' && *(p+2) )
	{
		// Largest Unicode code point. Checking against it after every digit
		// also keeps the accumulator from ever overflowing; the previous
		// multiplier-based parse silently wrapped on long digit strings and
		// produced an unrelated character.
		const unsigned long MAX_CODE_POINT = 0x10FFFFUL;

		const bool isHex = ( *(p+2) == 'x' );
		const unsigned long radix = isHex ? 16 : 10;

		// First digit: after "&#x" for hexadecimal, after "&#" for decimal.
		const char* digits = isHex ? p+3 : p+2;
		if ( !*digits ) return 0;

		const char* end = strchr( digits, ';' );
		if ( !end ) return 0;

		unsigned long ucs = 0;
		for ( const char* q = digits; q != end; ++q )
		{
			unsigned long digit;
			if ( *q >= '0' && *q <= '9' )
				digit = (unsigned long)( *q - '0' );
			else if ( isHex && *q >= 'a' && *q <= 'f' )
				digit = (unsigned long)( *q - 'a' + 10 );
			else if ( isHex && *q >= 'A' && *q <= 'F' )
				digit = (unsigned long)( *q - 'A' + 10 );
			else
				return 0;

			ucs = ucs * radix + digit;
			if ( ucs > MAX_CODE_POINT )
				return 0;
		}

		if ( encoding_ == TIXML_ENCODING_UTF8 )
		{
			// convert the UCS to UTF-8
			convert_u_t_f32_to_u_t_f8( ucs, value_, length );
		}
		else
		{
			*value_ = (char)ucs;
			*length = 1;
		}
		return end + 1;
	}

	// Now try to match it against the predefined entities.
	for ( int i = 0; i < NUM_ENTITY; ++i )
	{
		if ( strncmp( entity_[i].str, p, entity_[i].strLength ) == 0 )
		{
			assert( strlen( entity_[i].str ) == entity_[i].strLength );
			*value_ = entity_[i].chr;
			*length = 1;
			return ( p + entity_[i].strLength );
		}
	}

	// So it wasn't an entity, it's unrecognized, or something like that.
	*value_ = *p;	// Don't put back the last one, since we return it!
	//*length = 1;	// Leave unrecognized entities - this doesn't really work.
					// Just writes strange XML.
	return p+1;
}


bool base::string_equal( const char* p,
							 const char* tag,
							 bool ignoreCase,
							 encoding encoding_ )
{
	assert( p );
	assert( tag );
	if ( !p || !*p )
	{
		assert( 0 );
		return false;
	}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -