tinyxmlparser.cpp

来自「otl简单包装实现类,对数据库进行操作的,简单易用.」· C++ 代码 · 共 1,494 行 · 第 1/3 页
CPP
1,494 行
/*
www.sourceforge.net/projects/tinyxml
Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)

This software is provided 'as-is', without any express or implied 
warranty. In no event will the authors be held liable for any 
damages arising from the use of this software.

Permission is granted to anyone to use this software for any 
purpose, including commercial applications, and to alter it and 
redistribute it freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must 
not claim that you wrote the original software. If you use this
software in a product, an acknowledgment in the product documentation
would be appreciated but is not required.

2. Altered source versions must be plainly marked as such, and 
must not be misrepresented as being the original software.

3. This notice may not be removed or altered from any source 
distribution.
*/

#include "tinyxml.h"
#include <ctype.h>

//#define DEBUG_PARSER

// Table of the predefined XML entities: the entity text, its length in
// characters, and the single character it stands for.
//
// Note that "PutString" hardcodes the same list. This
// is less flexible than it appears. Changing the entries
// or order will break putstring.
TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = 
{
	{ "&amp;",  5, '&' },
	{ "&lt;",   4, '<' },
	{ "&gt;",   4, '>' },
	{ "&quot;", 6, '\"' },
	{ "&apos;", 6, '\'' }
};

// Bunch of unicode info at:
//		http://www.unicode.org/faq/utf_bom.html
// Including the basic of this table, which determines the #bytes in the
// sequence from the lead byte.
1 placed for invalid sequences --// although the result will be junk, pass it through as much as possible.// Beware of the non-characters in UTF-8:	//				ef bb bf (Microsoft "lead bytes")//				ef bf be//				ef bf bf const int TiXmlBase::utf8ByteTable[256] = {	//	0	1	2	3	4	5	6	7	8	9	a	b	c	d	e	f		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x00		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x10		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x20		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x30		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x40		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x50		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x60		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x70	End of ASCII range		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x80 0x80 to 0xc1 invalid		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x90 		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xa0 		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xb0 		1,	1,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xc0 0xc2 to 0xdf 2 byte		2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xd0		3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	// 0xe0 0xe0 to 0xef 3 byte		4,	4,	4,	4,	4,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1	// 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid};void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length ){	const unsigned long BYTE_MASK = 0xBF;	const unsigned long BYTE_MARK = 0x80;	const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };	if (input < 0x80) 		*length = 1;	else if ( input < 0x800 )		*length = 2;	else if ( input < 0x10000 )		*length = 3;	else if ( input < 0x200000 )		*length = 4;	else		{ *length = 0; return; }	// This code won't covert this correctly anyway.	output += *length;	// Scary scary fall throughs.	
switch (*length) 	{		case 4:			--output; 			*output = (char)((input | BYTE_MARK) & BYTE_MASK); 			input >>= 6;		case 3:			--output; 			*output = (char)((input | BYTE_MARK) & BYTE_MASK); 			input >>= 6;		case 2:			--output; 			*output = (char)((input | BYTE_MARK) & BYTE_MASK); 			input >>= 6;		case 1:			--output; 			*output = (char)(input | FIRST_BYTE_MARK[*length]);	}}/*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding encoding ){	// This will only work for low-ascii, everything else is assumed to be a valid	// letter. I'm not sure this is the best approach, but it is quite tricky trying	// to figure out alhabetical vs. not across encoding. So take a very 	// conservative approach.//	if ( encoding == TIXML_ENCODING_UTF8 )//	{		if ( anyByte < 127 )			return isalpha( anyByte );		else			return 1;	// What else to do? The unicode set is huge...get the english ones right.//	}//	else//	{//		return isalpha( anyByte );//	}}/*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding encoding ){	// This will only work for low-ascii, everything else is assumed to be a valid	// letter. I'm not sure this is the best approach, but it is quite tricky trying	// to figure out alhabetical vs. not across encoding. So take a very 	// conservative approach.//	if ( encoding == TIXML_ENCODING_UTF8 )//	{		if ( anyByte < 127 )			return isalnum( anyByte );		else			return 1;	// What else to do? The unicode set is huge...get the english ones right.//	}//	else//	{//		return isalnum( anyByte );//	}}class TiXmlParsingData{	friend class TiXmlDocument;  public:	void Stamp( const char* now, TiXmlEncoding encoding );	const TiXmlCursor& Cursor()	{ return cursor; }  private:	// Only used by the document!	
TiXmlParsingData( const char* start, int _tabsize, int row, int col )	{		assert( start );		stamp = start;		tabsize = _tabsize;		cursor.row = row;		cursor.col = col;	}	TiXmlCursor		cursor;	const char*		stamp;	int				tabsize;};void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding ){	assert( now );	// Do nothing if the tabsize is 0.	if ( tabsize < 1 )	{		return;	}	// Get the current row, column.	int row = cursor.row;	int col = cursor.col;	const char* p = stamp;	assert( p );	while ( p < now )	{		// Code contributed by Fletcher Dunn: (modified by lee)		switch (*p) {			case 0:				// We *should* never get here, but in case we do, don't				// advance past the terminating null character, ever				return;			case '\r':				// bump down to the next line				++row;				col = 0;								// Eat the character				++p;				// Check for \r\n sequence, and treat this as a single character				if (*p == '\n') {					++p;				}				break;			case '\n':				// bump down to the next line				++row;				col = 0;				// Eat the character				++p;				// Check for \n\r sequence, and treat this as a single				// character.  (Yes, this bizarre thing does occur still				// on some arcane platforms...)				if (*p == '\r') {					++p;				}				break;			case '\t':				// Eat the character				++p;				// Skip to next tab stop				col = (col / tabsize + 1) * tabsize;				break;			case (char)(0xef):				if ( encoding == TIXML_ENCODING_UTF8 )				{					if ( *(p+1) && *(p+2) )					{						// In these cases, don't advance the column. These are						// 0-width spaces.						if ( *(p+1)==(char)(0xbb) && *(p+2)==(char)(0xbf) )							p += 3;							else if ( *(p+1)==(char)(0xbf) && *(p+2)==(char)(0xbe) )							p += 3;							else if ( *(p+1)==(char)(0xbf) && *(p+2)==(char)(0xbf) )							p += 3;							else							{ p +=3; ++col; }	// A normal character.					}				}				else				{					++p;					++col;				}				break;			default:				if ( encoding == TIXML_ENCODING_UTF8 )				{					// Eat the 1 to 4 byte utf8 character.					
int step = TiXmlBase::utf8ByteTable[*((unsigned char*)p)];					if ( step == 0 )						step = 1;		// Error case from bad encoding, but handle gracefully.					p += step;					// Just advance one column, of course.					++col;				}				else				{					++p;					++col;				}				break;		}	}	cursor.row = row;	cursor.col = col;	assert( cursor.row >= -1 );	assert( cursor.col >= -1 );	stamp = p;	assert( stamp );}const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding ){	if ( !p || !*p )	{		return 0;	}	if ( encoding == TIXML_ENCODING_UTF8 )	{		while ( *p )		{			// Skip the stupid Microsoft UTF-8 Byte order marks			if (	*(p+0)==(char) 0xef 				 && *(p+1)==(char) 0xbb 				 && *(p+2)==(char) 0xbf )			{				p += 3;				continue;			}			else if(*(p+0)==(char) 0xef				 && *(p+1)==(char) 0xbf				 && *(p+2)==(char) 0xbe )			{				p += 3;				continue;			}			else if(*(p+0)==(char) 0xef				 && *(p+1)==(char) 0xbf				 && *(p+2)==(char) 0xbf )			{				p += 3;				continue;			}			if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )		// Still using old rules for white space.				++p;			else				break;		}	}	else	{		while ( *p && IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )			++p;	}	return p;}#ifdef TIXML_USE_STL/*static*/ bool TiXmlBase::StreamWhiteSpace( TIXML_ISTREAM * in, TIXML_STRING * tag ){	for( ;; )	{		if ( !in->good() ) return false;		int c = in->peek();		// At this scope, we can't get to a document. So fail silently.		
if ( !IsWhiteSpace( c ) || c <= 0 )			return true;		*tag += (char) in->get();	}}/*static*/ bool TiXmlBase::StreamTo( TIXML_ISTREAM * in, int character, TIXML_STRING * tag ){	//assert( character > 0 && character < 128 );	// else it won't work in utf-8	while ( in->good() )	{		int c = in->peek();		if ( c == character )			return true;		if ( c <= 0 )		// Silent failure: can't get document at this scope			return false;		in->get();		*tag += (char) c;	}	return false;}#endifconst char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding ){	*name = "";	assert( p );	// Names start with letters or underscores.	// Of course, in unicode, tinyxml has no idea what a letter *is*. The	// algorithm is generous.	//	// After that, they can be letters, underscores, numbers,	// hyphens, or colons. (Colons are valid ony for namespaces,	// but tinyxml can't tell namespaces from names.)	if (    p && *p 		 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )	{		while(		p && *p				&&	(		IsAlphaNum( (unsigned char ) *p, encoding ) 						 || *p == '_'						 || *p == '-'						 || *p == '.'						 || *p == ':' ) )		{			(*name) += *p;			++p;		}		return p;	}	return 0;}const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding ){	// Presume an entity, and pull it out.    TIXML_STRING ent;	int i;	*length = 0;	if ( *(p+1) && *(p+1) == '#' && *(p+2) )	{		unsigned long ucs = 0;		unsigned delta = 0;		unsigned mult = 1;		if ( *(p+2) == 'x' )		{			// Hexadecimal.			if ( !*(p+3) ) return 0;			const char* q = p+3;			q = strchr( q, ';' );			if ( !q || !*q ) return 0;			delta = q-p;			--q;			while ( *q != 'x' )			{				if ( *q >= '0' && *q <= '9' )					ucs += mult * (*q - '0');				else if ( *q >= 'a' && *q <= 'f' )					ucs += mult * (*q - 'a' + 10);				else if ( *q >= 'A' && *q <= 'F' )					ucs += mult * (*q - 'A' + 10 );				else 					return 0;				mult *= 16;				--q;			}		}		else		{			// Decimal.			
if ( !*(p+2) ) return 0;			const char* q = p+2;			q = strchr( q, ';' );			if ( !q || !*q ) return 0;			delta = q-p;			--q;			while ( *q != '#' )			{				if ( *q >= '0' && *q <= '9' )					ucs += mult * (*q - '0');				else 					return 0;				mult *= 10;				--q;			}		}		if ( encoding == TIXML_ENCODING_UTF8 )		{			// convert the UCS to UTF-8			ConvertUTF32ToUTF8( ucs, value, length );		}		else		{			*value = (char)ucs;			*length = 1;		}		return p + delta + 1;	}	// Now try to match it.	for( i=0; i<NUM_ENTITY; ++i )	{		if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )		{			assert( strlen( entity[i].str ) == entity[i].strLength );			*value = entity[i].chr;			*length = 1;			return ( p + entity[i].strLength );		}	}
tinyxmlparser.cpp - 源码说明

本页面展示了「otl简单包装实现类,对数据库进行操作的,简单易用.」中的 tinyxmlparser.cpp 源码文件，采用 C++ 编程语言编写，共 1,494 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫开发者社区收录了大量与OTL相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?