📄 markup.cpp

📁 分别用贪心算法和启发式算法对测试用例集进行了最小化。
💻 CPP
📖 第 1 页 / 共 5 页
字号:
						// Convert to 2-byte UTF-8
						MCD_BLDAPPEND1(strText,((nUnicode&0x7c0)>>6)|0xc0);
						MCD_BLDAPPEND1(strText,(nUnicode&0x3f)|0x80);
					}
					else
					{
						// Convert to 3-byte UTF-8
						MCD_BLDAPPEND1(strText,((nUnicode&0xf000)>>12)|0xe0);
						MCD_BLDAPPEND1(strText,((nUnicode&0xfc0)>>6)|0x80);
						MCD_BLDAPPEND1(strText,(nUnicode&0x3f)|0x80);
					}
#endif
					if ( nUnicode )
					{
						// Increment index past ampersand semi-colon
						nChar = nNumericChar + nCodeLen + 1;
						bCodeConverted = true;
					}
				}
			}
			else // does not start with #
			{
				// Look for matching &code;
				for ( int nMatch = 0; nMatch < 5; ++nMatch )
				{
					if ( nChar < nTextLength - anCodeLen[nMatch]
						&& MCD_PSZNCMP(szaCode[nMatch],&pSource[nChar+1],anCodeLen[nMatch]) == 0 )
					{
						// Insert symbol and increment index past ampersand semi-colon
						MCD_BLDAPPEND1(strText,szSymbol[nMatch]);
						nChar += anCodeLen[nMatch] + 1;
						bCodeConverted = true;
						break;
					}
				}
			}

			// If the code is not converted, leave it as is
			if ( ! bCodeConverted )
			{
				MCD_BLDAPPEND1(strText,_T('&'));
				++nChar;
			}
		}
		else // not &
		{
			nCharLen = MCD_CLEN(&pSource[nChar]);
			MCD_BLDAPPENDN(strText,&pSource[nChar],nCharLen);
			nChar += nCharLen;
		}
	}
	MCD_BLDRELEASE(strText);
	return strText;
}

int CMarkup::UTF16To8( char* pszUTF8, const wchar_t* pwszUTF16, int nUTF8Count )
{
	// Convert a NULL-terminated UTF-16 string to UTF-8 (argument order follows wcstombs)
	//   pszUTF8     destination buffer, or NULL to only measure the number of bytes required
	//   pwszUTF16   NULL-terminated UTF-16 source
	//   nUTF8Count  byte capacity of pszUTF8 (ignored when pszUTF8 is NULL)
	// Returns the number of UTF-8 bytes produced, not counting any terminator
	// The result is NULL-terminated only if there is a byte left over for the terminator
	// Code units that DecodeCharUTF16 rejects (-1) are emitted as '?'
	//
	const bool bWriting = ( pszUTF8 != NULL );
	int nBytesOut = 0;
	for ( const wchar_t* pwszPos = pwszUTF16; *pwszPos; )
	{
		// Decode UTF-16
		int nCodePoint = DecodeCharUTF16( pwszPos );
		if ( nCodePoint == -1 )
			nCodePoint = '?';

		// Within 4 bytes of the end of the buffer, measure the character first
		// and stop on a whole-character boundary if it would not fit
		if ( bWriting && nBytesOut + 4 > nUTF8Count )
		{
			int nBytesAfter = nBytesOut;
			EncodeCharUTF8( nCodePoint, NULL, nBytesAfter );
			if ( nBytesAfter > nUTF8Count )
				return nBytesOut;
		}

		// Encode UTF-8 (only counts when pszUTF8 is NULL)
		EncodeCharUTF8( nCodePoint, pszUTF8, nBytesOut );
	}

	// Terminate only when the buffer has a spare byte
	if ( bWriting && nBytesOut < nUTF8Count )
		pszUTF8[nBytesOut] = 0;
	return nBytesOut;
}

int CMarkup::DecodeCharUTF8( const char*& pszUTF8 )
{
	// Decode one UTF-8 sequence (1-4 bytes) and return its Unicode code point
	// pszUTF8 is advanced past the bytes that were consumed
	// Returns -1 if the lead byte has its top five bits set, or if an expected
	// trailing byte does not have its high bit set; in the latter case pszUTF8
	// is left pointing at the offending byte (so a NULL terminator is never skipped)
	// Note: trailing bytes are only tested for the high bit, and a lead byte is
	// classified by the first clear bit among 0x20, 0x10 and 0x08
	//
	int nCodePoint = (unsigned char)*pszUTF8;
	++pszUTF8;

	// Plain 7-bit character, nothing more to read
	if ( (nCodePoint & 0x80) == 0 )
		return nCodePoint;

	// Work out how many trailing bytes follow and keep the payload bits of the lead byte
	int nTrailBytes;
	if ( (nCodePoint & 0x20) == 0 )
	{
		nTrailBytes = 1;
		nCodePoint &= 0x1f;
	}
	else if ( (nCodePoint & 0x10) == 0 )
	{
		nTrailBytes = 2;
		nCodePoint &= 0x0f;
	}
	else if ( (nCodePoint & 0x08) == 0 )
	{
		nTrailBytes = 3;
		nCodePoint &= 0x07;
	}
	else
		return -1;

	// Fold in 6 payload bits from each trailing byte
	for ( ; nTrailBytes > 0; --nTrailBytes, ++pszUTF8 )
	{
		if ( ! (*pszUTF8 & 0x80) )
			return -1;
		nCodePoint = (nCodePoint << 6) | (*pszUTF8 & 0x3f);
	}
	return nCodePoint;
}

void CMarkup::EncodeCharUTF16( int nUChar, wchar_t* pwszUTF16, int& nWideLen )
{
	// Append the UTF-16 encoding of code point nUChar at pwszUTF16[nWideLen]
	// and advance nWideLen by the number of wide chars used (1, or 2 for a surrogate pair)
	// If pwszUTF16 is NULL nothing is written and nWideLen is only advanced
	// The caller must ensure pwszUTF16 has room for up to 2 wide chars
	//
	const bool bNeedsPair = ( (nUChar & ~0xffff) != 0 );

	// Measuring only
	if ( ! pwszUTF16 )
	{
		nWideLen += bNeedsPair ? 2 : 1;
		return;
	}

	if ( bNeedsPair )
	{
		// Split the offset above 0x10000 into two 10-bit halves
		const int nOffset = nUChar - 0x10000;
		pwszUTF16[nWideLen++] = (wchar_t)(((nOffset>>10) & 0x3ff) | 0xd800); // W1
		pwszUTF16[nWideLen++] = (wchar_t)((nOffset & 0x3ff) | 0xdc00); // W2
	}
	else
		pwszUTF16[nWideLen++] = (wchar_t)nUChar;
}

int CMarkup::UTF8To16( wchar_t* pwszUTF16, const char* pszUTF8, int nUTF8Count )
{
	// Supports the same arguments as mbstowcs
	// the pszUTF8 source must be a UTF-8 string which will be processed up to NULL-terminator or nUTF8Count
	// if pwszUTF16 is NULL, the number of wide chars required is returned
	// nUTF8Count is maximum UTF-8 bytes to convert and should include NULL if NULL desired in result
	// if pwszUTF16 is not NULL it is filled with the result string and it must be large enough
	// result will be NULL-terminated if NULL encountered in pszUTF8 before nUTF8Count
	// and the number of UTF-8 bytes converted is returned
	// A multi-byte character that is cut off by nUTF8Count is not converted and no
	// source byte at or beyond pszUTF8[nUTF8Count] is ever read
	// Sequences rejected by DecodeCharUTF8 within the limit are converted to '?'
	//
	const char* pszPosUTF8 = pszUTF8;
	int nUChar, nUTF8Len = 0, nWideLen = 0;
	while ( nUTF8Len < nUTF8Count )
	{
		// Decode UTF-8
		if ( nUTF8Len + 4 > nUTF8Count )
		{
			// Fewer than 4 bytes remain, so decode from a temporary null-terminated
			// copy of just those bytes; the decoder stops at the terminator instead
			// of running past nUTF8Count in the real source
			char szUTF8Copy[5];
			const char* pszPosUTF8Copy = szUTF8Copy;
			int nUTF8EndCount = nUTF8Count - nUTF8Len;
			strncpy( szUTF8Copy, pszPosUTF8, nUTF8EndCount );
			szUTF8Copy[nUTF8EndCount] = '\0';
			nUChar = DecodeCharUTF8( pszPosUTF8Copy );
			int nUTF8EndLen = (int)(pszPosUTF8Copy - szUTF8Copy);

			// The character is truncated by nUTF8Count when a decodable lead byte
			// ran into the terminator we added (a lead byte with its top five bits
			// set fails on its own and is simply an invalid byte, not a truncation)
			bool bLeadByteOK = ( ((unsigned char)szUTF8Copy[0] & 0xf8) != 0xf8 );
			if ( nUChar == -1 && nUTF8EndLen == nUTF8EndCount && bLeadByteOK )
				break;

			// Bytes consumed from the copy are identical to the source bytes
			pszPosUTF8 += nUTF8EndLen;
		}
		else
			nUChar = DecodeCharUTF8( pszPosUTF8 );
		nUTF8Len = (int)(pszPosUTF8 - pszUTF8);
		if ( ! nUChar )
		{
			// Source terminator reached within nUTF8Count
			if ( pwszUTF16 )
				pwszUTF16[nWideLen] = 0;
			break;
		}
		else if ( nUChar == -1 )
			nUChar = '?';

		// Encode UTF-16
		EncodeCharUTF16( nUChar, pwszUTF16, nWideLen );
	}
	if ( ! pwszUTF16 )
		return nWideLen;
	return nUTF8Len;
}

int CMarkup::DecodeCharUTF16( const wchar_t*& pwszUTF16 )
{
	// Return Unicode code point and increment pwszUTF16 past 1 or 2 (if surrogates) wide chars
	// Returns -1 for incorrect UTF-16: a low surrogate with no preceding high surrogate,
	// or a high surrogate that is not followed by a low surrogate (including end of string)
	// On -1 only the offending wide char is consumed, so the following wide char
	// (or the NULL terminator) is still seen by the caller's next call
	//
	int nUChar = *pwszUTF16;
	++pwszUTF16;
	if ( (nUChar & ~0x000007ff) == 0xd800 ) // surrogate range 0xd800-0xdfff
	{
		// Bit 0x400 distinguishes low surrogates (0xdc00-0xdfff) from high (0xd800-0xdbff)
		if ( nUChar & 0x0400 )
			return -1; // W2 without W1

		int nW2 = *pwszUTF16;
		if ( (nW2 & ~0x000003ff) != 0xdc00 )
			return -1; // W1 without W2

		++pwszUTF16;
		nUChar = (((nUChar & 0x3ff) << 10) | (nW2 & 0x3ff)) + 0x10000;
	}
	return nUChar;
}

void CMarkup::EncodeCharUTF8( int nUChar, char* pszUTF8, int& nUTF8Len )
{
	// Append the UTF-8 encoding of code point nUChar at pszUTF8[nUTF8Len]
	// and advance nUTF8Len by the number of bytes used (1 to 4)
	// If pszUTF8 is NULL nothing is written and nUTF8Len is only advanced
	// The caller must ensure pszUTF8 has room for up to 4 bytes
	//
	char aEncoded[4];
	int nEncodedLen;
	if ( (nUChar & ~0x0000007f) == 0 ) // < 0x80
	{
		aEncoded[0] = (char)nUChar;
		nEncodedLen = 1;
	}
	else if ( (nUChar & ~0x000007ff) == 0 ) // < 0x800
	{
		aEncoded[0] = (char)(((nUChar&0x7c0)>>6)|0xc0);
		aEncoded[1] = (char)((nUChar&0x3f)|0x80);
		nEncodedLen = 2;
	}
	else if ( (nUChar & ~0x0000ffff) == 0 ) // < 0x10000
	{
		aEncoded[0] = (char)(((nUChar&0xf000)>>12)|0xe0);
		aEncoded[1] = (char)(((nUChar&0xfc0)>>6)|0x80);
		aEncoded[2] = (char)((nUChar&0x3f)|0x80);
		nEncodedLen = 3;
	}
	else // < 0x110000
	{
		aEncoded[0] = (char)(((nUChar&0x1c0000)>>18)|0xf0);
		aEncoded[1] = (char)(((nUChar&0x3f000)>>12)|0x80);
		aEncoded[2] = (char)(((nUChar&0xfc0)>>6)|0x80);
		aEncoded[3] = (char)((nUChar&0x3f)|0x80);
		nEncodedLen = 4;
	}

	// Copy out only when a destination was supplied; the length advances either way
	if ( pszUTF8 )
	{
		for ( int nByte = 0; nByte < nEncodedLen; ++nByte )
			pszUTF8[nUTF8Len + nByte] = aEncoded[nByte];
	}
	nUTF8Len += nEncodedLen;
}

#if ! defined( UNICODE )
MCD_STR CMarkup::UTF8ToA( MCD_CSTR pszUTF8, int* pnFailed/*=NULL*/ )
{
	// Converts from UTF-8 directly to locale ANSI charset
	// this uses wctomb which requires setlocale other than minimal "C" locale
	// e.g. setlocale(LC_ALL, "") enables the OS system locale settings
	// pnFailed, if not NULL, receives the number of characters that could not be
	// converted; each of those is replaced by '?' in the result
	//
	// wctomb may write up to MB_CUR_MAX bytes, which is more than 2 in locales such
	// as UTF-8 or GB18030, so the scratch buffer is sized generously and the locale
	// is checked against it rather than risking a stack overrun
	const int nMaxANSICharLen = 16;
	MCD_STR strANSI;
	int nBufferLen = (int)strlen( pszUTF8 ) + nMaxANSICharLen;
	MCD_BLDRESERVE(strANSI,nBufferLen);
	int nUChar, nCharLen;
	MCD_CHAR szANSI[nMaxANSICharLen];
	if ( pnFailed )
		*pnFailed = 0;
	MCD_PCSZ pUTF8 = pszUTF8;
	while ( *pUTF8 )
	{
		// NOTE(review): the amount passed here is kept equal to the largest append
		// made below, on the assumption that MCD_BLDCHECK reserves that much room
		MCD_BLDCHECK(strANSI,nBufferLen,nMaxANSICharLen);
		nUChar = DecodeCharUTF8( pUTF8 );

		// Code points above 0xffff (and the -1 error value) are not passed to wctomb
		if ( (nUChar & ~0xffff) || (int)MB_CUR_MAX > nMaxANSICharLen )
			nCharLen = -1;
		else
			nCharLen = wctomb( szANSI, (wchar_t)nUChar );
		if ( nCharLen == -1 )
		{
			if ( pnFailed )
				++(*pnFailed);
			MCD_BLDAPPEND1(strANSI,_T('?'));
		}
		else
		{
			MCD_BLDAPPENDN(strANSI,szANSI,nCharLen);
		}
	}
	MCD_BLDRELEASE(strANSI);
	return strANSI;
}

MCD_STR CMarkup::AToUTF8( MCD_CSTR pszANSI )
{
	// Convert a string in the locale's ANSI (multibyte) charset to UTF-8
	// Relies on mbtowc, so the program must have called setlocale with something
	// other than the minimal "C" locale, e.g. setlocale(LC_ALL, "")
	// A byte that mbtowc cannot convert is consumed on its own and emitted as '?'
	//
	MCD_STR strUTF8;
	int nBufferLen = (int)strlen( pszANSI ) * 2 + 4;
	MCD_BLDRESERVE(strUTF8,nBufferLen);
	for ( MCD_PCSZ pszPos = pszANSI; *pszPos; )
	{
		MCD_BLDCHECK(strUTF8,nBufferLen,4);

		// Multibyte -> wide char
		wchar_t wcDecoded;
		int nConsumed = mbtowc( &wcDecoded, pszPos, 5 );
		if ( nConsumed < 1 )
		{
			wcDecoded = (wchar_t)'?';
			nConsumed = 1;
		}
		pszPos += nConsumed;

		// Wide char -> UTF-8, appended to the result
		MCD_CHAR szEncoded[4];
		int nEncodedLen = 0;
		EncodeCharUTF8( (int)wcDecoded, szEncoded, nEncodedLen );
		MCD_BLDAPPENDN(strUTF8,szEncoded,nEncodedLen);
	}
	MCD_BLDRELEASE(strUTF8);
	return strUTF8;
}
#endif

MCD_STR CMarkup::GetDeclaredEncoding( MCD_CSTR szDoc )
{
	// Extract encoding attribute from XML Declaration
	// Looks at the first '<' in szDoc; if it begins "<?" the text up to and including
	// the closing "?>" is parsed as a node and its "encoding" attribute is returned
	// Returns an empty string if there is no such node or no encoding attribute
	MCD_STR strEncoding;
	MCD_PCSZ pStart = MCD_PSZCHR( szDoc, _T('<') );
	if ( pStart && pStart[1] == _T('?') )
	{
		// Search for the terminator from the start of the declaration, not from the
		// start of the document, so that a "?>" occurring in text before the "<?"
		// cannot yield an end pointer in front of pStart (and a negative length)
		MCD_PCSZ pEnd = MCD_PSZSTR( pStart, _T("?>") );
		if ( pEnd )
		{
			// +2 includes the "?>" itself
			MCD_STR strXMLDecl( pStart, (int)(pEnd-pStart)+2 );
			CMarkup xmlDecl( strXMLDecl );
			if ( xmlDecl.FindNode() )
				strEncoding = xmlDecl.GetAttrib( _T("encoding") );
		}
	}
	return strEncoding;
}


int CMarkup::FindNode( int nType )
{
	// Advance the current position to the next node and report its type
	// Change current node position only if a node is found
	// If nType is 0 find any node, otherwise find node of type nType
	// (nType is tested with bitwise AND, so it can combine several type flags)
	// Return type of node or 0 if not found
	// If found node is an element, change m_iPos
	// On every "return 0" path below this function assigns none of the position members

	// Determine where in document to start scanning for node
	int nTypeFound = 0;
	int nNodeOffset = m_nNodeOffset;
	// NOTE(review): presumably node types above 1 are the non-element nodes, whose
	// extent is tracked by m_nNodeOffset/m_nNodeLength -- confirm against the MNT_ values
	if ( m_nNodeType > 1 )
	{
		// By-pass current node
		nNodeOffset += m_nNodeLength;
	}
	else
	{
		// Set position to begin looking for node
		nNodeOffset = 0; // default to start of document
		if ( m_iPos )
		{
			// After element
			nNodeOffset = m_aPos[m_iPos].StartAfter();
		}
		else if ( m_iPosParent )
		{
			// Immediately after start tag of parent
			// (an empty parent element has no content to scan)
			if ( m_aPos[m_iPosParent].IsEmptyElement() )
				return 0;
			else
				nNodeOffset = m_aPos[m_iPosParent].StartContent();
		}
	}

	// Get nodes until we find what we're looking for
	// iPosNew follows the element index in step with the parser so that each
	// element the parser reports can be matched to its index entry
	int iPosNew = m_iPos;
	TokenPos token( m_strDoc, m_nFlags );
	NodePos node;
	token.nNext = nNodeOffset;
	do
	{
		// Remember where this node starts; token.nNext is where parsing resumes
		nNodeOffset = token.nNext;
		nTypeFound = x_ParseNode( token, node );
		if ( nTypeFound == 0 )
		{
			// Check if we have reached the end of the parent element
			// Otherwise it is a lone end tag
			if ( m_iPosParent && nNodeOffset == m_aPos[m_iPosParent].StartContent()
					+ m_aPos[m_iPosParent].ContentLen() )
				return 0;
			nTypeFound = MNT_LONE_END_TAG;
		}
		else if ( nTypeFound < 0 )
		{
			// NOTE(review): -2 is treated as "nothing found"; its exact meaning is
			// defined by x_ParseNode, which is not visible here
			if ( nTypeFound == -2 )
				return 0;
			// -1 is node error
			nTypeFound = MNT_NODE_ERROR;
		}
		else if ( nTypeFound == MNT_ELEMENT )
		{
			// Step the index to the element just parsed: the next sibling of the
			// previous element, or the parent's first child if there was none
			if ( iPosNew )
				iPosNew = m_aPos[iPosNew].iElemNext;
			else
				iPosNew = m_aPos[m_iPosParent].iElemChild;
			// The index has no corresponding element
			if ( ! iPosNew )
				return 0;
			if ( ! nType || (nType & nTypeFound) )
			{
				// Found element node, move position to this element
				x_SetPos( m_iPosParent, iPosNew, 0 );
				return m_nNodeType;
			}
			// Element was not requested: skip over it entirely, using the index
			token.nNext = m_aPos[iPosNew].StartAfter();
		}
	}
	// Keep scanning only while a specific type was requested and not yet matched
	while ( nType && ! (nType & nTypeFound) );

	// A non-element node was found: record its extent and type as the current node
	m_iPos = iPosNew;
	m_iPosChild = 0;
	m_nNodeOffset = nNodeOffset;
	m_nNodeLength = token.nNext - nNodeOffset;
	m_nNodeType = nTypeFound;
	MARKUP_SETDEBUGSTATE;
	return m_nNodeType;
}

bool CMarkup::RemoveNode()
{
	// Remove the node at the current main position
	// Returns false, changing nothing, when there is neither a current element
	// (m_iPos) nor a current node with length (m_nNodeLength); otherwise true
	if ( ! m_iPos && ! m_nNodeLength )
		return false;

	x_RemoveNode( m_iPosParent, m_iPos, m_nNodeType, m_nNodeOffset, m_nNodeLength );
	// Clear the child position once the node has been removed
	m_iPosChild = 0;
	MARKUP_SETDEBUGSTATE;
	return true;
}

MCD_STR CMarkup::GetTagName() const
{
	// Return the tag name at the current main position
	MCD_STR strTagName;

	// This method is primarily for elements, however
	// it does return something for certain other nodes
	if ( m_nNodeLength )
	{
		switch ( m_nNodeType )
		{
💿 文件大小 3755 K
👤 上传用户 WOKAORIPI
📂 所属分类其他
🏷️ 相关标签

#分 #算法 #启发式算法 #测试
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -