📄 markup.cpp
字号:
// Convert to 2-byte UTF-8
MCD_BLDAPPEND1(strText,((nUnicode&0x7c0)>>6)|0xc0);
MCD_BLDAPPEND1(strText,(nUnicode&0x3f)|0x80);
}
else
{
// Convert to 3-byte UTF-8
MCD_BLDAPPEND1(strText,((nUnicode&0xf000)>>12)|0xe0);
MCD_BLDAPPEND1(strText,((nUnicode&0xfc0)>>6)|0x80);
MCD_BLDAPPEND1(strText,(nUnicode&0x3f)|0x80);
}
#endif
if ( nUnicode )
{
// Increment index past ampersand semi-colon
nChar = nNumericChar + nCodeLen + 1;
bCodeConverted = true;
}
}
}
else // does not start with #
{
// Look for matching &code;
for ( int nMatch = 0; nMatch < 5; ++nMatch )
{
if ( nChar < nTextLength - anCodeLen[nMatch]
&& MCD_PSZNCMP(szaCode[nMatch],&pSource[nChar+1],anCodeLen[nMatch]) == 0 )
{
// Insert symbol and increment index past ampersand semi-colon
MCD_BLDAPPEND1(strText,szSymbol[nMatch]);
nChar += anCodeLen[nMatch] + 1;
bCodeConverted = true;
break;
}
}
}
// If the code is not converted, leave it as is
if ( ! bCodeConverted )
{
MCD_BLDAPPEND1(strText,_T('&'));
++nChar;
}
}
else // not &
{
nCharLen = MCD_CLEN(&pSource[nChar]);
MCD_BLDAPPENDN(strText,&pSource[nChar],nCharLen);
nChar += nCharLen;
}
}
MCD_BLDRELEASE(strText);
return strText;
}
int CMarkup::UTF16To8( char* pszUTF8, const wchar_t* pwszUTF16, int nUTF8Count )
{
	// Convert a NULL-terminated UTF-16 string to UTF-8, with wcstombs-like arguments
	// pszUTF8 == NULL : nothing is written, nUTF8Count is ignored and the number
	//                   of bytes the conversion requires is returned
	// pszUTF8 != NULL : at most nUTF8Count bytes are written and the number of
	//                   bytes written (excluding any NULL) is returned; the NULL
	//                   terminator is only added when there is room left for it
	// A code unit sequence that cannot be decoded is replaced by '?'
	//
	int nBytesOut = 0;
	for ( ; *pwszUTF16; )
	{
		// Decode one code point from the UTF-16 source
		int nCodePoint = DecodeCharUTF16( pwszUTF16 );
		if ( nCodePoint == -1 )
			nCodePoint = '?';

		// Within 4 bytes of the end of the buffer, measure the encoded
		// character on a scratch counter first and stop before it if it would not fit
		if ( pszUTF8 && nBytesOut + 4 > nUTF8Count )
		{
			int nBytesAfter = nBytesOut;
			EncodeCharUTF8( nCodePoint, NULL, nBytesAfter );
			if ( nBytesAfter > nUTF8Count )
				return nBytesOut;
		}

		// Encode for real (or just count when pszUTF8 is NULL)
		EncodeCharUTF8( nCodePoint, pszUTF8, nBytesOut );
	}
	if ( pszUTF8 && nBytesOut < nUTF8Count )
		pszUTF8[nBytesOut] = 0;
	return nBytesOut;
}
int CMarkup::DecodeCharUTF8( const char*& pszUTF8 )
{
	// Return Unicode code point and increment pszUTF8 past 1-4 bytes
	// Returns -1 if the bytes at pszUTF8 are not a structurally valid sequence:
	//   - the first byte is a stray continuation byte (10xxxxxx) or 11111xxx;
	//     pszUTF8 is then left just past that single byte
	//   - a byte that should be a continuation byte is not 10xxxxxx;
	//     pszUTF8 is then left ON that byte, so a NULL terminator (or the start
	//     of the next character) is never skipped
	// Overlong forms, encoded surrogates and values above 0x10ffff are not rejected
	int nUChar = (unsigned char)*pszUTF8;
	++pszUTF8;
	if ( ! (nUChar & 0x80) )
		return nUChar; // 0xxxxxxx: single byte

	// Lead byte: the count of leading 1 bits gives the sequence length
	int nExtraChars;
	if ( (nUChar & 0xe0) == 0xc0 ) // 110xxxxx
	{
		nExtraChars = 1;
		nUChar &= 0x1f;
	}
	else if ( (nUChar & 0xf0) == 0xe0 ) // 1110xxxx
	{
		nExtraChars = 2;
		nUChar &= 0x0f;
	}
	else if ( (nUChar & 0xf8) == 0xf0 ) // 11110xxx
	{
		nExtraChars = 3;
		nUChar &= 0x07;
	}
	else
		return -1; // 10xxxxxx (continuation byte out of place) or 11111xxx

	// Each continuation byte must be exactly 10xxxxxx and supplies 6 more bits
	while ( nExtraChars-- )
	{
		if ( (*pszUTF8 & 0xc0) != 0x80 )
			return -1;
		nUChar = (nUChar << 6) | (*pszUTF8 & 0x3f);
		++pszUTF8;
	}
	return nUChar;
}
void CMarkup::EncodeCharUTF16( int nUChar, wchar_t* pwszUTF16, int& nWideLen )
{
	// Append the UTF-16 form of code point nUChar at pwszUTF16[nWideLen] and
	// advance nWideLen by the number of wide chars used (1, or 2 for a surrogate pair)
	// When pwszUTF16 is NULL nothing is written and nWideLen is only advanced
	// The caller must make sure pwszUTF16 has room for up to 2 wide chars
	//
	const bool bSurrogatePair = (nUChar & ~0xffff) != 0;

	// Counting only
	if ( ! pwszUTF16 )
	{
		nWideLen += bSurrogatePair ? 2 : 1;
		return;
	}

	// Fits in a single wide char
	if ( ! bSurrogatePair )
	{
		pwszUTF16[nWideLen++] = (wchar_t)nUChar;
		return;
	}

	// Split the 20-bit offset above 0x10000 into two 10-bit halves
	const int nOffset = nUChar - 0x10000;
	pwszUTF16[nWideLen++] = (wchar_t)(((nOffset>>10) & 0x3ff) | 0xd800); // W1
	pwszUTF16[nWideLen++] = (wchar_t)((nOffset & 0x3ff) | 0xdc00); // W2
}
int CMarkup::UTF8To16( wchar_t* pwszUTF16, const char* pszUTF8, int nUTF8Count )
{
	// Supports the same arguments as mbstowcs
	// the pszUTF8 source must be a UTF-8 string which will be processed up to NULL-terminator or nUTF8Count
	// if pwszUTF16 is NULL, the number of wide chars required is returned
	// nUTF8Count is maximum UTF-8 bytes to convert and should include NULL if NULL desired in result
	// if pwszUTF16 is not NULL it is filled with the result string and it must be large enough
	// result will be NULL-terminated if NULL encountered in pszUTF8 before nUTF8Count
	// and the number of UTF-8 bytes converted is returned
	//
	// No byte at or beyond pszUTF8[nUTF8Count] is read: a multi-byte character
	// that is cut off by nUTF8Count is not converted and not counted in the
	// returned length, so the caller can resume from there with more data
	//
	const char* pszPosUTF8 = pszUTF8;
	int nUChar, nUTF8Len = 0, nWideLen = 0;
	while ( nUTF8Len < nUTF8Count )
	{
		// Decode UTF-8
		const int nUTF8EndCount = nUTF8Count - nUTF8Len;
		if ( nUTF8EndCount < 4 )
		{
			// Fewer than 4 bytes remain, so decode from a temporary
			// null-terminated copy of just those bytes; the decoder
			// then cannot run past nUTF8Count
			char szUTF8Copy[5];
			const char* pszPosUTF8Copy = szUTF8Copy;
			strncpy( szUTF8Copy, pszPosUTF8, nUTF8EndCount );
			szUTF8Copy[nUTF8EndCount] = '\0';
			nUChar = DecodeCharUTF8( pszPosUTF8Copy );
			const int nUTF8EndLen = (int)(pszPosUTF8Copy - szUTF8Copy);
			if ( nUChar == -1 && nUTF8EndLen == nUTF8EndCount )
			{
				// The decoder failed exactly at the terminator appended above
				// If the lead byte announces more bytes than remain, the
				// character boundary lies beyond nUTF8Count: stop before it
				// (otherwise it is simply a malformed byte, converted to '?' below)
				const int nLead = (unsigned char)szUTF8Copy[0];
				int nSeqLen = 1;
				if ( (nLead & 0xe0) == 0xc0 )
					nSeqLen = 2;
				else if ( (nLead & 0xf0) == 0xe0 )
					nSeqLen = 3;
				else if ( (nLead & 0xf8) == 0xf0 )
					nSeqLen = 4;
				if ( nSeqLen > nUTF8EndCount )
					break;
			}
			// Advance in the real source by what was consumed from the copy
			pszPosUTF8 += nUTF8EndLen;
		}
		else
			nUChar = DecodeCharUTF8( pszPosUTF8 );
		nUTF8Len = (int)(pszPosUTF8 - pszUTF8);
		if ( ! nUChar )
		{
			// NULL in the source ends the conversion and terminates the result
			if ( pwszUTF16 )
				pwszUTF16[nWideLen] = 0;
			break;
		}
		else if ( nUChar == -1 )
			nUChar = '?';

		// Encode UTF-16
		EncodeCharUTF16( nUChar, pwszUTF16, nWideLen );
	}
	if ( ! pwszUTF16 )
		return nWideLen;
	return nUTF8Len;
}
int CMarkup::DecodeCharUTF16( const wchar_t*& pwszUTF16 )
{
	// Return Unicode code point and increment pwszUTF16 past 1 or 2 (if surrogates) wide chars
	// Returns -1 for incorrect UTF-16, consuming only the first wide char:
	//   - a low surrogate (0xdc00-0xdfff) that is not preceded by a high surrogate
	//   - a high surrogate (0xd800-0xdbff) that is not followed by a low surrogate;
	//     the following wide char (which may be the NULL terminator) is left
	//     in place so that it is decoded on its own by the next call
	int nUChar = *pwszUTF16;
	++pwszUTF16;
	if ( (nUChar & ~0x000007ff) != 0xd800 )
		return nUChar; // not in the surrogate range, stands for itself
	if ( (nUChar & ~0x000003ff) != 0xd800 )
		return -1; // W2 without W1
	const int nW2 = *pwszUTF16;
	if ( (nW2 & ~0x000003ff) != 0xdc00 )
		return -1; // W1 without W2
	++pwszUTF16;
	// Combine the two 10-bit halves and add back the 0x10000 offset
	return (((nUChar & 0x3ff) << 10) | (nW2 & 0x3ff)) + 0x10000;
}
void CMarkup::EncodeCharUTF8( int nUChar, char* pszUTF8, int& nUTF8Len )
{
	// Append the UTF-8 form of code point nUChar at pszUTF8[nUTF8Len] and
	// advance nUTF8Len by the number of bytes used (1 to 4)
	// When pszUTF8 is NULL nothing is written and nUTF8Len is only advanced
	// The caller must make sure pszUTF8 has room for up to 4 bytes
	//
	// Pick the sequence length from the highest bits that are set
	int nSeqLen;
	if ( (nUChar & ~0x0000007f) == 0 ) // < 0x80
		nSeqLen = 1;
	else if ( (nUChar & ~0x000007ff) == 0 ) // < 0x800
		nSeqLen = 2;
	else if ( (nUChar & ~0x0000ffff) == 0 ) // < 0x10000
		nSeqLen = 3;
	else // < 0x110000
		nSeqLen = 4;

	if ( pszUTF8 )
	{
		char* pszOut = &pszUTF8[nUTF8Len];
		switch ( nSeqLen )
		{
		case 1:
			pszOut[0] = (char)nUChar;
			break;
		case 2:
			pszOut[0] = (char)(((nUChar&0x7c0)>>6)|0xc0);
			pszOut[1] = (char)((nUChar&0x3f)|0x80);
			break;
		case 3:
			pszOut[0] = (char)(((nUChar&0xf000)>>12)|0xe0);
			pszOut[1] = (char)(((nUChar&0xfc0)>>6)|0x80);
			pszOut[2] = (char)((nUChar&0x3f)|0x80);
			break;
		default:
			pszOut[0] = (char)(((nUChar&0x1c0000)>>18)|0xf0);
			pszOut[1] = (char)(((nUChar&0x3f000)>>12)|0x80);
			pszOut[2] = (char)(((nUChar&0xfc0)>>6)|0x80);
			pszOut[3] = (char)((nUChar&0x3f)|0x80);
			break;
		}
	}
	nUTF8Len += nSeqLen;
}
#if ! defined( UNICODE )
MCD_STR CMarkup::UTF8ToA( MCD_CSTR pszUTF8, int* pnFailed/*=NULL*/ )
{
	// Converts from UTF-8 directly to locale ANSI charset
	// this uses wctomb which requires setlocale other than minimal "C" locale
	// e.g. setlocale(LC_ALL, "") enables the OS system locale settings
	// Every character that cannot be converted becomes '?' and, if pnFailed
	// is not NULL, is counted in *pnFailed (which is reset to 0 first)
	//
	// wctomb can write up to MB_CUR_MAX bytes, so the receiving buffer must be
	// at least that large; if the current locale needs more than the buffer
	// holds, every character is treated as unconvertible instead of overflowing
	const int nMaxANSICharLen = 16;
	MCD_STR strANSI;
	int nBufferLen = (int)strlen( pszUTF8 ) + nMaxANSICharLen;
	MCD_BLDRESERVE(strANSI,nBufferLen);
	int nUChar, nCharLen;
	MCD_CHAR szANSI[nMaxANSICharLen];
	const bool bLocaleFits = ( (int)MB_CUR_MAX <= nMaxANSICharLen );
	if ( pnFailed )
		*pnFailed = 0;
	MCD_PCSZ pUTF8 = pszUTF8;
	while ( *pUTF8 )
	{
		// NOTE(review): the third argument is assumed to be the room needed
		// for the next append -- confirm against the MCD_BLDCHECK definition
		MCD_BLDCHECK(strANSI,nBufferLen,nMaxANSICharLen);
		nUChar = DecodeCharUTF8( pUTF8 );
		// Code points above 0xffff (and the -1 decode error) are not passed to wctomb
		if ( (nUChar & ~0xffff) || ! bLocaleFits )
			nCharLen = -1;
		else
			nCharLen = wctomb( szANSI, (wchar_t)nUChar );
		if ( nCharLen == -1 )
		{
			if ( pnFailed )
				++(*pnFailed);
			MCD_BLDAPPEND1(strANSI,_T('?'));
		}
		else
		{
			MCD_BLDAPPENDN(strANSI,szANSI,nCharLen);
		}
	}
	MCD_BLDRELEASE(strANSI);
	return strANSI;
}
MCD_STR CMarkup::AToUTF8( MCD_CSTR pszANSI )
{
	// Converts a string in the locale ANSI charset directly to UTF-8
	// mbtowc does the decoding, so a locale other than the minimal "C" locale
	// must have been selected, e.g. with setlocale(LC_ALL, "")
	// A byte that mbtowc cannot decode is replaced by '?' and skipped
	MCD_STR strUTF8;
	int nBufferLen = (int)strlen( pszANSI ) * 2 + 4;
	MCD_BLDRESERVE(strUTF8,nBufferLen);
	MCD_CHAR szEncoded[4];
	for ( MCD_PCSZ pSource = pszANSI; *pSource; )
	{
		MCD_BLDCHECK(strUTF8,nBufferLen,4);

		// Decode one multibyte character from the source
		wchar_t wcDecoded;
		int nSourceLen = mbtowc( &wcDecoded, pSource, 5 );
		if ( nSourceLen < 1 )
		{
			nSourceLen = 1;
			wcDecoded = (wchar_t)'?';
		}
		pSource += nSourceLen;

		// Re-encode it as UTF-8 and append to the result
		int nEncodedLen = 0;
		EncodeCharUTF8( (int)wcDecoded, szEncoded, nEncodedLen );
		MCD_BLDAPPENDN(strUTF8,szEncoded,nEncodedLen);
	}
	MCD_BLDRELEASE(strUTF8);
	return strUTF8;
}
#endif
MCD_STR CMarkup::GetDeclaredEncoding( MCD_CSTR szDoc )
{
	// Extract encoding attribute from XML Declaration
	// Returns an empty string unless the first '<' in szDoc starts a "<?...?>"
	// node that parses and has an encoding attribute
	MCD_STR strEncoding;
	MCD_PCSZ pStart = MCD_PSZCHR( szDoc, _T('<') );
	if ( pStart && pStart[1] == _T('?') )
	{
		// Look for the closing "?>" from the declaration start onwards; a "?>"
		// in text before the first '<' would otherwise give a negative length
		MCD_PCSZ pEnd = MCD_PSZSTR( pStart, _T("?>") );
		if ( pEnd )
		{
			// Parse just the declaration (including its "?>") as a document
			MCD_STR strXMLDecl( pStart, (int)(pEnd-pStart)+2 );
			CMarkup xmlDecl( strXMLDecl );
			if ( xmlDecl.FindNode() )
				strEncoding = xmlDecl.GetAttrib( _T("encoding") );
		}
	}
	return strEncoding;
}
int CMarkup::FindNode( int nType )
{
// Scan forward from the current position for the next node
// Change current node position only if a node is found
// If nType is 0 find any node, otherwise find node of type nType
// (nType is tested as a bit mask against the type of each node parsed)
// Return type of node or 0 if not found
// If found node is an element, change m_iPos
// Determine where in document to start scanning for node
int nTypeFound = 0;
int nNodeOffset = m_nNodeOffset;
if ( m_nNodeType > 1 )
{
// By-pass current node
// NOTE(review): presumably types above 1 are the non-element nodes -- confirm against the MNT_ values
nNodeOffset += m_nNodeLength;
}
else
{
// Set position to begin looking for node
nNodeOffset = 0; // default to start of document
if ( m_iPos )
{
// After element
nNodeOffset = m_aPos[m_iPos].StartAfter();
}
else if ( m_iPosParent )
{
// Immediately after start tag of parent
// (an empty parent element has no content to scan)
if ( m_aPos[m_iPosParent].IsEmptyElement() )
return 0;
else
nNodeOffset = m_aPos[m_iPosParent].StartContent();
}
}
// Get nodes until we find what we're looking for
// iPosNew tracks the element index that goes with the node being parsed
int iPosNew = m_iPos;
TokenPos token( m_strDoc, m_nFlags );
NodePos node;
token.nNext = nNodeOffset;
do
{
// Remember where this node starts before the parse advances token.nNext
nNodeOffset = token.nNext;
nTypeFound = x_ParseNode( token, node );
if ( nTypeFound == 0 )
{
// Check if we have reached the end of the parent element
// Otherwise it is a lone end tag
if ( m_iPosParent && nNodeOffset == m_aPos[m_iPosParent].StartContent()
+ m_aPos[m_iPosParent].ContentLen() )
return 0;
nTypeFound = MNT_LONE_END_TAG;
}
else if ( nTypeFound < 0 )
{
// -2 ends the search with nothing found
if ( nTypeFound == -2 )
return 0;
// -1 is node error
nTypeFound = MNT_NODE_ERROR;
}
else if ( nTypeFound == MNT_ELEMENT )
{
// Step to the matching entry in the element index: the next sibling
// of the last element passed, or the parent's first child element
if ( iPosNew )
iPosNew = m_aPos[iPosNew].iElemNext;
else
iPosNew = m_aPos[m_iPosParent].iElemChild;
// No indexed element for this start tag, give up
if ( ! iPosNew )
return 0;
if ( ! nType || (nType & nTypeFound) )
{
// Found element node, move position to this element
x_SetPos( m_iPosParent, iPosNew, 0 );
return m_nNodeType;
}
// Element not wanted: continue scanning after the whole element
token.nNext = m_aPos[iPosNew].StartAfter();
}
}
// Keep going only when a specific type was requested and this node is not of that type
while ( nType && ! (nType & nTypeFound) );
// A non-element node was found: record it as the current node
m_iPos = iPosNew;
m_iPosChild = 0;
m_nNodeOffset = nNodeOffset;
m_nNodeLength = token.nNext - nNodeOffset;
m_nNodeType = nTypeFound;
MARKUP_SETDEBUGSTATE;
return m_nNodeType;
}
bool CMarkup::RemoveNode()
{
	// Remove the node at the current main position
	// Returns false, changing nothing, when there is neither a current
	// position index (m_iPos) nor a current node length (m_nNodeLength)
	if ( ! m_iPos && ! m_nNodeLength )
		return false;

	x_RemoveNode( m_iPosParent, m_iPos, m_nNodeType, m_nNodeOffset, m_nNodeLength );
	// The child position is reset after the removal
	m_iPosChild = 0;
	MARKUP_SETDEBUGSTATE;
	return true;
}
MCD_STR CMarkup::GetTagName() const
{
// Return the tag name at the current main position
MCD_STR strTagName;
// This method is primarily for elements, however
// it does return something for certain other nodes
if ( m_nNodeLength )
{
switch ( m_nNodeType )
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -