📄 markup.cpp
字号:
// Markup.cpp: implementation of the CMarkup class.
//
// Markup Release 9.0
// Copyright (C) 1999-2007 First Objective Software, Inc. All rights reserved
// Go to www.firstobject.com for the latest CMarkup and EDOM documentation
// Use in commercial applications requires written permission
// This software is provided "as is", with no warranty.
//
#include <stdio.h>
#include "Markup.h"
#ifdef MCD_STRERROR
#include <string.h>
#include <errno.h>
#else
#include <windows.h>
#endif
#if defined(_DEBUG) && ! defined(MARKUP_STL) && ! defined(MARKUP_STDC)
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
#ifdef _MBCS
#pragma message( "Note: MBCS build (not UTF-8)" )
// For UTF-8, remove _MBCS from project settings C/C++ preprocessor definitions
#endif
// Customization
// x_EOL: end-of-line string literal; the note on the line says it may be
// "\r\n", "\n" or empty
#define x_EOL _T("\r\n") // can be \r\n or \n or empty
// x_EOLLEN: element count of the x_EOL literal (sizeof ratio) minus one for
// the terminating null, i.e. the number of characters in x_EOL
#define x_EOLLEN (sizeof(x_EOL)/sizeof(MCD_CHAR)-1) // string length of x_EOL
// x_ATTRIBQUOTE: quote character, as a string literal, for attribute values
#define x_ATTRIBQUOTE _T("\"") // can be double or single quote
void CMarkup::operator=( const CMarkup& markup )
{
	// Deep-copy assignment: copies the position members, document text,
	// error string and flags, then duplicates the used part of the index
	// array and the saved-position map so the two objects share no storage
	//
	// Self-assignment must be a no-op: RemoveAll is called on this object's
	// index array and map before the source's pSegs/pTable are read, so with
	// this == &markup the copy would read from an already emptied source
	if ( this == &markup )
		return;

	m_iPosParent = markup.m_iPosParent;
	m_iPos = markup.m_iPos;
	m_iPosChild = markup.m_iPosChild;
	m_iPosFree = markup.m_iPosFree;
	m_iPosDeleted = markup.m_iPosDeleted;
	m_nNodeType = markup.m_nNodeType;
	m_nNodeOffset = markup.m_nNodeOffset;
	m_nNodeLength = markup.m_nNodeLength;
	m_strDoc = markup.m_strDoc;
	m_strError = markup.m_strError;
	m_nFlags = markup.m_nFlags;

	// Copy used part of the index array
	// The size is the number of positions handed out (m_iPosFree), but never
	// less than 8
	m_aPos.RemoveAll();
	m_aPos.nSize = m_iPosFree;
	if ( m_aPos.nSize < 8 )
		m_aPos.nSize = 8;
	m_aPos.nSegs = m_aPos.SegsUsed();
	if ( m_aPos.nSegs )
	{
		// Segment pointer table and segments are allocated as raw char arrays
		m_aPos.pSegs = (ElemPos**)(new char[m_aPos.nSegs*sizeof(char*)]);
		int nSegSize = 1 << m_aPos.PA_SEGBITS;
		for ( int nSeg=0; nSeg < m_aPos.nSegs; ++nSeg )
		{
			// Every segment is full size except the last, which holds only
			// the remaining elements
			if ( nSeg + 1 == m_aPos.nSegs )
				nSegSize = m_aPos.GetSize() - (nSeg << m_aPos.PA_SEGBITS);
			m_aPos.pSegs[nSeg] = (ElemPos*)(new char[nSegSize*sizeof(ElemPos)]);
			memcpy( m_aPos.pSegs[nSeg], markup.m_aPos.pSegs[nSeg], nSegSize*sizeof(ElemPos) );
		}
	}

	// Copy SavedPos map
	m_mapSavedPos.RemoveAll();
	if ( markup.m_mapSavedPos.pTable )
	{
		m_mapSavedPos.AllocMapTable();
		for ( int nSlot=0; nSlot < SavedPosMap::SPM_SIZE; ++nSlot )
		{
			SavedPos* pCopySavedPos = markup.m_mapSavedPos.pTable[nSlot];
			if ( pCopySavedPos )
			{
				// Count the leading run of used entries in this slot, stopping
				// after the entry flagged as last
				int nCount = 0;
				while ( pCopySavedPos[nCount].nSavedPosFlags & SavedPosMap::SPM_USED )
				{
					++nCount;
					if ( pCopySavedPos[nCount-1].nSavedPosFlags & SavedPosMap::SPM_LAST )
						break;
				}
				if ( nCount )
				{
					SavedPos* pNewSavedPos = new SavedPos[nCount];
					for ( int nCopy=0; nCopy<nCount; ++nCopy )
						pNewSavedPos[nCopy] = pCopySavedPos[nCopy];
					// The copy holds exactly nCount entries, so its final
					// entry must carry the last-entry flag
					pNewSavedPos[nCount-1].nSavedPosFlags |= SavedPosMap::SPM_LAST;
					m_mapSavedPos.pTable[nSlot] = pNewSavedPos;
				}
			}
		}
	}
	MARKUP_SETDEBUGSTATE;
}
bool CMarkup::SetDoc( MCD_PCSZ szDoc )
{
	// Set document text from a null-terminated string and parse it
	//
	// szDoc   new document text; a null pointer clears the document
	// Returns the result of x_ParseDoc
	//
	// The previous error string is always cleared before parsing
	if ( szDoc )
	{
		m_strDoc = szDoc;
	}
	else
	{
		MCD_STRCLEAR(m_strDoc);
	}
	MCD_STRCLEAR(m_strError);
	return x_ParseDoc();
}
bool CMarkup::SetDoc( const MCD_STR& strDoc )
{
	// Adopt a copy of the supplied document string, drop any earlier
	// error text, and report the outcome of parsing the new document
	m_strDoc = strDoc;
	MCD_STRCLEAR(m_strError);
	const bool bParsed = x_ParseDoc();
	return bParsed;
}
bool CMarkup::IsWellFormed()
{
	// True only when the index is populated, position 0 is not flagged
	// ill-formed, and position 0 has exactly one child (a first child
	// with no next sibling)
	if ( ! m_aPos.GetSize() )
		return false;
	if ( m_aPos[0].nFlags & MNF_ILLFORMED )
		return false;
	const int iFirstChild = m_aPos[0].iElemChild;
	if ( ! iFirstChild )
		return false;
	if ( m_aPos[iFirstChild].iElemNext )
		return false;
	return true;
}
bool CMarkup::Load( MCD_CSTR szFileName )
{
	// Read the named file into m_strDoc (status text goes to m_strError,
	// flags to m_nFlags); the document is parsed only if the read succeeded
	const bool bFileRead = ReadTextFile( szFileName, m_strDoc, &m_strError, &m_nFlags );
	return bFileRead && x_ParseDoc();
}
bool CMarkup::ReadTextFile( MCD_CSTR szFileName, MCD_STR& strDoc, MCD_STR* pstrError, int* pnFlags )
{
	// Static utility method to load text file into strDoc
	//
	// szFileName  path of the file, opened in binary mode
	// strDoc      receives the file text; it is cleared before reading
	// pstrError   optional; receives a byte-count summary on success, or the
	//             x_GetLastError text when opening, sizing or reading fails
	// pnFlags     optional; its value is read and written back unchanged here
	// Returns true only when the file was opened and read completely
	//
	// Open file to read binary
	FILE* fp = MCD_FOPEN( szFileName, _T("rb") );
	if ( ! fp )
	{
		if ( pstrError )
			*pstrError = x_GetLastError();
		return false;
	}

	// Set flags to 0 unless flags argument provided
	int nFlags = pnFlags?*pnFlags:0;
	// szDescBOM is never filled in within this function, so it contributes
	// an empty prefix to the result message
	MCD_CHAR szDescBOM[20] = {0};
	MCD_CHAR szResult[100];
	MCD_STRCLEAR(strDoc);

	// Get file length
	// A failed seek or tell must not be used as a buffer size (ftell reports
	// failure as -1), so bail out with the error instead
	long nFileLen = -1;
	if ( fseek( fp, 0, SEEK_END ) == 0 )
		nFileLen = ftell( fp );
	if ( nFileLen < 0 || fseek( fp, 0, SEEK_SET ) != 0 )
	{
		// Capture the error text before fclose can disturb it
		if ( pstrError )
			*pstrError = x_GetLastError();
		fclose( fp );
		return false;
	}
	int nFileByteLen = (int)nFileLen;
	bool bSuccess = true;

#if defined(UNICODE) // convert file to wide char
	int nWideLen = 0;
	if ( nFileByteLen )
	{
		char* pBuffer = new char[nFileByteLen];
		// Convert only a completely read buffer; after a short read the
		// buffer content is not fully defined
		bSuccess = ( fread( pBuffer, nFileByteLen, 1, fp ) == 1 );
		if ( bSuccess )
		{
			/*
			// Alternative: use these 3 lines instead of 3 lines below using UTF8To16
			// For ANSI files, replace CP_UTF8 with CP_ACP in both places
			nWideLen = MultiByteToWideChar(CP_UTF8,0,pBuffer,nFileByteLen,NULL,0);
			MCD_CHAR* pUTF16Buffer = MCD_GETBUFFER(strDoc,nWideLen);
			MultiByteToWideChar(CP_UTF8,0,pBuffer,nFileByteLen,pUTF16Buffer,nWideLen);
			*/
			// For ANSI files, replace both UTF8To16 calls with mbstowcs (arguments are the same)
			nWideLen = UTF8To16(NULL,pBuffer,nFileByteLen);
			MCD_CHAR* pUTF16Buffer = MCD_GETBUFFER(strDoc,nWideLen);
			UTF8To16(pUTF16Buffer,pBuffer,nFileByteLen);
			MCD_RELEASEBUFFER( strDoc, pUTF16Buffer, nWideLen );
		}
		delete [] pBuffer;
	}
	MCD_SPRINTF( szResult, _T("%s%d bytes to %d wide chars"), szDescBOM, nFileByteLen, nWideLen );
	if ( pstrError )
		*pstrError = szResult;
#else // read file directly
	if ( nFileByteLen )
	{
		MCD_CHAR* pUTF8Buffer = MCD_GETBUFFER(strDoc,nFileByteLen);
		bSuccess = ( fread( pUTF8Buffer, nFileByteLen, 1, fp ) == 1 );
		// After a short read release the buffer with length 0 so that no
		// unread bytes end up in strDoc
		int nKeepLen = bSuccess ? nFileByteLen : 0;
		MCD_RELEASEBUFFER( strDoc, pUTF8Buffer, nKeepLen );
#if defined(_MBCS) // needs to be in memory as MBCS
		if ( bSuccess )
		{
			// No declared encoding, or one starting with "UTF-8", means the
			// bytes are converted with UTF8ToA
			MCD_STR strEncoding = GetDeclaredEncoding( strDoc );
			if ( MCD_STRISEMPTY(strEncoding) || MCD_PSZNICMP(MCD_2PCSZ(strEncoding),_T("UTF-8"),5)==0 )
				strDoc = UTF8ToA( strDoc );
		}
#endif
	}
	MCD_SPRINTF( szResult, _T("%s%d bytes"), szDescBOM, nFileByteLen );
	if ( pstrError )
		*pstrError = szResult;
#endif

	// A failed read replaces the summary with the error text, as
	// WriteTextFile does for a failed write
	if ( ! bSuccess && pstrError )
		*pstrError = x_GetLastError();
	fclose( fp );
	if ( pnFlags )
		*pnFlags = nFlags;
	return bSuccess;
}
bool CMarkup::Save( MCD_CSTR szFileName )
{
	// Write the current document text to the named file; the status or
	// error text is stored in m_strError and the flags in m_nFlags
	const bool bWritten = WriteTextFile( szFileName, m_strDoc, &m_strError, &m_nFlags );
	return bWritten;
}
bool CMarkup::WriteTextFile( MCD_CSTR szFileName, MCD_STR& strDoc, MCD_STR* pstrError, int* pnFlags )
{
	// Static utility method to save strDoc to text file
	//
	// szFileName  path of the file, opened in binary mode for writing
	// strDoc      text to write; it is not modified here
	// pstrError   optional; receives a byte-count summary on success, or the
	//             x_GetLastError text when opening, writing or closing fails
	// pnFlags     optional; its value is read and written back unchanged here
	// Returns true only when the data was written and the file closed cleanly
	//
	// Open file to write binary
	bool bSuccess = true;
	FILE* fp = MCD_FOPEN( szFileName, _T("wb") );
	if ( ! fp )
	{
		if ( pstrError )
			*pstrError = x_GetLastError();
		return false;
	}

	// Set flags to 0 unless flags argument provided
	int nFlags = pnFlags?*pnFlags:0;
	// szDescBOM is never filled in within this function, so it contributes
	// an empty prefix to the result message
	MCD_CHAR szDescBOM[20] = {0};
	MCD_CHAR szResult[100];

	// Get document length
	int nDocLength = MCD_STRLENGTH(strDoc);
#if defined( UNICODE )
	int nMBLen = 0;
	if ( nDocLength )
	{
		/*
		// Alternative: use these 3 lines instead of 3 lines below using UTF16To8
		// For ANSI files, replace CP_UTF8 with CP_ACP in both places
		nMBLen = WideCharToMultiByte(CP_UTF8,0,strDoc,nDocLength,NULL,0,NULL,NULL);
		char* pBuffer = new char[nMBLen+1];
		WideCharToMultiByte(CP_UTF8,0,strDoc,nDocLength,pBuffer,nMBLen+1,NULL,NULL);
		*/
		// For ANSI files, replace both UTF16To8 calls with wcstombs (arguments are the same)
		// The first call passes a NULL destination to obtain the byte length
		nMBLen = UTF16To8(NULL,MCD_2PCSZ(strDoc),0);
		char* pBuffer = new char[nMBLen+1];
		UTF16To8(pBuffer,MCD_2PCSZ(strDoc),nMBLen);
		bSuccess = ( fwrite( pBuffer, nMBLen, 1, fp ) == 1 );
		delete [] pBuffer;
	}
	MCD_SPRINTF( szResult, _T("%d wide chars to %s%d bytes"), nDocLength, szDescBOM, nMBLen );
	if ( pstrError )
		*pstrError = szResult;
#else // MBCS or UTF-8
	if ( nDocLength )
	{
		MCD_STR strDocWrite = strDoc; // reference unless converted
#if defined(_MBCS) // is in memory as MBCS
		// No declared encoding, or one starting with "UTF-8", means the text
		// is converted with AToUTF8 before writing
		MCD_STR strEncoding = GetDeclaredEncoding( strDoc );
		if ( MCD_STRISEMPTY(strEncoding) || MCD_PSZNICMP(MCD_2PCSZ(strEncoding),_T("UTF-8"),5)==0 )
			strDocWrite = AToUTF8( strDoc );
#endif
		nDocLength = MCD_STRLENGTH(strDocWrite);
		bSuccess = ( fwrite( MCD_2PCSZ(strDocWrite), nDocLength, 1, fp ) == 1 );
	}
	MCD_SPRINTF( szResult, _T("%s%d bytes"), szDescBOM, nDocLength );
	if ( pstrError )
		*pstrError = szResult;
#endif

	// A failed write replaces the summary with the error text
	if ( ! bSuccess && pstrError )
		*pstrError = x_GetLastError();

	// fclose flushes whatever stdio still holds in its buffer, so a write
	// error can first show up here; treat a failed close as a failed save.
	// fclose is evaluated first so the file is always closed
	if ( fclose(fp) != 0 && bSuccess )
	{
		bSuccess = false;
		if ( pstrError )
			*pstrError = x_GetLastError();
	}
	if ( pnFlags )
		*pnFlags = nFlags;
	return bSuccess;
}
bool CMarkup::FindElem( MCD_CSTR szName )
{
	// Search with x_FindElem from the current main position under the
	// current parent; the current position moves only when a match is found
	//
	// Nothing to search when the index array is empty
	if ( ! m_aPos.GetSize() )
		return false;

	const int iFound = x_FindElem( m_iPosParent, m_iPos, szName );
	if ( ! iFound )
		return false;

	// Make the match the main position and reset the child position to 0
	x_SetPos( m_aPos[iFound].iElemParent, iFound, 0 );
	return true;
}
bool CMarkup::FindChildElem( MCD_CSTR szName )
{
	// Search with x_FindElem among the children of the main position; the
	// child position moves only when a match is found
	//
	// Shorthand: with no current main position, first try to establish
	// one via FindElem so the search runs under that element
	if ( ! m_iPos )
		FindElem();

	const int iChildFound = x_FindElem( m_iPos, m_iPosChild, szName );
	if ( ! iChildFound )
		return false;

	// Re-derive the main position from the match's parent, then make the
	// match the child position
	const int iOwner = m_aPos[iChildFound].iElemParent;
	x_SetPos( m_aPos[iOwner].iElemParent, iOwner, iChildFound );
	return true;
}
MCD_STR CMarkup::EscapeText( MCD_CSTR szText, int nFlags )
{
	// Convert text as seen outside XML document to XML friendly
	// replacing special characters with ampersand escape codes
	// E.g. convert "6>7" to "6&gt;7"
	//
	// &lt;   less than
	// &amp;  ampersand
	// &gt;   greater than
	//
	// and for attributes:
	//
	// &apos; apostrophe or single quote
	// &quot; double quote
	//
	// szText  text to escape
	// nFlags  MNF_ESCAPEQUOTES also escapes the two quote characters;
	//         MNF_WITHREFS leaves an ampersand alone when it starts an
	//         entity reference
	// Returns the escaped copy of szText
	//
	// The replacements are indexed by a character's position in pFind, so
	// their order must match "<&>'\""
	static MCD_PCSZ szaReplace[] = { _T("&lt;"),_T("&amp;"),_T("&gt;"),_T("&apos;"),_T("&quot;") };
	MCD_PCSZ pFind = (nFlags&MNF_ESCAPEQUOTES)?_T("<&>\'\""):_T("<&>");
	MCD_STR strText;
	MCD_PCSZ pSource = szText;
	// Initial reservation: source length plus 10% and a little slack
	int nDestSize = MCD_PSZLEN(pSource);
	nDestSize += nDestSize / 10 + 7;
	MCD_BLDRESERVE(strText,nDestSize);
	MCD_CHAR cSource = *pSource;
	MCD_PCSZ pFound;
	int nCharLen;
	while ( cSource )
	{
		// 6 is the length of the longest replacement, "&quot;"
		MCD_BLDCHECK(strText,nDestSize,6);
		if ( (pFound=MCD_PSZCHR(pFind,cSource)) != NULL )
		{
			bool bIgnoreAmpersand = false;
			if ( (nFlags&MNF_WITHREFS) && *pFound == _T('&') )
			{
				// Do not replace ampersand if it is start of any entity reference
				// &[#_:A-Za-zU][_:-.A-Za-z0-9U]*; where U is > 0x7f
				MCD_PCSZ pCheckEntity = pSource;
				++pCheckEntity;
				MCD_CHAR c = *pCheckEntity;
				if ( (c>=_T('A')&&c<=_T('Z')) || (c>=_T('a')&&c<=_T('z'))
						|| c==_T('#') || c==_T('_') || c==_T(':') || ((unsigned int)c)>0x7f )
				{
					while ( 1 )
					{
						pCheckEntity += MCD_CLEN( pCheckEntity );
						c = *pCheckEntity;
						if ( c == _T(';') )
						{
							// Complete reference: copy it through verbatim and
							// leave pSource on the ';' (stepped past below)
							int nEntityLen = (int)(pCheckEntity - pSource) + 1;
							MCD_BLDAPPENDN(strText,pSource,nEntityLen);
							pSource = pCheckEntity;
							bIgnoreAmpersand = true;
						}
						else if ( (c>=_T('A')&&c<=_T('Z')) || (c>=_T('a')&&c<=_T('z')) || (c>=_T('0')&&c<=_T('9'))
								|| c==_T('_') || c==_T(':') || c==_T('-') || c==_T('.') || ((unsigned int)c)>0x7f )
							continue;
						// Either the reference was copied or an invalid
						// character ended the scan
						break;
					}
				}
			}
			if ( ! bIgnoreAmpersand )
			{
				pFound = szaReplace[pFound-pFind];
				MCD_BLDAPPEND(strText,pFound);
			}
			++pSource; // ASCII, so 1 byte
		}
		else
		{
			// Ordinary character: copy it whole, MCD_CLEN gives its length
			nCharLen = MCD_CLEN( pSource );
			MCD_BLDAPPENDN(strText,pSource,nCharLen);
			pSource += nCharLen;
		}
		cSource = *pSource;
	}
	MCD_BLDRELEASE(strText);
	return strText;
}
MCD_STR CMarkup::UnescapeText( MCD_CSTR szText, int nTextLength /*=-1*/ )
{
// Convert XML friendly text to text as seen outside XML document
// ampersand escape codes replaced with special characters e.g. convert "6&gt;7" to "6>7"
// ampersand numeric codes replaced with character e.g. convert &#60; to <
// Conveniently the result is always the same or shorter in byte length
//
static MCD_PCSZ szaCode[] = { _T("lt;"),_T("amp;"),_T("gt;"),_T("apos;"),_T("quot;") };
static int anCodeLen[] = { 3,4,3,5,5 };
static MCD_PCSZ szSymbol = _T("<&>\'\"");
MCD_STR strText;
MCD_PCSZ pSource = szText;
if ( nTextLength == -1 )
nTextLength = MCD_PSZLEN(szText);
MCD_BLDRESERVE(strText,nTextLength);
int nCharLen;
int nChar = 0;
while ( nChar < nTextLength )
{
if ( pSource[nChar] == _T('&') )
{
bool bCodeConverted = false;
// Is it a numeric character reference?
if ( pSource[nChar+1] == _T('#') )
{
// Is it a hex number?
int nBase = 10;
int nNumericChar = nChar + 2;
MCD_CHAR cChar = pSource[nNumericChar];
if ( cChar == _T('x') )
{
++nNumericChar;
cChar = pSource[nNumericChar];
nBase = 16;
}
// Look for terminating semi-colon within 7 characters
int nCodeLen = 0;
while ( nCodeLen < 7 && cChar && cChar != _T(';') )
{
// only ASCII digits 0-9, A-F, a-f expected
nCodeLen += MCD_CLEN( &pSource[nNumericChar+nCodeLen] );
cChar = pSource[nNumericChar + nCodeLen];
}
// Process unicode
if ( cChar == _T(';') )
{
int nUnicode = MCD_PSZTOL( &pSource[nNumericChar], NULL, nBase );
#if defined(UNICODE)
MCD_BLDAPPEND1(strText,nUnicode);
#elif defined(_MBCS)
MCD_CHAR szANSI[2];
int nMBLen = wctomb( szANSI, (wchar_t)nUnicode );
if ( nMBLen > 0 )
{
MCD_BLDAPPENDN(strText,szANSI,nMBLen);
}
else
nUnicode = 0;
#else
if ( nUnicode < 0x80 )
MCD_BLDAPPEND1(strText,nUnicode);
else if ( nUnicode < 0x800 )
{
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -