// htmlparserx.cpp
// (The "font size" label that followed the file name was code-viewer page residue, not source.)
#include "StdAfx.h"
#include ".\htmlparserx.h"
#include "..\xlibplus\xextractstring.h"
// Out-of-class definitions of the static object pools that are members of
// CHtmlTag and CTagProperty.
// NOTE(review): presumably these pools back the New()/Delete() calls used
// throughout this file - confirm against the class declarations in the header.
xUnlimitedObjectPool<CHtmlParserX::CHtmlTag> CHtmlParserX::CHtmlTag::m_xHtmlTagPool;
xUnlimitedObjectPool<CHtmlParserX::CTagProperty> CHtmlParserX::CTagProperty::m_xTagPropertyPool;
// Normalises an HTML buffer in place before tag parsing.
//
//  - Everything from "<!--" up to and including the next "-->" is removed
//    (an unterminated comment swallows the rest of the buffer).
//  - Outside of a tag, every space, tab, CR and LF is removed.
//  - Inside a tag, whitespace that directly follows a blank, a tab, '<' or '/'
//    in the output is removed; a surviving CR/LF is written as a single space,
//    a surviving space/tab is kept as is.
//  - A '<' seen while already inside a tag, or a '>' seen outside of one, is
//    reported on stdout and copied through unchanged.
//
// pszContent must be a writable, NUL-terminated string; the result is never
// longer than the input and is NUL-terminated again.
VOID PreProcessHTMLCode( char * pszContent )
{
    char * pRead  = pszContent;   // next input character
    char * pWrite = pszContent;   // next output slot, never ahead of pRead
    bool bInsideTag     = false;
    bool bInsideComment = false;

    for( ; *pRead != '\0'; ++pRead )
    {
        const char ch = *pRead;

        if( bInsideComment )
        {
            if( ch == '-' && strncmp( pRead, "-->", 3 ) == 0 )
            {
                bInsideComment = false;
                pRead += 2;       // the loop increment steps over the '>'
            }
            continue;
        }

        switch( ch )
        {
        case '<':
            if( strncmp( pRead + 1, "!--", 3 ) == 0 )
            {
                bInsideComment = true;
                continue;         // the '<' itself is not emitted
            }
            if( bInsideTag )
                printf( "Met tag two times\n" );
            else
                bInsideTag = true;
            break;

        case '>':
            if( bInsideTag )
                bInsideTag = false;
            else
                printf( "Met tag out two times\n" );
            break;

        case ' ':
        case '\t':
        case '\r':
        case '\n':
        {
            bool bDrop = !bInsideTag;
            if( !bDrop )
            {
                // Only look at the previous output character while inside a
                // tag: at that point the opening '<' has already been written,
                // so pWrite - 1 is a valid position.
                const char chPrev = *( pWrite - 1 );
                bDrop = ( chPrev == '\t' || chPrev == ' ' || chPrev == '<' || chPrev == '/' );
            }
            if( bDrop )
                continue;
            if( ch == '\r' || ch == '\n' )
            {
                *pWrite++ = ' ';  // a line break inside a tag becomes one blank
                continue;
            }
            break;                // a surviving space/tab is copied verbatim
        }

        default:
            break;
        }

        *pWrite++ = ch;
    }
    *pWrite = 0;
}
// Dumps this tag to stdout: its name, then every property as "key = value".
// A tag without child tags additionally prints its text content; a tag with
// child tags instead dumps each child recursively, in stored order.
VOID CHtmlParserX::CHtmlTag::Show()
{
    printf( "TAG name: %s\n", m_sName.c_str() );

    for( size_t iProp = 0; iProp < m_vPropertys.size(); ++iProp )
    {
        printf( "PROP: %s = %s\n",
                m_vPropertys[iProp]->m_sKey.c_str(),
                m_vPropertys[iProp]->m_sValue.c_str() );
    }

    // Leaf tag: show its text and stop.
    if( m_vChildTags.size() == 0 )
    {
        const std::string sText = GetContent();
        printf( "Content: %s\n", sText.c_str() );
        return;
    }

    for( size_t iChild = 0; iChild < m_vChildTags.size(); ++iChild )
    {
        m_vChildTags[iChild]->Show();
    }
}
// Returns the text recorded for this tag, i.e. the slice
// [m_iContentStart, m_iContentEnd) of the owning parser's m_sContent.
//
// An empty string is returned when the tag has no parser, when either offset
// is negative or lies beyond the end of m_sContent, or when the end offset
// precedes the start offset.
std::string CHtmlParserX::CHtmlTag::GetContent()
{
    if( this->m_pParser == NULL )
        return "";

    if( this->m_iContentStart < 0 || this->m_iContentStart > m_pParser->m_sContent.size() ||
        this->m_iContentEnd < 0 || this->m_iContentEnd > m_pParser->m_sContent.size() ||
        this->m_iContentEnd < this->m_iContentStart )
        return "";

    // substr() accepts a start position equal to size(), so a range that ends
    // (or is empty) at the very end of the buffer is extracted without taking
    // the address of m_sContent[size()], which is not valid for a non-const
    // string before C++11.
    return this->m_pParser->m_sContent.substr( this->m_iContentStart,
                                               this->m_iContentEnd - this->m_iContentStart );
}
// Parses one tag, its properties, its text and (recursively) its child tags
// from a writable, NUL-terminated HTML buffer. The buffer is modified in
// place: delimiters are overwritten with NUL while names and properties are
// extracted.
//
// pParser    - owning parser; dereferenced unconditionally, so it must not be
//              NULL. Text found between tags is appended to its m_sContent.
// pszContent - in: must point at the '<' that opens this tag.
//              out (on TRUE): just past the '>' of the matching closing tag
//              when m_bClosed is true, otherwise at the '<' of the
//              non-matching closing tag that stopped the scan.
//              On FALSE it may already have been advanced.
//
// Returns FALSE when the input does not start with '<', when a required '<',
// '>' or closing-tag delimiter cannot be found, or when a child tag cannot be
// allocated or parsed; TRUE otherwise.
BOOL CHtmlParserX::CHtmlTag::Parse( CHtmlParserX * pParser, char * &pszContent )
{
this->Clear();
xCharSet csTagLeft( "<" );
xCharSet csTagRight( ">" );
// csTagL is only referenced by the disabled single-tag detection below.
xCharSet csTagL( "/" );
m_pParser = pParser;
// This tag's text is appended to the parser's shared m_sContent; remember
// the offset at which it begins.
this->m_iContentStart = this->m_pParser->m_sContent.size();
if( *pszContent != '<' )return FALSE;
// Never set to true while the detection code below stays commented out.
bool bSingleTag = false;
// NOTE(review): xstrchr presumably returns a pointer to the first character
// that belongs to the given set, or NULL if there is none - confirm.
char * pTagRight = xstrchr( pszContent,csTagRight );
if( pTagRight == NULL )return FALSE;
// Cut the "<name props" text off at the '>' and step past it.
*pTagRight++ = 0;
//char * pL = xstrchr( pszContent, csTagL ); // check whether this is a standalone tag <tagname proplist /> that has no </tagname>
//if( pL )
//{
// *pL = 0;
// bSingleTag = true;
//}
// Skip the '<'. NOTE(review): TrimEx presumably strips surrounding whitespace
// and CharSetWhite is presumably the whitespace set - both are defined elsewhere.
pszContent = TrimEx( pszContent + 1 );
char * pTagProp = xstrchr( pszContent, CharSetWhite );
m_bClosed = false;
if( pTagProp == NULL )
{
// No separator after the name: the tag has no properties.
this->m_sName = pszContent;
}
else
{
// Extract the tag name.
*pTagProp++ = 0;
this->m_sName = pszContent;
xCharSet csEqu( "=" );
// Parse the tag's properties, one "key=value" pair per iteration.
while( pTagProp )
{
char * p = xstrchr( pTagProp, csEqu );
if( p )
{
// Split key and value at the '='.
*p ++ = 0;
// The value runs up to the next CharSetWhite character, if there is one.
char * p1 = xstrchr( p, CharSetWhite );
if( p1 != NULL )
{
*p1++ = 0;
}
// NOTE(review): unlike CHtmlTag::New() further down, this result is used
// without a NULL check.
CHtmlParserX::CTagProperty * ptagprop = CHtmlParserX::CTagProperty::New();
ptagprop->m_sKey = TrimEx( pTagProp );
ptagprop->m_sValue = TrimEx( p );
this->m_vPropertys.push_back( ptagprop );
// p1 is NULL after the last property, which ends the loop.
pTagProp = p1;
}
else
break;
}
// Not taken as the code stands, because bSingleTag is always false here.
if( bSingleTag )
{
this->m_bClosed = false;
pszContent = pTagRight;
return TRUE;
}
}
// Scan what follows the opening tag: text, child tags and the closing tag.
char * pNextTag = pTagRight;
while( TRUE )
{
pTagRight = pNextTag;
pNextTag = xstrchr( pNextTag, csTagLeft );
if( pNextTag == NULL )
return FALSE;
// Store the text that precedes the next '<' in the parser's content buffer.
this->m_pParser->m_sContent.append( pTagRight, pNextTag );
if( *(pNextTag+1) == '/' )
{
xCharSet csTgEnd( " \t>" );
char * pTgEnd = xstrchr( pNextTag, csTgEnd ); // find where the closing tag's name ends
if( pTgEnd == NULL )return FALSE;
// Temporarily terminate the name so it can be compared, then restore the character.
char c = *pTgEnd;
*pTgEnd = 0;
m_bClosed = (stricmp( m_sName.c_str(), pNextTag+2 ) == 0 ); // case-insensitive check: does it close this tag?
*pTgEnd = c;
// If this is our own </tagname>, this tag is finished.
if( m_bClosed )
{
pNextTag = xstrchr( pNextTag, csTagRight );
if( pNextTag == NULL )
return FALSE;
// Resume after the '>' of the closing tag.
pszContent = pNextTag +1;
break;
}
else
{
// A closing tag for some other tag: stop here and leave it for the caller.
pszContent = pNextTag;
this->m_bClosed = false;
break;
}
}
// Handle one child tag; the recursive call advances pNextTag past what it consumed.
CHtmlParserX::CHtmlTag * pTag = CHtmlParserX::CHtmlTag::New();
if( pTag )
{
if( !pTag->Parse( pParser, pNextTag ) )
{
CHtmlParserX::CHtmlTag::Delete( pTag );
return FALSE;
}
this->m_vChildTags.push_back( pTag );
if( !pTag->m_bClosed ) // not closed: all of its child tags are moved into the current tag and its content range shrinks to zero
{
// A tag that was never closed is treated as a single-line tag,
// which can have neither content nor child tags.
pTag->m_iContentStart = pTag->m_iContentEnd;
for( int i = 0;i < pTag->m_vChildTags.size();i ++ )
{
m_vChildTags.push_back( pTag->m_vChildTags[i] );
}
pTag->m_vChildTags.clear();
}
}
else
return FALSE;
}
// Everything appended since m_iContentStart lies inside this tag.
this->m_iContentEnd = this->m_pParser->m_sContent.size();
return TRUE;
}
// Constructs an empty parser.
// The root-tag pointer is set to NULL so that it has a well-defined value
// before the first call to Parse(); it is otherwise only assigned there.
CHtmlParserX::CHtmlParserX(void)
{
    m_pHtmlTags = NULL;
}
// Destructor. Performs no explicit cleanup of its own.
CHtmlParserX::~CHtmlParserX()
{
}
// Parses an HTML document into a tag tree rooted at m_pHtmlTags.
//
// sHtmlContent - the document text. It is modified in place: it is first
//                normalised by PreProcessHTMLCode() and then cut up by the tag
//                parser (delimiters are overwritten with NUL), so its contents
//                are not meaningful to the caller afterwards.
//
// Returns TRUE when the first tag in the document and everything nested in it
// was parsed. Returns FALSE when the text contains no '<', when the root tag
// cannot be allocated, or when tag parsing fails; in the latter two cases
// m_pHtmlTags is NULL on return.
//
// NOTE(review): a tree left in m_pHtmlTags by an earlier successful call is
// overwritten here without being released - confirm who owns it.
BOOL CHtmlParserX::Parse( std::string & sHtmlContent )
{
    this->m_sContent.clear();

    // Nothing to parse; also keeps the &sHtmlContent[0] below well-defined.
    if( sHtmlContent.empty() )
        return FALSE;

    // Work on the string's own mutable buffer. Writing through a pointer
    // obtained by casting away const from c_str() is undefined behaviour.
    char * pHtmlContent = &sHtmlContent[0];
    PreProcessHTMLCode( pHtmlContent ); // pre-process the HTML

    char * p = strchr( pHtmlContent, '<' ); // skip anything before the first tag
    if( p == NULL )
    {
        return FALSE;
    }

    this->m_pHtmlTags = CHtmlTag::New();
    // CHtmlTag::Parse checks New() for NULL as well; do the same here.
    if( m_pHtmlTags == NULL )
    {
        return FALSE;
    }

    if( !m_pHtmlTags->Parse( this, p ) )
    {
        CHtmlTag::Delete( m_pHtmlTags );
        m_pHtmlTags = NULL; // do not keep a pointer to the released tag
        return FALSE;
    }
    return TRUE;
}
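// ---------------------------------------------------------------------------
// Code-viewer page residue (not part of the original source file):
// Keyboard shortcuts
//   Copy code            Ctrl + C
//   Search code          Ctrl + F
//   Full-screen mode     F11
//   Toggle theme         Ctrl + Shift + D
//   Show shortcuts       ?
//   Increase font size   Ctrl + =
//   Decrease font size   Ctrl + -
// ---------------------------------------------------------------------------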