htmlpars.cpp
来自「A*算法 A*算法 A*算法 A*算法A*算法A*算法」· C++ 代码 · 共 946 行 · 第 1/2 页
CPP
946 行
/////////////////////////////////////////////////////////////////////////////
// Name: htmlpars.cpp
// Purpose: wxHtmlParser class (generic parser)
// Author: Vaclav Slavik
// RCS-ID: $Id: htmlpars.cpp,v 1.48.2.2 2006/03/09 14:16:49 VZ Exp $
// Copyright: (c) 1999 Vaclav Slavik
// Licence: wxWindows licence
/////////////////////////////////////////////////////////////////////////////
#if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
#pragma implementation "htmlpars.h"
#endif
#include "wx/wxprec.h"
#include "wx/defs.h"
#if wxUSE_HTML && wxUSE_STREAMS
#ifdef __BORLANDC__
#pragma hdrstop
#endif
#ifndef WXPRECOMP
#include "wx/log.h"
#include "wx/intl.h"
#endif
#include "wx/tokenzr.h"
#include "wx/wfstream.h"
#include "wx/url.h"
#include "wx/fontmap.h"
#include "wx/html/htmldefs.h"
#include "wx/html/htmlpars.h"
#include "wx/dynarray.h"
#include "wx/arrimpl.cpp"
#ifdef __WXWINCE__
#include "wx/msw/wince/missing.h" // for bsearch()
#endif
// DLL options compatibility check:
#include "wx/app.h"
WX_CHECK_BUILD_OPTIONS("wxHTML")
const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");
//-----------------------------------------------------------------------------
// wxHtmlParser helpers
//-----------------------------------------------------------------------------
class wxHtmlTextPiece
{
public:
wxHtmlTextPiece(int pos, int lng) : m_pos(pos), m_lng(lng) {}
int m_pos, m_lng;
};
WX_DECLARE_OBJARRAY(wxHtmlTextPiece, wxHtmlTextPieces);
WX_DEFINE_OBJARRAY(wxHtmlTextPieces);
class wxHtmlParserState
{
public:
wxHtmlTag *m_curTag;
wxHtmlTag *m_tags;
wxHtmlTextPieces *m_textPieces;
int m_curTextPiece;
wxString m_source;
wxHtmlParserState *m_nextState;
};
//-----------------------------------------------------------------------------
// wxHtmlParser
//-----------------------------------------------------------------------------
IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)
wxHtmlParser::wxHtmlParser()
: wxObject(), m_HandlersHash(wxKEY_STRING),
m_FS(NULL), m_HandlersStack(NULL)
{
m_entitiesParser = new wxHtmlEntitiesParser;
m_Tags = NULL;
m_CurTag = NULL;
m_TextPieces = NULL;
m_CurTextPiece = 0;
m_SavedStates = NULL;
}
wxHtmlParser::~wxHtmlParser()
{
while (RestoreState()) {}
DestroyDOMTree();
if (m_HandlersStack)
{
wxList& tmp = *m_HandlersStack;
wxList::iterator it, en;
for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
delete (wxHashTable*)*it;
tmp.clear();
}
delete m_HandlersStack;
m_HandlersHash.Clear();
WX_CLEAR_LIST(wxList, m_HandlersList);
delete m_entitiesParser;
}
wxObject* wxHtmlParser::Parse(const wxString& source)
{
InitParser(source);
DoParsing();
wxObject *result = GetProduct();
DoneParser();
return result;
}
void wxHtmlParser::InitParser(const wxString& source)
{
SetSource(source);
m_stopParsing = false;
}
void wxHtmlParser::DoneParser()
{
DestroyDOMTree();
}
void wxHtmlParser::SetSource(const wxString& src)
{
DestroyDOMTree();
m_Source = src;
CreateDOMTree();
m_CurTag = NULL;
m_CurTextPiece = 0;
}
void wxHtmlParser::CreateDOMTree()
{
wxHtmlTagsCache cache(m_Source);
m_TextPieces = new wxHtmlTextPieces;
CreateDOMSubTree(NULL, 0, m_Source.Length(), &cache);
m_CurTextPiece = 0;
}
extern bool wxIsCDATAElement(const wxChar *tag);
void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
int begin_pos, int end_pos,
wxHtmlTagsCache *cache)
{
if (end_pos <= begin_pos) return;
wxChar c;
int i = begin_pos;
int textBeginning = begin_pos;
// If the tag contains CDATA text, we include the text between beginning
// and ending tag verbosely. Setting i=end_pos will skip to the very
// end of this function where text piece is added, bypassing any child
// tags parsing (CDATA element can't have child elements by definition):
if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str()))
{
i = end_pos;
}
while (i < end_pos)
{
c = m_Source.GetChar(i);
if (c == wxT('<'))
{
// add text to m_TextPieces:
if (i - textBeginning > 0)
m_TextPieces->Add(
wxHtmlTextPiece(textBeginning, i - textBeginning));
// if it is a comment, skip it:
if (i < end_pos-6 && m_Source.GetChar(i+1) == wxT('!') &&
m_Source.GetChar(i+2) == wxT('-') &&
m_Source.GetChar(i+3) == wxT('-'))
{
// Comments begin with "<!--" and end with "--[ \t\r\n]*>"
// according to HTML 4.0
int dashes = 0;
i += 4;
while (i < end_pos)
{
c = m_Source.GetChar(i++);
if ((c == wxT(' ') || c == wxT('\n') ||
c == wxT('\r') || c == wxT('\t')) && dashes >= 2) {}
else if (c == wxT('>') && dashes >= 2)
{
textBeginning = i;
break;
}
else if (c == wxT('-'))
dashes++;
else
dashes = 0;
}
}
// add another tag to the tree:
else if (i < end_pos-1 && m_Source.GetChar(i+1) != wxT('/'))
{
wxHtmlTag *chd;
if (cur)
chd = new wxHtmlTag(cur, m_Source,
i, end_pos, cache, m_entitiesParser);
else
{
chd = new wxHtmlTag(NULL, m_Source,
i, end_pos, cache, m_entitiesParser);
if (!m_Tags)
{
// if this is the first tag to be created make the root
// m_Tags point to it:
m_Tags = chd;
}
else
{
// if there is already a root tag add this tag as
// the last sibling:
chd->m_Prev = m_Tags->GetLastSibling();
chd->m_Prev->m_Next = chd;
}
}
if (chd->HasEnding())
{
CreateDOMSubTree(chd,
chd->GetBeginPos(), chd->GetEndPos1(),
cache);
i = chd->GetEndPos2();
}
else
i = chd->GetBeginPos();
textBeginning = i;
}
// ... or skip ending tag:
else
{
while (i < end_pos && m_Source.GetChar(i) != wxT('>')) i++;
textBeginning = i+1;
}
}
else i++;
}
// add remaining text to m_TextPieces:
if (end_pos - textBeginning > 0)
m_TextPieces->Add(
wxHtmlTextPiece(textBeginning, end_pos - textBeginning));
}
void wxHtmlParser::DestroyDOMTree()
{
wxHtmlTag *t1, *t2;
t1 = m_Tags;
while (t1)
{
t2 = t1->GetNextSibling();
delete t1;
t1 = t2;
}
m_Tags = m_CurTag = NULL;
delete m_TextPieces;
m_TextPieces = NULL;
}
void wxHtmlParser::DoParsing()
{
m_CurTag = m_Tags;
m_CurTextPiece = 0;
DoParsing(0, m_Source.Length());
}
void wxHtmlParser::DoParsing(int begin_pos, int end_pos)
{
if (end_pos <= begin_pos) return;
wxHtmlTextPieces& pieces = *m_TextPieces;
size_t piecesCnt = pieces.GetCount();
while (begin_pos < end_pos)
{
while (m_CurTag && m_CurTag->GetBeginPos() < begin_pos)
m_CurTag = m_CurTag->GetNextTag();
while (m_CurTextPiece < piecesCnt &&
pieces[m_CurTextPiece].m_pos < begin_pos)
m_CurTextPiece++;
if (m_CurTextPiece < piecesCnt &&
(!m_CurTag ||
pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos()))
{
// Add text:
AddText(GetEntitiesParser()->Parse(
m_Source.Mid(pieces[m_CurTextPiece].m_pos,
pieces[m_CurTextPiece].m_lng)));
begin_pos = pieces[m_CurTextPiece].m_pos +
pieces[m_CurTextPiece].m_lng;
m_CurTextPiece++;
}
else if (m_CurTag)
{
if (m_CurTag->HasEnding())
begin_pos = m_CurTag->GetEndPos2();
else
begin_pos = m_CurTag->GetBeginPos();
wxHtmlTag *t = m_CurTag;
m_CurTag = m_CurTag->GetNextTag();
AddTag(*t);
if (m_stopParsing)
return;
}
else break;
}
}
void wxHtmlParser::AddTag(const wxHtmlTag& tag)
{
wxHtmlTagHandler *h;
bool inner = false;
h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
if (h)
{
inner = h->HandleTag(tag);
if (m_stopParsing)
return;
}
if (!inner)
{
if (tag.HasEnding())
DoParsing(tag.GetBeginPos(), tag.GetEndPos1());
}
}
void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
{
wxString s(handler->GetSupportedTags());
wxStringTokenizer tokenizer(s, wxT(", "));
while (tokenizer.HasMoreTokens())
m_HandlersHash.Put(tokenizer.GetNextToken(), handler);
if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
m_HandlersList.Append(handler);
handler->SetParser(this);
}
void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, wxString tags)
{
wxStringTokenizer tokenizer(tags, wxT(", "));
wxString key;
if (m_HandlersStack == NULL)
{
m_HandlersStack = new wxList;
}
m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));
while (tokenizer.HasMoreTokens())
{
key = tokenizer.GetNextToken();
m_HandlersHash.Delete(key);
m_HandlersHash.Put(key, handler);
}
}
void wxHtmlParser::PopTagHandler()
{
wxList::compatibility_iterator first;
if ( !m_HandlersStack ||
#if wxUSE_STL
!(first = m_HandlersStack->GetFirst())
#else // !wxUSE_STL
((first = m_HandlersStack->GetFirst()) == NULL)
#endif // wxUSE_STL/!wxUSE_STL
)
{
wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
return;
}
m_HandlersHash = *((wxHashTable*) first->GetData());
delete (wxHashTable*) first->GetData();
m_HandlersStack->Erase(first);
}
void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
{
wxHtmlParserState *s = new wxHtmlParserState;
s->m_curTag = m_CurTag;
s->m_tags = m_Tags;
s->m_textPieces = m_TextPieces;
s->m_curTextPiece = m_CurTextPiece;
s->m_source = m_Source;
s->m_nextState = m_SavedStates;
m_SavedStates = s;
m_CurTag = NULL;
m_Tags = NULL;
m_TextPieces = NULL;
m_CurTextPiece = 0;
m_Source = wxEmptyString;
SetSource(src);
}
bool wxHtmlParser::RestoreState()
{
if (!m_SavedStates) return false;
DestroyDOMTree();
wxHtmlParserState *s = m_SavedStates;
m_SavedStates = s->m_nextState;
m_CurTag = s->m_curTag;
m_Tags = s->m_tags;
m_TextPieces = s->m_textPieces;
m_CurTextPiece = s->m_curTextPiece;
m_Source = s->m_source;
delete s;
return true;
}
//-----------------------------------------------------------------------------
// wxHtmlTagHandler
//-----------------------------------------------------------------------------
IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)
//-----------------------------------------------------------------------------
// wxHtmlEntitiesParser
//-----------------------------------------------------------------------------
IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)
wxHtmlEntitiesParser::wxHtmlEntitiesParser()
#if wxUSE_WCHAR_T && !wxUSE_UNICODE
: m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
#endif
{
}
wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
{
#if wxUSE_WCHAR_T && !wxUSE_UNICODE
delete m_conv;
#endif
}
void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
{
#if wxUSE_WCHAR_T && !wxUSE_UNICODE
if (encoding == m_encoding)
return;
delete m_conv;
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?