htmlpars.cpp

来自「A*算法 A*算法 A*算法 A*算法A*算法A*算法」· C++ 代码 · 共 946 行 · 第 1/2 页

CPP
946
字号
/////////////////////////////////////////////////////////////////////////////
// Name:        htmlpars.cpp
// Purpose:     wxHtmlParser class (generic parser)
// Author:      Vaclav Slavik
// RCS-ID:      $Id: htmlpars.cpp,v 1.48.2.2 2006/03/09 14:16:49 VZ Exp $
// Copyright:   (c) 1999 Vaclav Slavik
// Licence:     wxWindows licence
/////////////////////////////////////////////////////////////////////////////


#if defined(__GNUG__) && !defined(NO_GCC_PRAGMA)
#pragma implementation "htmlpars.h"
#endif

#include "wx/wxprec.h"

#include "wx/defs.h"
#if wxUSE_HTML && wxUSE_STREAMS

#ifdef __BORLANDC__
#pragma hdrstop
#endif

#ifndef WXPRECOMP
    #include "wx/log.h"
    #include "wx/intl.h"
#endif

#include "wx/tokenzr.h"
#include "wx/wfstream.h"
#include "wx/url.h"
#include "wx/fontmap.h"
#include "wx/html/htmldefs.h"
#include "wx/html/htmlpars.h"
#include "wx/dynarray.h"
#include "wx/arrimpl.cpp"

#ifdef __WXWINCE__
    #include "wx/msw/wince/missing.h"       // for bsearch()
#endif

// DLL options compatibility check:
#include "wx/app.h"
WX_CHECK_BUILD_OPTIONS("wxHTML")

const wxChar *wxTRACE_HTML_DEBUG = _T("htmldebug");

//-----------------------------------------------------------------------------
// wxHtmlParser helpers
//-----------------------------------------------------------------------------

class wxHtmlTextPiece
{
public:
    wxHtmlTextPiece(int pos, int lng) : m_pos(pos), m_lng(lng) {}
    int m_pos, m_lng;
};

WX_DECLARE_OBJARRAY(wxHtmlTextPiece, wxHtmlTextPieces);
WX_DEFINE_OBJARRAY(wxHtmlTextPieces);

class wxHtmlParserState
{
public:
    wxHtmlTag         *m_curTag;
    wxHtmlTag         *m_tags;
    wxHtmlTextPieces  *m_textPieces;
    int                m_curTextPiece;
    wxString           m_source;
    wxHtmlParserState *m_nextState;
};

//-----------------------------------------------------------------------------
// wxHtmlParser
//-----------------------------------------------------------------------------

IMPLEMENT_ABSTRACT_CLASS(wxHtmlParser,wxObject)

wxHtmlParser::wxHtmlParser()
    : wxObject(), m_HandlersHash(wxKEY_STRING),
      m_FS(NULL), m_HandlersStack(NULL)
{
    m_entitiesParser = new wxHtmlEntitiesParser;
    m_Tags = NULL;
    m_CurTag = NULL;
    m_TextPieces = NULL;
    m_CurTextPiece = 0;
    m_SavedStates = NULL;
}

wxHtmlParser::~wxHtmlParser()
{
    while (RestoreState()) {}
    DestroyDOMTree();

    if (m_HandlersStack)
    {
        wxList& tmp = *m_HandlersStack;
        wxList::iterator it, en;
        for( it = tmp.begin(), en = tmp.end(); it != en; ++it )
            delete (wxHashTable*)*it;
        tmp.clear();
    }
    delete m_HandlersStack;
    m_HandlersHash.Clear();
    WX_CLEAR_LIST(wxList, m_HandlersList);
    delete m_entitiesParser;
}

wxObject* wxHtmlParser::Parse(const wxString& source)
{
    InitParser(source);
    DoParsing();
    wxObject *result = GetProduct();
    DoneParser();
    return result;
}

void wxHtmlParser::InitParser(const wxString& source)
{
    SetSource(source);
    m_stopParsing = false;
}

void wxHtmlParser::DoneParser()
{
    DestroyDOMTree();
}

void wxHtmlParser::SetSource(const wxString& src)
{
    DestroyDOMTree();
    m_Source = src;
    CreateDOMTree();
    m_CurTag = NULL;
    m_CurTextPiece = 0;
}

void wxHtmlParser::CreateDOMTree()
{
    wxHtmlTagsCache cache(m_Source);
    m_TextPieces = new wxHtmlTextPieces;
    CreateDOMSubTree(NULL, 0, m_Source.Length(), &cache);
    m_CurTextPiece = 0;
}

extern bool wxIsCDATAElement(const wxChar *tag);

void wxHtmlParser::CreateDOMSubTree(wxHtmlTag *cur,
                                    int begin_pos, int end_pos,
                                    wxHtmlTagsCache *cache)
{
    if (end_pos <= begin_pos) return;

    wxChar c;
    int i = begin_pos;
    int textBeginning = begin_pos;

    // If the tag contains CDATA text, we include the text between beginning
    // and ending tag verbosely. Setting i=end_pos will skip to the very
    // end of this function where text piece is added, bypassing any child
    // tags parsing (CDATA element can't have child elements by definition):
    if (cur != NULL && wxIsCDATAElement(cur->GetName().c_str()))
    {
        i = end_pos;
    }

    while (i < end_pos)
    {
        c = m_Source.GetChar(i);

        if (c == wxT('<'))
        {
            // add text to m_TextPieces:
            if (i - textBeginning > 0)
                m_TextPieces->Add(
                    wxHtmlTextPiece(textBeginning, i - textBeginning));

            // if it is a comment, skip it:
            if (i < end_pos-6 && m_Source.GetChar(i+1) == wxT('!') &&
                                 m_Source.GetChar(i+2) == wxT('-') &&
                                 m_Source.GetChar(i+3) == wxT('-'))
            {
                // Comments begin with "<!--" and end with "--[ \t\r\n]*>"
                // according to HTML 4.0
                int dashes = 0;
                i += 4;
                while (i < end_pos)
                {
                    c = m_Source.GetChar(i++);
                    if ((c == wxT(' ') || c == wxT('\n') ||
                        c == wxT('\r') || c == wxT('\t')) && dashes >= 2) {}
                    else if (c == wxT('>') && dashes >= 2)
                    {
                        textBeginning = i;
                        break;
                    }
                    else if (c == wxT('-'))
                        dashes++;
                    else
                        dashes = 0;
                }
            }

            // add another tag to the tree:
            else if (i < end_pos-1 && m_Source.GetChar(i+1) != wxT('/'))
            {
                wxHtmlTag *chd;
                if (cur)
                    chd = new wxHtmlTag(cur, m_Source,
                                        i, end_pos, cache, m_entitiesParser);
                else
                {
                    chd = new wxHtmlTag(NULL, m_Source,
                                        i, end_pos, cache, m_entitiesParser);
                    if (!m_Tags)
                    {
                        // if this is the first tag to be created make the root
                        // m_Tags point to it:
                        m_Tags = chd;
                    }
                    else
                    {
                        // if there is already a root tag add this tag as
                        // the last sibling:
                        chd->m_Prev = m_Tags->GetLastSibling();
                        chd->m_Prev->m_Next = chd;
                    }
                }

                if (chd->HasEnding())
                {
                    CreateDOMSubTree(chd,
                                     chd->GetBeginPos(), chd->GetEndPos1(),
                                     cache);
                    i = chd->GetEndPos2();
                }
                else
                    i = chd->GetBeginPos();

                textBeginning = i;
            }

            // ... or skip ending tag:
            else
            {
                while (i < end_pos && m_Source.GetChar(i) != wxT('>')) i++;
                textBeginning = i+1;
            }
        }
        else i++;
    }

    // add remaining text to m_TextPieces:
    if (end_pos - textBeginning > 0)
        m_TextPieces->Add(
            wxHtmlTextPiece(textBeginning, end_pos - textBeginning));
}

void wxHtmlParser::DestroyDOMTree()
{
    wxHtmlTag *t1, *t2;
    t1 = m_Tags;
    while (t1)
    {
        t2 = t1->GetNextSibling();
        delete t1;
        t1 = t2;
    }
    m_Tags = m_CurTag = NULL;

    delete m_TextPieces;
    m_TextPieces = NULL;
}

void wxHtmlParser::DoParsing()
{
    m_CurTag = m_Tags;
    m_CurTextPiece = 0;
    DoParsing(0, m_Source.Length());
}

void wxHtmlParser::DoParsing(int begin_pos, int end_pos)
{
    if (end_pos <= begin_pos) return;

    wxHtmlTextPieces& pieces = *m_TextPieces;
    size_t piecesCnt = pieces.GetCount();

    while (begin_pos < end_pos)
    {
        while (m_CurTag && m_CurTag->GetBeginPos() < begin_pos)
            m_CurTag = m_CurTag->GetNextTag();
        while (m_CurTextPiece < piecesCnt &&
               pieces[m_CurTextPiece].m_pos < begin_pos)
            m_CurTextPiece++;

        if (m_CurTextPiece < piecesCnt &&
            (!m_CurTag ||
             pieces[m_CurTextPiece].m_pos < m_CurTag->GetBeginPos()))
        {
            // Add text:
            AddText(GetEntitiesParser()->Parse(
                       m_Source.Mid(pieces[m_CurTextPiece].m_pos,
                                    pieces[m_CurTextPiece].m_lng)));
            begin_pos = pieces[m_CurTextPiece].m_pos +
                        pieces[m_CurTextPiece].m_lng;
            m_CurTextPiece++;
        }
        else if (m_CurTag)
        {
            if (m_CurTag->HasEnding())
                begin_pos = m_CurTag->GetEndPos2();
            else
                begin_pos = m_CurTag->GetBeginPos();
            wxHtmlTag *t = m_CurTag;
            m_CurTag = m_CurTag->GetNextTag();
            AddTag(*t);
            if (m_stopParsing)
                return;
        }
        else break;
    }
}

void wxHtmlParser::AddTag(const wxHtmlTag& tag)
{
    wxHtmlTagHandler *h;
    bool inner = false;

    h = (wxHtmlTagHandler*) m_HandlersHash.Get(tag.GetName());
    if (h)
    {
        inner = h->HandleTag(tag);
        if (m_stopParsing)
            return;
    }
    if (!inner)
    {
        if (tag.HasEnding())
            DoParsing(tag.GetBeginPos(), tag.GetEndPos1());
    }
}

void wxHtmlParser::AddTagHandler(wxHtmlTagHandler *handler)
{
    wxString s(handler->GetSupportedTags());
    wxStringTokenizer tokenizer(s, wxT(", "));

    while (tokenizer.HasMoreTokens())
        m_HandlersHash.Put(tokenizer.GetNextToken(), handler);

    if (m_HandlersList.IndexOf(handler) == wxNOT_FOUND)
        m_HandlersList.Append(handler);

    handler->SetParser(this);
}

void wxHtmlParser::PushTagHandler(wxHtmlTagHandler *handler, wxString tags)
{
    wxStringTokenizer tokenizer(tags, wxT(", "));
    wxString key;

    if (m_HandlersStack == NULL)
    {
        m_HandlersStack = new wxList;
    }

    m_HandlersStack->Insert((wxObject*)new wxHashTable(m_HandlersHash));

    while (tokenizer.HasMoreTokens())
    {
        key = tokenizer.GetNextToken();
        m_HandlersHash.Delete(key);
        m_HandlersHash.Put(key, handler);
    }
}

void wxHtmlParser::PopTagHandler()
{
    wxList::compatibility_iterator first;

    if ( !m_HandlersStack ||
#if wxUSE_STL
         !(first = m_HandlersStack->GetFirst())
#else // !wxUSE_STL
         ((first = m_HandlersStack->GetFirst()) == NULL)
#endif // wxUSE_STL/!wxUSE_STL
        )
    {
        wxLogWarning(_("Warning: attempt to remove HTML tag handler from empty stack."));
        return;
    }
    m_HandlersHash = *((wxHashTable*) first->GetData());
    delete (wxHashTable*) first->GetData();
    m_HandlersStack->Erase(first);
}

void wxHtmlParser::SetSourceAndSaveState(const wxString& src)
{
    wxHtmlParserState *s = new wxHtmlParserState;

    s->m_curTag = m_CurTag;
    s->m_tags = m_Tags;
    s->m_textPieces = m_TextPieces;
    s->m_curTextPiece = m_CurTextPiece;
    s->m_source = m_Source;

    s->m_nextState = m_SavedStates;
    m_SavedStates = s;

    m_CurTag = NULL;
    m_Tags = NULL;
    m_TextPieces = NULL;
    m_CurTextPiece = 0;
    m_Source = wxEmptyString;

    SetSource(src);
}

bool wxHtmlParser::RestoreState()
{
    if (!m_SavedStates) return false;

    DestroyDOMTree();

    wxHtmlParserState *s = m_SavedStates;
    m_SavedStates = s->m_nextState;

    m_CurTag = s->m_curTag;
    m_Tags = s->m_tags;
    m_TextPieces = s->m_textPieces;
    m_CurTextPiece = s->m_curTextPiece;
    m_Source = s->m_source;

    delete s;
    return true;
}

//-----------------------------------------------------------------------------
// wxHtmlTagHandler
//-----------------------------------------------------------------------------

IMPLEMENT_ABSTRACT_CLASS(wxHtmlTagHandler,wxObject)


//-----------------------------------------------------------------------------
// wxHtmlEntitiesParser
//-----------------------------------------------------------------------------

IMPLEMENT_DYNAMIC_CLASS(wxHtmlEntitiesParser,wxObject)

wxHtmlEntitiesParser::wxHtmlEntitiesParser()
#if wxUSE_WCHAR_T && !wxUSE_UNICODE
    : m_conv(NULL), m_encoding(wxFONTENCODING_SYSTEM)
#endif
{
}

wxHtmlEntitiesParser::~wxHtmlEntitiesParser()
{
#if wxUSE_WCHAR_T && !wxUSE_UNICODE
    delete m_conv;
#endif
}

void wxHtmlEntitiesParser::SetEncoding(wxFontEncoding encoding)
{
#if wxUSE_WCHAR_T && !wxUSE_UNICODE
    if (encoding == m_encoding)
        return;

    delete m_conv;

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?