⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmlparser.cpp

📁 Wxpython Implemented on Windows CE, Source code
💻 CPP
📖 第 1 页 / 共 2 页
字号:
/////////////////////////////////////////////////////////////////////////////
// Name:        htmlparser.cpp
// Purpose:     Simple HTML parser
// Author:      Julian Smart
// Modified by:
// Created:     2002-09-25
// RCS-ID:      $Id: htmlparser.cpp,v 1.6 2005/09/23 12:56:23 MR Exp $
// Copyright:   (c) Julian Smart
// Licence:     wxWindows license
/////////////////////////////////////////////////////////////////////////////

// ----------------------------------------------------------------------------
// headers
// ----------------------------------------------------------------------------

// For compilers that support precompilation, includes "wx/wx.h".
#include "wx/wxprec.h"

#ifdef __BORLANDC__
#pragma hdrstop
#endif

#include "wx/wfstream.h"
#include "wx/textfile.h"
#include "wx/txtstrm.h"
#include "htmlparser.h"

/// Useful insertion operators for wxOutputStream.
static wxOutputStream& operator <<(wxOutputStream& stream, const wxString& s)
{
    wxTextOutputStream txt(stream); // This is to make sure the line-ending is native!

    txt.WriteString(s);
    return stream;
}

#if 0 // Gives warning because not used...
static wxOutputStream& operator <<(wxOutputStream& stream, long l)
{
    wxString str;
    str.Printf("%ld", l);
    return stream << str;
}

static wxOutputStream& operator <<(wxOutputStream& stream, const char c)
{
    wxString str;
    str.Printf("%c", c);
    return stream << str;
}
#endif // 0

/*
 * wxSimpleHtmlAttribute
 * Representation of an attribute
 */

wxSimpleHtmlParser::wxSimpleHtmlParser()
{
    m_topLevel = NULL;
    m_pos = 0;
}


wxSimpleHtmlParser::~wxSimpleHtmlParser()
{
    Clear();
}

bool wxSimpleHtmlParser::ParseFile(const wxString& filename)
{
    wxTextFile textFile;

    if (textFile.Open(filename))
    {
        wxString text;
        wxString line;
        int i;
        int count = textFile.GetLineCount();
        for (i = 0; i < count; i++)
        {
            if (i == 0)
                line = textFile.GetFirstLine();
            else
                line = textFile.GetNextLine();

            text += line;
            if (i != (count - 1))
                text += wxT("\n");
        }

#if 0
        for ( line = textFile.GetFirstLine(); !textFile.Eof(); line = textFile.GetNextLine() )
        {
            text += line;
            if (!textFile.Eof())
                text += wxT("\n");
        }
#endif

        return ParseString(text);
    }
    else
        return false;
}

bool wxSimpleHtmlParser::ParseString(const wxString& str)
{
    Clear();

    m_pos = 0;
    m_text = str;
    m_length = str.Length();

    m_topLevel = new wxSimpleHtmlTag(wxT("TOPLEVEL"), wxSimpleHtmlTag_TopLevel);

    bool bResult = ParseHtml(m_topLevel);

    wxASSERT(bResult); // Failed to parse the TAGs.
                       // Hint: Check if every open tag has a close tag!

    return bResult;
}

// Main recursive parsing function
bool wxSimpleHtmlParser::ParseHtml(wxSimpleHtmlTag* parent)
{
    if (!parent)
        return false;

    while (!Eof())
    {
        EatWhitespace();
        if (IsComment())
        {
            ParseComment();
        }
        else if (IsDirective())
        {
            wxSimpleHtmlTag* tag = ParseDirective();
            if (tag)
                parent->AppendTag(tag);
        }
        else if (IsXMLDeclaration())
        {
            wxSimpleHtmlTag* tag = ParseXMLDeclaration();
            if (tag)
                parent->AppendTag(tag);
        }
        else if (IsTagClose())
        {
            wxSimpleHtmlTag* tag = ParseTagClose();
            if (tag)
            {
                if (IsCloseTagNeeded(tag->GetName()))
                {
                    if (!parent->GetParent())
                        return false;
                    parent->GetParent()->AppendTag(tag);
                    return true;
                }
                else
                    parent->AppendTag(tag);
            }
        }
        else if (IsTagStartBracket(GetChar(m_pos)))
        {
            wxSimpleHtmlTag* tag = ParseTagHeader();
            if (tag)
                parent->AppendTag(tag);

            if (IsCloseTagNeeded(tag->GetName()))
            {
                if (!ParseHtml(tag))
                    return false;   // Something didn't go ok, so don't continue.
            }
        }
        else
        {
            // Just a text string
            wxString text;
            ParseText(text);

            wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(wxT("TEXT"), wxSimpleHtmlTag_Text);
            tag->SetText(text);
            if(parent->GetParent())
                parent->GetParent()->AppendTag(tag);
            else
                parent->AppendTag(tag); // When this occurs it is probably the
                                        // empty lines at the end of the file...
        }
    }
    return true;
}

// Plain text, up until an angled bracket
bool wxSimpleHtmlParser::ParseText(wxString& text)
{
    while (!Eof() && GetChar(m_pos) != wxT('<'))
    {
        text += (wxChar)GetChar(m_pos);
        m_pos ++;
    }
    DecodeSpecialChars(text);
    return true;
}

wxSimpleHtmlTag* wxSimpleHtmlParser::ParseTagHeader()
{
    if (IsTagStartBracket(GetChar(m_pos)))
    {
        m_pos ++;
        EatWhitespace();

        wxString word;
        ReadWord(word, true);

        EatWhitespace();

        wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(word, wxSimpleHtmlTag_Open);

        ParseAttributes(tag);

        EatWhitespace();

        if (IsTagEndBracket(GetChar(m_pos)))
            m_pos ++;

        return tag;
    }
    else
        return NULL;
}

wxSimpleHtmlTag* wxSimpleHtmlParser::ParseTagClose()
{
    Matches(wxT("</"), true);

    EatWhitespace();

    wxString word;
    ReadWord(word, true);

    EatWhitespace();
    m_pos ++;

    wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(word, wxSimpleHtmlTag_Close);
    return tag;
}

bool wxSimpleHtmlParser::ParseAttributes(wxSimpleHtmlTag* tag)
{
    // Parse attributes of a tag header until we reach >
    while (!IsTagEndBracket(GetChar(m_pos)) && !Eof())
    {
        EatWhitespace();

        wxString attrName, attrValue;

        if (IsString())
        {
            ReadString(attrName, true);
            tag->AppendAttribute(attrName, wxEmptyString);
        }
        else if (IsNumeric(GetChar(m_pos)))
        {
            ReadNumber(attrName, true);
            tag->AppendAttribute(attrName, wxEmptyString);
        }
        else
        {
            // Try to read an attribute name/value pair, or at least a name
            // without the value
            ReadLiteral(attrName, true);
            EatWhitespace();

            if (GetChar(m_pos) == wxT('='))
            {
                m_pos ++;
                EatWhitespace();

                if (IsString())
                    ReadString(attrValue, true);
                else if (!Eof() && !IsTagEndBracket(GetChar(m_pos)))
                    ReadLiteral(attrValue, true);
            }
            if (!attrName.IsEmpty())
                tag->AppendAttribute(attrName, attrValue);
        }
    }
    return true;
}

// e.g. <!DOCTYPE ....>
wxSimpleHtmlTag* wxSimpleHtmlParser::ParseDirective()
{
    Matches(wxT("<!"), true);

    EatWhitespace();

    wxString word;
    ReadWord(word, true);

    EatWhitespace();

    wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(word, wxSimpleHtmlTag_Directive);

    ParseAttributes(tag);

    EatWhitespace();

    if (IsTagEndBracket(GetChar(m_pos)))
        m_pos ++;

    return tag;
}

// e.g. <?xml .... ?>
wxSimpleHtmlTag* wxSimpleHtmlParser::ParseXMLDeclaration()
{
    Matches(wxT("<?"), true);

    EatWhitespace();

    wxString word;
    ReadWord(word, true);

    EatWhitespace();

    wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(word, wxSimpleHtmlTag_XMLDeclaration);

    ParseAttributes(tag);

    EatWhitespace();

    if (IsTagEndBracket(GetChar(m_pos)))
        m_pos ++;

    return tag;
}

bool wxSimpleHtmlParser::ParseComment()
{
    // Eat the comment tag start
    Matches(wxT("<!--"), true);

    while (!Eof() && !Matches(wxT("-->"), true))
    {
        m_pos ++;
    }

    return true;
}

bool wxSimpleHtmlParser::EatWhitespace()
{
    while (!Eof() && IsWhitespace(GetChar(m_pos)))
        m_pos ++;
    return true;
}

bool wxSimpleHtmlParser::EatWhitespace(int& pos)
{
    while (!Eof(pos) && IsWhitespace(GetChar(pos)))
        pos ++;
    return true;
}

bool wxSimpleHtmlParser::ReadString(wxString& str, bool eatIt)
{
    int pos = m_pos;
    if (GetChar(pos) == (int) '"')
    {
        pos ++;
        while (!Eof(pos) && GetChar(pos) != (int) '"')
        {
            // TODO: how are quotes escaped in HTML?
            str += (wxChar) GetChar(pos);
            pos ++;
        }
        if (GetChar(pos) == (int) '"')
            pos ++;
        if (eatIt)
            m_pos = pos;
        DecodeSpecialChars(str);
        return true;
    }
    else
        return false;
}

bool wxSimpleHtmlParser::ReadWord(wxString& str, bool eatIt)
{
    int pos = m_pos;

    if (!IsAlpha(GetChar(pos)))
        return false;

    str += (wxChar) GetChar(pos) ;
    pos ++;

    while (!Eof(pos) && IsWordChar(GetChar(pos)))
    {
        str += (wxChar) GetChar(pos);
        pos ++;
    }
    if (eatIt)
        m_pos = pos;
    DecodeSpecialChars(str);
    return true;
}

bool wxSimpleHtmlParser::ReadNumber(wxString& str, bool eatIt)
{
    int pos = m_pos;

    if (!IsNumeric(GetChar(pos)))
        return false;

    str += (wxChar) GetChar(pos) ;
    pos ++;

    while (!Eof(pos) && IsNumeric(GetChar(pos)))
    {
        str += (wxChar) GetChar(pos);
        pos ++;
    }
    if (eatIt)
        m_pos = pos;
    DecodeSpecialChars(str);
    return true;
}

// Could be number, string, whatever, but read up until whitespace or end of tag (but not a quoted string)
bool wxSimpleHtmlParser::ReadLiteral(wxString& str, bool eatIt)
{
    int pos = m_pos;

    while (!Eof(pos) && !IsWhitespace(GetChar(pos)) && !IsTagEndBracket(GetChar(pos)) && GetChar(pos) != wxT('='))
    {
        str += (wxChar)GetChar(pos);
        pos ++;
    }
    if (eatIt)
        m_pos = pos;
    DecodeSpecialChars(str);
    return true;
}

bool wxSimpleHtmlParser::IsComment()
{
    return Matches(wxT("<!--"));
}

bool wxSimpleHtmlParser::IsDirective()
{
    return Matches(wxT("<!"));
}

bool wxSimpleHtmlParser::IsXMLDeclaration()
{
    return Matches(wxT("<?xml"));
}

bool wxSimpleHtmlParser::IsString()
{
    return (GetChar(m_pos) == (int) '"') ;
}

bool wxSimpleHtmlParser::IsWord()
{
    return (IsAlpha(GetChar(m_pos)));
}

bool wxSimpleHtmlParser::IsTagClose()
{
    return Matches(wxT("</"));
}

bool wxSimpleHtmlParser::IsTagStartBracket(int ch)
{
    return (ch == wxT('<'));
}

bool wxSimpleHtmlParser::IsTagEndBracket(int ch)
{
    return (ch == wxT('>'));
}

bool wxSimpleHtmlParser::IsWhitespace(int ch)
{
    return ((ch == 13) || (ch == 10) || (ch == 32) || (ch == (int) '\t')) ;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -