// File: htmlparser.cpp
// (Code-viewer page residue: a "font size" control label followed here.)
/////////////////////////////////////////////////////////////////////////////
// Name: htmlparser.cpp
// Purpose: Simple HTML parser
// Author: Julian Smart
// Modified by:
// Created: 2002-09-25
// RCS-ID: $Id: htmlparser.cpp,v 1.6 2005/09/23 12:56:23 MR Exp $
// Copyright: (c) Julian Smart
// Licence: wxWindows license
/////////////////////////////////////////////////////////////////////////////
// ----------------------------------------------------------------------------
// headers
// ----------------------------------------------------------------------------
// For compilers that support precompilation, includes "wx/wx.h".
#include "wx/wxprec.h"
#ifdef __BORLANDC__
#pragma hdrstop
#endif
#include "wx/wfstream.h"
#include "wx/textfile.h"
#include "wx/txtstrm.h"
#include "htmlparser.h"
/// Insertion operator that writes a wxString to a wxOutputStream.
/// The text is routed through a temporary wxTextOutputStream so that
/// line endings are emitted in the platform's native form.
/// Returns the stream that was passed in, so calls can be chained.
static wxOutputStream& operator <<(wxOutputStream& stream, const wxString& s)
{
    // The text-stream wrapper is what makes the line-ending native.
    wxTextOutputStream nativeWriter(stream);
    nativeWriter.WriteString(s);

    return stream;
}
#if 0 // Gives warning because not used...
// Disabled helper: formats a long with "%ld" into a wxString and forwards it
// to the wxString insertion operator.
// NOTE(review): the format literal is not wrapped in wxT(), unlike the other
// literals in this file -- confirm before re-enabling.
static wxOutputStream& operator <<(wxOutputStream& stream, long l)
{
wxString str;
str.Printf("%ld", l);
return stream << str;
}
// Disabled helper: formats a single char with "%c" into a wxString and
// forwards it to the wxString insertion operator.
static wxOutputStream& operator <<(wxOutputStream& stream, const char c)
{
wxString str;
str.Printf("%c", c);
return stream << str;
}
#endif // 0
/*
* wxSimpleHtmlParser
* Simple HTML parser
*/
// Constructs an empty parser: no tag tree, read position at the start and
// a recorded text length of zero.
wxSimpleHtmlParser::wxSimpleHtmlParser()
{
    m_topLevel = NULL;
    m_pos = 0;
    // m_length is otherwise only assigned in ParseString(); give it a defined
    // value so nothing reads an indeterminate length before the first parse.
    m_length = 0;
}
// Destructor: delegates all cleanup to Clear().
wxSimpleHtmlParser::~wxSimpleHtmlParser()
{
    this->Clear();
}
bool wxSimpleHtmlParser::ParseFile(const wxString& filename)
{
wxTextFile textFile;
if (textFile.Open(filename))
{
wxString text;
wxString line;
int i;
int count = textFile.GetLineCount();
for (i = 0; i < count; i++)
{
if (i == 0)
line = textFile.GetFirstLine();
else
line = textFile.GetNextLine();
text += line;
if (i != (count - 1))
text += wxT("\n");
}
#if 0
for ( line = textFile.GetFirstLine(); !textFile.Eof(); line = textFile.GetNextLine() )
{
text += line;
if (!textFile.Eof())
text += wxT("\n");
}
#endif
return ParseString(text);
}
else
return false;
}
// Parses the HTML held in 'str'.
// Any previous state is discarded via Clear(), the text and its length are
// stored, a fresh TOPLEVEL tag is created and ParseHtml() is run on it.
// Returns the result of ParseHtml(); a failure also trips a wxASSERT.
bool wxSimpleHtmlParser::ParseString(const wxString& str)
{
    Clear();

    // Reset the scanner state for the new input.
    m_text = str;
    m_length = str.Length();
    m_pos = 0;

    m_topLevel = new wxSimpleHtmlTag(wxT("TOPLEVEL"), wxSimpleHtmlTag_TopLevel);

    const bool bResult = ParseHtml(m_topLevel);

    // Parsing the tags failed.
    // Hint: check that every open tag has a matching close tag.
    wxASSERT(bResult);

    return bResult;
}
// Main recursive parsing function.
//
// Consumes input from m_pos until end of text, attaching what it finds to
// 'parent':
//   - comments are skipped;
//   - directives, XML declarations and opening tags are appended to 'parent';
//     an opening tag that needs a close tag is parsed recursively with itself
//     as the new parent;
//   - a closing tag that needs a close tag is appended to parent's parent and
//     ends this level (returns true); other closing tags are appended to
//     'parent';
//   - plain text becomes a TEXT tag, appended to parent's parent when there is
//     one, otherwise to 'parent'.
//
// Returns false when 'parent' is NULL, when a required closing tag is met
// while 'parent' has no parent, or when a nested call fails; true otherwise.
bool wxSimpleHtmlParser::ParseHtml(wxSimpleHtmlTag* parent)
{
    if (!parent)
        return false;

    while (!Eof())
    {
        EatWhitespace();

        if (IsComment())
        {
            ParseComment();
        }
        else if (IsDirective())
        {
            wxSimpleHtmlTag* tag = ParseDirective();
            if (tag)
                parent->AppendTag(tag);
        }
        else if (IsXMLDeclaration())
        {
            wxSimpleHtmlTag* tag = ParseXMLDeclaration();
            if (tag)
                parent->AppendTag(tag);
        }
        else if (IsTagClose())
        {
            wxSimpleHtmlTag* tag = ParseTagClose();
            if (tag)
            {
                if (IsCloseTagNeeded(tag->GetName()))
                {
                    if (!parent->GetParent())
                    {
                        // The close tag was never attached to the tree, so
                        // nothing else would free it: release it before
                        // reporting the failure.
                        delete tag;
                        return false;
                    }
                    parent->GetParent()->AppendTag(tag);
                    return true;
                }
                else
                    parent->AppendTag(tag);
            }
        }
        else if (IsTagStartBracket(GetChar(m_pos)))
        {
            wxSimpleHtmlTag* tag = ParseTagHeader();
            // Keep every use of 'tag' under the NULL check.
            if (tag)
            {
                parent->AppendTag(tag);
                if (IsCloseTagNeeded(tag->GetName()))
                {
                    if (!ParseHtml(tag))
                        return false; // Something didn't go ok, so don't continue.
                }
            }
        }
        else
        {
            // Just a text string
            wxString text;
            ParseText(text);

            wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(wxT("TEXT"), wxSimpleHtmlTag_Text);
            tag->SetText(text);

            if (parent->GetParent())
                parent->GetParent()->AppendTag(tag);
            else
                parent->AppendTag(tag); // When this occurs it is probably the
                                        // empty lines at the end of the file...
        }
    }

    return true;
}
// Collects plain text starting at m_pos and stopping at the next '<' or at
// end of input. The characters are appended to 'text', which is then passed
// through DecodeSpecialChars(). Always returns true.
bool wxSimpleHtmlParser::ParseText(wxString& text)
{
    for (; !Eof() && GetChar(m_pos) != wxT('<'); ++m_pos)
        text += static_cast<wxChar>(GetChar(m_pos));

    DecodeSpecialChars(text);
    return true;
}
// Parses an opening tag at m_pos: the start bracket, the tag name, its
// attributes and, if present, the end bracket.
// Returns a newly allocated wxSimpleHtmlTag_Open tag, or NULL when the
// current character is not a tag start bracket.
wxSimpleHtmlTag* wxSimpleHtmlParser::ParseTagHeader()
{
    if (!IsTagStartBracket(GetChar(m_pos)))
        return NULL;

    // Step over the start bracket.
    ++m_pos;
    EatWhitespace();

    wxString tagName;
    ReadWord(tagName, true);
    EatWhitespace();

    wxSimpleHtmlTag* header = new wxSimpleHtmlTag(tagName, wxSimpleHtmlTag_Open);
    ParseAttributes(header);
    EatWhitespace();

    // Consume the end bracket only when it is actually there.
    if (IsTagEndBracket(GetChar(m_pos)))
        ++m_pos;

    return header;
}
// Parses a closing tag at m_pos: the "</" introducer, the tag name and, if
// present, the end bracket.
// Returns a newly allocated wxSimpleHtmlTag_Close tag carrying the name read.
wxSimpleHtmlTag* wxSimpleHtmlParser::ParseTagClose()
{
    Matches(wxT("</"), true);
    EatWhitespace();

    wxString word;
    ReadWord(word, true);
    EatWhitespace();

    // Step over the end bracket only when it is really there, as the other
    // tag parsers in this file do. Advancing unconditionally would swallow
    // an unrelated character in malformed input and could move m_pos past
    // the end of the text.
    if (!Eof() && IsTagEndBracket(GetChar(m_pos)))
        m_pos ++;

    wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(word, wxSimpleHtmlTag_Close);
    return tag;
}
// Parses the attributes of a tag header, starting at m_pos, until the end
// bracket '>' or end of input is reached. The end bracket is not consumed.
//
// Each item is appended to 'tag' in one of three forms:
//   - a quoted string on its own: stored as the name, with an empty value;
//   - a number on its own: stored as the name, with an empty value;
//   - a literal name, optionally followed by '=' and a quoted string or a
//     literal value; entries with an empty name are dropped.
//
// Always returns true.
bool wxSimpleHtmlParser::ParseAttributes(wxSimpleHtmlTag* tag)
{
    // Test Eof() before looking at the current character, as every other
    // scanning loop in this file does, so no character is read at end of input.
    while (!Eof() && !IsTagEndBracket(GetChar(m_pos)))
    {
        EatWhitespace();

        wxString attrName, attrValue;
        if (IsString())
        {
            ReadString(attrName, true);
            tag->AppendAttribute(attrName, wxEmptyString);
        }
        else if (IsNumeric(GetChar(m_pos)))
        {
            ReadNumber(attrName, true);
            tag->AppendAttribute(attrName, wxEmptyString);
        }
        else
        {
            // Try to read an attribute name/value pair, or at least a name
            // without the value
            ReadLiteral(attrName, true);
            EatWhitespace();

            if (GetChar(m_pos) == wxT('='))
            {
                m_pos ++;
                EatWhitespace();

                if (IsString())
                    ReadString(attrValue, true);
                else if (!Eof() && !IsTagEndBracket(GetChar(m_pos)))
                    ReadLiteral(attrValue, true);
            }

            if (!attrName.IsEmpty())
                tag->AppendAttribute(attrName, attrValue);
        }
    }

    return true;
}
// Parses a directive, e.g. <!DOCTYPE ....>
// Consumes the "<!" introducer, the directive word, its attributes and, if
// present, the end bracket. Returns a newly allocated
// wxSimpleHtmlTag_Directive tag.
wxSimpleHtmlTag* wxSimpleHtmlParser::ParseDirective()
{
    Matches(wxT("<!"), true);
    EatWhitespace();

    wxString directiveName;
    ReadWord(directiveName, true);
    EatWhitespace();

    wxSimpleHtmlTag* directive = new wxSimpleHtmlTag(directiveName, wxSimpleHtmlTag_Directive);
    ParseAttributes(directive);
    EatWhitespace();

    // Consume the end bracket only when it is actually there.
    if (IsTagEndBracket(GetChar(m_pos)))
        ++m_pos;

    return directive;
}
// Parses an XML declaration, e.g. <?xml .... ?>
// Consumes the "<?" introducer, the following word, its attributes and, if
// present, the end bracket. Returns a newly allocated
// wxSimpleHtmlTag_XMLDeclaration tag.
wxSimpleHtmlTag* wxSimpleHtmlParser::ParseXMLDeclaration()
{
    Matches(wxT("<?"), true);
    EatWhitespace();

    wxString declName;
    ReadWord(declName, true);
    EatWhitespace();

    wxSimpleHtmlTag* declaration = new wxSimpleHtmlTag(declName, wxSimpleHtmlTag_XMLDeclaration);
    ParseAttributes(declaration);
    EatWhitespace();

    // Consume the end bracket only when it is actually there.
    if (IsTagEndBracket(GetChar(m_pos)))
        ++m_pos;

    return declaration;
}
// Skips a comment: consumes the "<!--" opener, then advances one character
// at a time until the "-->" terminator has been consumed or the input ends.
// Always returns true.
bool wxSimpleHtmlParser::ParseComment()
{
    // Eat the comment tag start
    Matches(wxT("<!--"), true);

    for (;;)
    {
        if (Eof())
            break;
        // Matches() consumes the terminator itself when it is found.
        if (Matches(wxT("-->"), true))
            break;
        ++m_pos;
    }

    return true;
}
// Advances m_pos past any run of whitespace characters, stopping at the
// first non-whitespace character or at end of input. Always returns true.
bool wxSimpleHtmlParser::EatWhitespace()
{
    for (; !Eof() && IsWhitespace(GetChar(m_pos)); ++m_pos)
    {
        // nothing to do: the loop header performs the skipping
    }
    return true;
}
// Same as EatWhitespace(), but works on the caller-supplied position 'pos'
// instead of m_pos; 'pos' is updated in place. Always returns true.
bool wxSimpleHtmlParser::EatWhitespace(int& pos)
{
    for (; !Eof(pos) && IsWhitespace(GetChar(pos)); ++pos)
    {
        // nothing to do: the loop header performs the skipping
    }
    return true;
}
// Reads a double-quoted string starting at m_pos.
// The characters between the quotes are appended to 'str', which is then
// passed through DecodeSpecialChars(). When 'eatIt' is true, m_pos is moved
// past the string (including the closing quote if one was found).
// Returns false, leaving everything untouched, if the current character is
// not a double quote; true otherwise.
bool wxSimpleHtmlParser::ReadString(wxString& str, bool eatIt)
{
    const int quote = static_cast<int>('"');

    int cursor = m_pos;
    if (GetChar(cursor) != quote)
        return false;

    // Step over the opening quote.
    ++cursor;

    // TODO: how are quotes escaped in HTML?
    for (; !Eof(cursor) && GetChar(cursor) != quote; ++cursor)
        str += static_cast<wxChar>(GetChar(cursor));

    // Step over the closing quote, if the string was terminated.
    if (GetChar(cursor) == quote)
        ++cursor;

    if (eatIt)
        m_pos = cursor;

    DecodeSpecialChars(str);
    return true;
}
// Reads a word starting at m_pos: one character accepted by IsAlpha()
// followed by any number of characters accepted by IsWordChar().
// The word is appended to 'str', which is then passed through
// DecodeSpecialChars(). When 'eatIt' is true, m_pos is moved past the word.
// Returns false, leaving everything untouched, if the first character is not
// accepted by IsAlpha(); true otherwise.
bool wxSimpleHtmlParser::ReadWord(wxString& str, bool eatIt)
{
    int cursor = m_pos;
    if (!IsAlpha(GetChar(cursor)))
        return false;

    // The first character is already known to be valid, so take it
    // unconditionally and test the continuation afterwards.
    do
    {
        str += static_cast<wxChar>(GetChar(cursor));
        ++cursor;
    }
    while (!Eof(cursor) && IsWordChar(GetChar(cursor)));

    if (eatIt)
        m_pos = cursor;

    DecodeSpecialChars(str);
    return true;
}
// Reads a run of characters accepted by IsNumeric(), starting at m_pos.
// The run is appended to 'str', which is then passed through
// DecodeSpecialChars(). When 'eatIt' is true, m_pos is moved past the run.
// Returns false, leaving everything untouched, if the first character is not
// accepted by IsNumeric(); true otherwise.
bool wxSimpleHtmlParser::ReadNumber(wxString& str, bool eatIt)
{
    int cursor = m_pos;
    if (!IsNumeric(GetChar(cursor)))
        return false;

    // The first character is already known to be valid, so take it
    // unconditionally and test the continuation afterwards.
    do
    {
        str += static_cast<wxChar>(GetChar(cursor));
        ++cursor;
    }
    while (!Eof(cursor) && IsNumeric(GetChar(cursor)));

    if (eatIt)
        m_pos = cursor;

    DecodeSpecialChars(str);
    return true;
}
// Reads a literal starting at m_pos: could be a number, a word, whatever
// (but not a quoted string). Reading stops at whitespace, at the tag end
// bracket, at '=' or at end of input.
// The characters are appended to 'str', which is then passed through
// DecodeSpecialChars(). When 'eatIt' is true, m_pos is moved past the
// literal. Always returns true, even when nothing was read.
bool wxSimpleHtmlParser::ReadLiteral(wxString& str, bool eatIt)
{
    int cursor = m_pos;
    while (!Eof(cursor))
    {
        const int ch = GetChar(cursor);

        // Any of these characters ends the literal.
        if (IsWhitespace(ch) || IsTagEndBracket(ch) || ch == wxT('='))
            break;

        str += static_cast<wxChar>(ch);
        ++cursor;
    }

    if (eatIt)
        m_pos = cursor;

    DecodeSpecialChars(str);
    return true;
}
// Reports whether Matches() accepts the comment opener "<!--".
bool wxSimpleHtmlParser::IsComment()
{
    const bool atCommentStart = Matches(wxT("<!--"));
    return atCommentStart;
}
// Reports whether Matches() accepts the directive opener "<!".
// Note that a comment opener also begins with these two characters.
bool wxSimpleHtmlParser::IsDirective()
{
    const bool atDirectiveStart = Matches(wxT("<!"));
    return atDirectiveStart;
}
// Reports whether Matches() accepts the XML declaration opener "<?xml".
bool wxSimpleHtmlParser::IsXMLDeclaration()
{
    const bool atDeclarationStart = Matches(wxT("<?xml"));
    return atDeclarationStart;
}
// Reports whether the character at m_pos is a double quote, i.e. whether a
// quoted string starts here.
bool wxSimpleHtmlParser::IsString()
{
    const int quote = static_cast<int>('"');
    return GetChar(m_pos) == quote;
}
// Reports whether the character at m_pos is accepted by IsAlpha(), i.e.
// whether a word can start here.
bool wxSimpleHtmlParser::IsWord()
{
    const bool startsWord = IsAlpha(GetChar(m_pos));
    return startsWord;
}
// Reports whether Matches() accepts the closing-tag opener "</".
bool wxSimpleHtmlParser::IsTagClose()
{
    const bool atCloseTag = Matches(wxT("</"));
    return atCloseTag;
}
// Reports whether 'ch' is the tag start bracket '<'.
bool wxSimpleHtmlParser::IsTagStartBracket(int ch)
{
    if (ch == wxT('<'))
        return true;
    return false;
}
// Reports whether 'ch' is the tag end bracket '>'.
bool wxSimpleHtmlParser::IsTagEndBracket(int ch)
{
    if (ch == wxT('>'))
        return true;
    return false;
}
// Reports whether 'ch' is one of the four characters treated as whitespace:
// code 13, code 10, code 32, or a tab.
bool wxSimpleHtmlParser::IsWhitespace(int ch)
{
    switch (ch)
    {
        case 13:
        case 10:
        case 32:
        case (int) '\t':
            return true;

        default:
            return false;
    }
}
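// Keyboard shortcuts (code-viewer page residue, not part of the source file):
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -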