📄 jimparse.pas
字号:
{******************************************************************
JEDI-VCL Demo
Copyright (C) 2002 Project JEDI
Original author:
Contributor(s):
You may retrieve the latest version of this file at the JEDI-JVCL
home page, located at http://jvcl.sourceforge.net
The contents of this file are used with permission, subject to
the Mozilla Public License Version 1.1 (the "License"); you may
not use this file except in compliance with the License. You may
obtain a copy of the License at
http://www.mozilla.org/MPL/MPL-1_1Final.html
Software distributed under the License is distributed on an
"AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
implied. See the License for the specific language governing
rights and limitations under the License.
******************************************************************}
unit JimParse;
interface
uses
SysUtils, Classes;
type
TjimToken = class(TObject)
private
FTokenType : Integer;
FAsString : string;
public
property TokenType : Integer read FTokenType write FTokenType;
property AsString : string read FAsString write FAsString;
end;
TjimLexicalAnalyser = class(TObject)
private
FText : string;
FPosition : Integer;
procedure SetText(const Value : string);
public
constructor Create;
procedure GetNextToken(NextToken : TjimToken);
property Text : string read FText write SetText;
end;
TjimSymbolType = (stTitle,stBase,stLink,stImage);
TjimSymbol = class(TCollectionItem)
private
FSymbolType : TjimSymbolType;
FSymbolValue : string;
public
procedure Assign(Source : TPersistent); override;
property SymbolType : TjimSymbolType read FSymbolType write FSymbolType;
property SymbolValue : string read FSymbolValue write FSymbolValue;
end;
TjimSymbolTable = class(TCollection)
private
function GetItem(Index : Integer) : TjimSymbol;
procedure SetItem(Index : Integer;Value : TjimSymbol);
public
function Add : TjimSymbol;
function AddSymbol(SymType : TjimSymbolType;SymValue : string) : TjimSymbol;
property Items[Index : Integer] : TjimSymbol read GetItem write SetItem; default;
end;
TjimHtmlParser = class(TObject)
private
FLookahead : TjimToken;
FLexAnalyser : TjimLexicalAnalyser;
FSymbolTable : TjimSymbolTable;
FLastTag : string;
procedure Match(T : Integer);
procedure ConsumeWhiteSpace;
procedure Document;
procedure Tag;
procedure Data;
procedure TagName;
procedure AttributeList;
function AttributeName : string;
function Value : string;
function Identifier : string;
function QuotedValue : string;
function PlainValue : string;
public
constructor Create;
destructor Destroy; override;
procedure Parse(const DocString : string);
property SymbolTable : TjimSymbolTable read FSymbolTable;
end;
EjimHtmlParserError = class(Exception);
implementation
const
// Token types are the characters 0 to 255, along with the following
ttEndOfDoc = -1;
// --------------------------- TjimLexicalAnalyser ---------------------------
constructor TjimLexicalAnalyser.Create;
begin {Create}
FText := '';
FPosition := 0;
end; {Create}
procedure TjimLexicalAnalyser.SetText(const Value : string);
begin {SetText}
if FText = Value then begin
// Only proceed if setting a new string
Exit;
end;
FPosition := 1;
FText := Value;
end; {SetText}
procedure TjimLexicalAnalyser.GetNextToken(NextToken : TjimToken);
begin {GetNextToken}
// Read the next character
if FPosition > Length(FText) then begin
// At the end of the document
NextToken.AsString := #0;
NextToken.TokenType := ttEndOfDoc;
end else begin
// Return the character
NextToken.AsString := FText[FPosition];
NextToken.TokenType := Integer(FText[FPosition]);
Inc(FPosition);
end;
end; {GetNextToken}
// ------------------------------- TjimSymbol -------------------------------
procedure TjimSymbol.Assign(Source : TPersistent);
begin {Assign}
if Source is TjimSymbol then begin
SymbolType := TjimSymbol(Source).SymbolType;
SymbolValue := TjimSymbol(Source).SymbolValue;
Exit;
end;
inherited Assign(Source);
end; {Assign}
// ------------------------------ TjimSymbolTable ----------------------------
function TjimSymbolTable.GetItem(Index : Integer) : TjimSymbol;
begin {GetItem}
Result := TjimSymbol(inherited GetItem(Index));
end; {GetItem}
procedure TjimSymbolTable.SetItem(Index : Integer;Value : TjimSymbol);
begin {SetItem}
inherited SetItem(Index,Value);
end; {SetItem}
function TjimSymbolTable.Add : TjimSymbol;
begin {Add}
Result := TjimSymbol(inherited Add);
end; {Add}
function TjimSymbolTable.AddSymbol(SymType : TjimSymbolType;SymValue : string) : TjimSymbol;
var
i : Integer;
begin {AddSymbol}
Result := nil;
// Check whether symbol is already in the list
for i := 0 to Count - 1 do begin
if (Items[i].SymbolType = SymType) and (Items[i].SymbolValue = SymValue) then begin
Exit;
end;
end;
Result := Add;
Result.SymbolType := SymType;
result.SymbolValue := SymValue;
end; {AddSymbol}
// ------------------------------ TjimHtmlParser -----------------------------
procedure TjimHtmlParser.ConsumeWhiteSpace;
// Eats 'whitespace' ie chars 0 to 32 inclusive. Here instead of lexical
// analyser because white space is allowed sometimes.
begin {ConsumeWhiteSpace}
while (FLookahead.TokenType <> ttEndOfDoc) and
(FLookAhead.AsString <= ' ') do begin
FLexAnalyser.GetNextToken(FLookAhead);
end;
end; {ConsumeWhiteSpace}
procedure TjimHtmlParser.Match(T : Integer);
// If the token type T matches the FLookahead token type then FLookAhead is
// set to the next token, otherwise an exception is raised
begin {Match}
if FLookahead.TokenType = T then begin
FLexAnalyser.GetNextToken(FLookahead);
end else begin
raise EjimHtmlParserError.Create('HTML syntax error. Expected ' +
IntToStr(FLookahead.TokenType));
end;
end; {Match}
procedure TjimHtmlParser.Document;
begin {Document}
while FLookahead.TokenType <> ttEndOfDoc do begin
ConsumeWhiteSpace;
if FLookahead.AsString = '<' then begin
Tag;
end else begin
Data;
end;
end;
Match(ttEndOfDoc);
end; {Document}
procedure TjimHtmlParser.Tag;
begin {Tag}
Match(Ord('<'));
ConsumeWhiteSpace;
if FLookahead.AsString = '/' then begin
// Finding an end tag
Match(Ord('/'));
FLastTag := '/';
ConsumeWhiteSpace;
TagName;
end else begin
// Finding a start tag, or a tag that doesn't enclose anything
FLastTag := '';
ConsumeWhiteSpace;
TagName;
ConsumeWhiteSpace;
AttributeList;
end;
Match(Ord('>'));
end; {Tag}
procedure TjimHtmlParser.Data;
var
TitleStr : string;
begin {Data}
TitleStr := '';
while (FLookahead.AsString <> '<') and
(FLookahead.TokenType <> ttEndOfDoc) do begin
// Collect the title string. It is ok to search like this because no other
// tags are allowed in a title
if CompareText(FLastTag,'Title') = 0 then begin
TitleStr := TitleStr + FLookahead.AsString;
end;
Match(FLookahead.TokenType);
end;
if TitleStr > '' then begin
FSymbolTable.AddSymbol(stTitle,TitleStr);
end;
end; {Data}
procedure TjimHtmlParser.TagName;
begin {TagName}
FLastTag := FLastTag + Identifier;
if FLastTag = '!--' then begin
// In a comment tag. Treat this specially by ignoring all characters
// until the end of the comment tag
repeat
if FLookahead.AsString = '-' then begin
FLastTag := FLastTag + FLookahead.AsString;
end else begin
FLastTag := '';
end;
Match(FLookahead.TokenType);
until FLastTag = '--';
end else if CompareText(FLastTag,'META') = 0 then begin
// In a META tag. There is all sorts of rubbish here, so consume it all
// until the end of the tag
while FLookahead.AsString <> '>' do begin
Match(FLookahead.TokenType);
end;
end;
end; {TagName}
procedure TjimHtmlParser.AttributeList;
var
FLastAttribute : string;
FLastValue : string;
begin {AttributeList}
while FLookahead.AsString <> '>' do begin
FLastAttribute := AttributeName;
ConsumeWhiteSpace;
if FLookahead.AsString = '=' then begin
Match(Ord('='));
ConsumeWhiteSpace;
FLastValue := Value;
ConsumeWhiteSpace;
// Should only get here if FLastAttribute is not an empty string
if (CompareText(FLastTag,'BASE') = 0) and
(CompareText('HREF',FLastAttribute) = 0) then begin
// Special case when found the HREF attribute of a BASE tag
FSymbolTable.AddSymbol(stBase,FLastValue);
end else if (CompareText(FLastTag,'IMG') = 0) and
(CompareText('SRC',FLastAttribute) = 0) then begin
// Found an image
FSymbolTable.AddSymbol(stImage,FLastValue);
end else if ((CompareText(FLastTag,'A') = 0) or
(CompareText(FLastTag,'AREA') = 0) or
(CompareText(FLastTag,'LINK') = 0)) and
(CompareText('HREF',FLastAttribute) = 0) then begin
// Found an ordinary link
FSymbolTable.AddSymbol(stLink,FLastValue);
end else if (CompareText(FLastTag,'FRAME') = 0) and
(CompareText('SRC',FLastAttribute) = 0) then begin
// Found an ordinary link
FSymbolTable.AddSymbol(stLink,FLastValue);
end;
end;
end;
end; {AttributeList}
function TjimHtmlParser.AttributeName : string;
begin {AttributeName}
Result := '';
if FLookahead.AsString = '"' then begin
Result := QuotedValue;
end else begin
Result := Identifier;
end;
end; {AttributeName}
function TjimHtmlParser.Value : string;
begin {Value}
Result := '';
if FLookahead.AsString = '"' then begin
Result := QuotedValue;
end else begin
Result := PlainValue;
end;
end; {Value}
function TjimHtmlParser.Identifier : string;
const
IdentifierSet = ['A'..'Z','a'..'z','0'..'9','-','!',':','/'];
begin {Identifier}
Result := '';
if (Length(FLookahead.AsString) >= 1) and
(not (FLookahead.AsString[1] in IdentifierSet)) then begin
raise EjimHtmlParserError.Create('HTML syntax error. Expected identifier, ' +
'but got : ' + FLookahead.AsString +
' in tag ' + FLastTag);
end;
repeat
Result := Result + FLookahead.AsString;
if Result = '!--' then begin
// Found a comment tag. Some people eg Microsoft, don't put a space after
// this part of the tag
Exit;
end;
Match(FLookahead.TokenType);
until not (FLookahead.AsString[1] in IdentifierSet);
end; {Identifier}
function TjimHtmlParser.QuotedValue : string;
begin {QuotedValue}
Result := '';
Match(Ord('"'));
while FLookahead.AsString <> '"' do begin
Result := Result + FLookahead.AsString;
Match(FLookahead.TokenType);
end;
Match(Ord('"'));
end; {QuotedValue}
function TjimHtmlParser.PlainValue : string;
const
PlainValueSet = ['A'..'Z','a'..'z','0'..'9','-','.','+','-',':','/','?',
''''];
begin {PlainValue}
Result := '';
if (Length(FLookahead.AsString) >= 1) and
(not (FLookahead.AsString[1] in PlainValueSet)) then begin
raise EjimHtmlParserError.Create('HTML syntax error. Expected plain value, ' +
'but got : ' + FLookahead.AsString +
' in tag ' + FLastTag);
end;
repeat
Result := Result + FLookahead.AsString;
Match(FLookahead.TokenType);
until not (FLookahead.AsString[1] in PlainValueSet);
end; {PlainValue}
constructor TjimHtmlParser.Create;
begin {Create}
FLookahead := TjimToken.Create;
FLexAnalyser := TjimLexicalAnalyser.Create;
FSymbolTable := TjimSymbolTable.Create(TjimSymbol);
end; {Create}
destructor TjimHtmlParser.Destroy;
begin {Destroy}
FLookahead.Free;
FLexAnalyser.Free;
FSymbolTable.Free;
end; {Destroy}
procedure TjimHtmlParser.Parse(const DocString : string);
begin {Parse}
if DocString = '' then begin
Exit;
end;
FLastTag := '';
FLexAnalyser.Text := DocString;
FLexAnalyser.GetNextToken(FLookahead);
Document;
end; {Parse}
end.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -