📄 readhtml.pas
字号:
{Version 9.4}
{*********************************************************}
{* READHTML.PAS *}
{* Copyright (c) 1995-2006 by *}
{* L. David Baldwin *}
{* All rights reserved. *}
{* *}
{* Thanks to Mike Lischke for his *}
{* assistance with the Unicode conversion *}
{* *}
{*********************************************************}
{$i htmlcons.inc}
{ The Parser
This module contains the parser which reads thru the document. It divides it
into sections storing the pertinent information in Section objects. The
document itself is then a TList of section objects. See the HTMLSubs unit for
the definition of the section objects.
Key Variables:
Sy:
An enumerated type which indicates what the current token is. For
example, a value of TextSy would indicate a hunk of text, PSy that a <P>
tag was encountered, etc.
LCh:
The next character in the stream to be analyzed. In mixed case.
Ch:
The same character in upper case.
LCToken:
A string which is associated with the current token. If Sy is TextSy,
then LCToken contains the text.
Attributes:
A list of TAttribute's for tokens such as <img>, <a>, which have
attributes.
Section:
The current section being built.
SectionList:
The list of sections which form the document. When in a Table,
SectionList will contain the list that makes up the current cell.
Key Routines:
GetCh:
Gets the next character from the stream. Fills Ch and LCh. Skips
comments.
Next:
Gets the next token. Fills Sy, LCToken, Attributes. Calls GetCh so the
next character after the present token is available. Each part of the
parser is responsible for calling Next after it does its thing.
}
unit Readhtml;
interface
uses
SysUtils, WinTypes, WinProcs, Messages, Classes, Graphics, Controls,
Dialogs, StdCtrls, HTMLUn2, StyleUn;
type
LoadStyleType = (lsFile, lsString, lsInclude);
TIncludeType = procedure(Sender: TObject; const Command: string;
Params: TStrings; var IString: string) of Object;
TSoundType = procedure(Sender: TObject; const SRC: string; Loop: integer;
Terminate: boolean) of Object;
TMetaType = procedure(Sender: TObject; const HttpEq, Name, Content: string) of Object;
TLinkType = procedure(Sender: TObject; const Rel, Rev, Href: string) of Object;
TGetStreamEvent = procedure(Sender: TObject; const SRC: string;
var Stream: TMemoryStream) of Object;
TFrameViewerBase = class(TWinControl)
private
procedure wmerase(var msg:TMessage); message wm_erasebkgnd;
protected
FOnInclude: TIncludeType;
FOnSoundRequest: TSoundType;
FOnScript: TScriptEvent;
FOnLink: TLinkType;
procedure AddFrame(FrameSet: TObject; Attr: TAttributeList; const FName: string); virtual; abstract;
function CreateSubFrameSet(FrameSet: TObject): TObject; virtual; abstract;
procedure DoAttributes(FrameSet: TObject; Attr: TAttributeList); virtual; abstract;
procedure EndFrameSet(FrameSet: TObject); virtual; abstract;
end;
TPropStack = class(TFreeList)
private
function GetProp(Index: integer): TProperties;
public
property AnItem[Index: integer]: TProperties read GetProp; default;
function Last: TProperties;
procedure Delete(Index: integer);
end;
var
PropStack: TPropStack;
Title: string;
Base: string;
CodePage: integer;
BaseTarget: string;
NoBreak: boolean; {set when in <NoBr>}
procedure ParseHTMLString(const S: string; ASectionList: TList;
AIncludeEvent: TIncludeType;
ASoundEvent: TSoundType; AMetaEvent: TMetaType; ALinkEvent: TLinkType);
procedure ParseTextString(const S: string; ASectionList: TList);
procedure FrameParseString(FrameViewer: TFrameViewerBase; FrameSet: TObject;
ALoadStyle: LoadStyleType; const FName, S: string; AMetaEvent: TMetaType);
function IsFrameString(ALoadStyle: LoadStyleType; const FName, S : string;
FrameViewer: TObject): boolean;
function TranslateCharset(const Content: string; var Charset: TFontCharset): boolean;
procedure InitializeFontSizes(Size: integer);
function PushNewProp(const Tag, AClass, AnID, APseudo, ATitle: string; AProp: TProperties): boolean;
procedure PopAProp(Tag: string);
implementation
uses
htmlsubs, htmlsbs1, htmlview, StylePars, UrlSubs;
Const
Tab = #9;
EofChar = #0;
var
Sy : Symb;
Section : TSection;
SectionList: TCellBasic;
MasterList: TSectionList;
CurrentURLTarget: TURLTarget;
InHref: boolean;
Attributes : TAttributeList;
BaseFontSize: integer;
InScript: boolean; {when in a <SCRIPT>}
TagIndex: integer;
BodyBlock: TBodyBlock;
ListLevel: integer;
TableLevel: integer;
Entities: TStringList;
InComment: boolean;
LinkSearch: boolean;
SIndex: integer;
IsUTF8: boolean;
type
SymString = string[12];
Const
MaxRes = 80;
MaxEndRes = 57;
ResWords : array[1..MaxRes] of SymString =
('HTML', 'TITLE', 'BODY', 'HEAD', 'B', 'I', 'H', 'EM', 'STRONG',
'U', 'CITE', 'VAR', 'TT', 'CODE', 'KBD', 'SAMP', 'OL', 'UL', 'DIR',
'MENU', 'DL',
'A', 'ADDRESS', 'BLOCKQUOTE', 'PRE', 'CENTER', 'TABLE', 'TD', 'TH',
'CAPTION', 'FORM', 'TEXTAREA', 'SELECT', 'OPTION', 'FONT', 'SUB', 'SUP',
'BIG', 'SMALL', 'P', 'MAP', 'FRAMESET', 'NOFRAMES', 'SCRIPT', 'DIV',
'S', 'STRIKE', 'TR', 'NOBR', 'STYLE', 'SPAN', 'COLGROUP', 'LABEL',
'THEAD', 'TBODY', 'TFOOT', 'OBJECT',
'LI', 'BR', 'HR', 'DD', 'DT', 'IMG', 'BASE', 'BUTTON','INPUT',
'SELECTED', 'BASEFONT', 'AREA', 'FRAME', 'PAGE', 'BGSOUND', 'WRAP',
'META', 'PANEL', 'WBR', 'LINK', 'COL', 'PARAM', 'READONLY');
ResSy : array[1..MaxRes] of Symb =
(htmlSy, TitleSy, BodySy, HeadSy, BSy, ISy, HeadingSy, EmSy, StrongSy,
USy, CiteSy, VarSy, TTSy, CodeSy, KbdSy, SampSy, OLSy, ULSy, DirSy,
MenuSy, DLSy, ASy, AddressSy, BlockQuoteSy, PreSy, CenterSy,TableSy,
TDsy, THSy, CaptionSy, FormSy, TextAreaSy, SelectSy, OptionSy, FontSy,
SubSy, SupSy, BigSy, SmallSy, PSy, MapSy, FrameSetSy, NoFramesSy,
ScriptSy, DivSy, SSy, StrikeSy, TRSy, NoBrSy, StyleSy, SpanSy, ColGroupSy,
LabelSy, THeadSy, TBodySy, TFootSy, ObjectSy,
LISy, BRSy, HRSy, DDSy, DTSy, ImageSy, BaseSy, ButtonSy,
InputSy, SelectedSy, BaseFontSy, AreaSy, FrameSy, PageSy, BgSoundSy,
WrapSy, MetaSy, PanelSy, WbrSy, LinkSy, ColSy, ParamSy, ReadonlySy);
{keep these in order with those above}
EndResSy : array[1..MaxEndRes] of Symb =
(HtmlEndSy, TitleEndSy, BodyEndSy, HeadEndSy, BEndSy, IEndSy, HeadingEndSy,
EmEndSy, StrongEndSy, UEndSy, CiteEndSy, VarEndSy, TTEndSy, CodeEndSy,
KbdEndSy, SampEndSy,
OLEndSy, ULEndSy, DirEndSy, MenuEndSy, DLEndSy, AEndSy, AddressEndSy,
BlockQuoteEndSy, PreEndSy, CenterEndSy, TableEndSy, TDEndSy, THEndSy,
CaptionEndSy, FormEndSy, TextAreaEndSy, SelectEndSy, OptionEndSy, FontEndSy,
SubEndSy, SupEndSy, BigEndSy, SmallEndSy, PEndSy, MapEndSy, FrameSetEndSy,
NoFramesEndSy, ScriptEndSy, DivEndSy, SEndSy, StrikeEndSy, TREndSy,
NoBrEndSy, StyleEndSy, SpanEndSy, ColGroupEndSy, LabelEndSy,
THeadEndSy, TBodyEndSy, TFootEndSy, ObjectEndSy);
Type
EParseError = class(Exception);
Var
LCh, Ch: Char;
LastChar: (lcOther, lcCR, lcLF);
Value : integer;
LCToken : TokenObj;
LoadStyle: LoadStyleType;
Buff, BuffEnd: PChar;
DocS: string;
HaveTranslated: boolean;
IBuff, IBuffEnd: PChar;
SIBuff: string;
IncludeEvent: TIncludeType;
CallingObject: TObject;
SaveLoadStyle: LoadStyleType;
SoundEvent: TSoundType;
MetaEvent: TMetaType;
LinkEvent: TLinkType;
function PropStackIndex: integer;
begin
Result := PropStack.Count-1;
end;
function SymbToStr(Sy: Symb): string;
var
I: integer;
begin
for I := 1 to MaxRes do
if ResSy[I] = Sy then
begin
Result := Lowercase(ResWords[I]);
Exit;
end;
Result := '';
end;
function EndSymbToStr(Sy: Symb): string;
var
I: integer;
begin
for I := 1 to MaxEndRes do
if EndResSy[I] = Sy then
begin
Result := Lowercase(ResWords[I]);
Exit;
end;
Result := '';
end;
function EndSymbFromSymb(Sy: Symb): Symb;
var
I: integer;
begin
for I := 1 to MaxEndRes do
if ResSy[I] = Sy then
begin
Result := EndResSy[I];
Exit;
end;
Result := HtmlSy; {won't match}
end;
function StrToSymb(const S: string): Symb;
var
I: integer;
S1: string;
begin
S1 := UpperCase(S);
for I := 1 to MaxRes do
if ResWords[I] = S1 then
begin
Result := ResSy[I];
Exit;
end;
Result := OtherSy;
end;
function GetNameValueParameter(var Name, Value: String): boolean; forward;
function ReadChar: char;
begin
case LoadStyle of
lsString:
begin
if Buff < BuffEnd then
begin
Result := Buff^;
Inc(Buff);
Inc(SIndex);
end
else
Result := EOFChar;
end;
lsInclude:
if IBuff < IBuffEnd then
begin
Result := IBuff^;
Inc(IBuff);
end
else
begin
IBuff := Nil; {reset for next include}
LoadStyle := SaveLoadStyle;
Result := ReadChar;
end;
else Result := #0; {to prevent warning msg}
end;
if (Integer(Buff) and $FFF = 0) {about every 4000 chars}
and not LinkSearch and Assigned(MasterList) and (DocS <> '') then
ThtmlViewer(CallingObject).htProgress(((Buff-PChar(DocS)) *MasterList.ProgressStart) div (BuffEnd-PChar(DocS)));
end;
{----------------GetchBasic; }
function GetchBasic: char; {read a character}
begin
LCh := ReadChar;
case LCh of {skip a ^J after a ^M or a ^M after a ^J}
^M: if LastChar = lcLF then
LCh := ReadChar;
^J: if LastChar = lcCR then
LCh := ReadChar;
end;
case LCh of
^M: LastChar := lcCR;
^J: begin
LastChar := lcLF;
LCh := ^M;
end;
else
begin
LastChar := lcOther;
if LCh = Tab then LCh := ' ';
end;
end;
Ch := UpCase(LCh);
if (LCh = EofChar) and InComment then
Raise EParseError.Create('Open Comment at End of HTML File');
Result := LCh
end;
{-------------GetCh}
PROCEDURE GetCh;
{Return next char in Lch, its uppercase value in Ch. Ignore comments}
var
Done, Comment: boolean;
function Peek: char; {take a look at the next char}
begin
case LoadStyle of
lsString:
begin
if Buff < BuffEnd then
Result := Buff^
else
Result := EOFChar;
end;
lsInclude:
if IBuff < IBuffEnd then
Result := IBuff^
else
begin
IBuff := Nil;
LoadStyle := SaveLoadStyle;
Result := Peek;
end;
else Result := #0; {to prevent warning msg}
end;
end;
procedure DoDashDash; {do the comment after a <!-- }
begin
repeat
while Ch <> '-' do GetChBasic; {get first '-'}
GetChBasic;
if Ch = '-' then {second '-'}
begin
while Ch = '-' do GetChBasic; {any number of '-'}
while (Ch = ' ') or (Ch = ^M) do GetChBasic; {eat white space}
if Ch = '!' then GetChBasic; {accept --!> also}
Done := Ch = '>';
end
else Done := False;
until Done;
InComment := False;
end;
procedure ReadToGT; {read to the next '>' }
begin
while Ch <> '>' do
GetChBasic;
InComment := False;
end;
procedure DoInclude;
{recursive suggestions by Ben Geerdes}
var
S, Name, Value: string;
Rest : string;
SL: TStringList;
SaveLCToken: TokenObj;
begin
S := '';
SaveLCToken := LCToken;
LCToken := TokenObj.Create;
try
GetChBasic;
while Ch in ['A'..'Z', '_', '0'..'9'] do
begin
S := S + LCh;
GetChBasic;
end;
SL := TStringList.Create;
while GetNameValueParameter(Name, Value) do
SL.Add(Name+'='+Value);
DoDashDash;
Rest := IBuff;
SIBuff := '';
IncludeEvent(CallingObject, S, SL, SIBuff);
if Length(SIBuff) > 0 then
begin
if LoadStyle <> lsInclude then
SaveLoadStyle := LoadStyle;
LoadStyle := lsInclude;
SIBuff := SIBuff + Rest;
IBuff := PAnsiChar(SIBuff);
IBuffEnd := IBuff+Length(SIBuff);
end;
finally
LCToken.Free;
LCToken := SaveLCToken;
end;
end;
begin {Getch}
repeat {in case a comment immediately follows another comment}
{comments may be either '<! stuff >' or '<!-- stuff -->' }
Comment := False;
GetchBasic;
if (Ch = '<') and not InScript then
begin
if Peek = '!' then
begin
GetChBasic;
Comment:=True;
InComment := True;
GetChBasic;
if Ch = '-' then
begin
GetChBasic;
if Ch = '-' then
begin
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -