📄 html.g
字号:
options {
language = "Sather";
}
/*
Based on the HTML 3.2 spec. by the W3 (http://www.w3.org)
Alexander Hinds & Terence Parr
Magelang Institute, Ltd.
Send comments to: parrt@jguru.com
v1.1 Terence Parr (updated to 2.6.0)
Fixed CCYTE->CCITE
Fixed def of COMMENT_DATA so it scarfs stuff correctly.
Also, fixed refs to (PCDATA)? -> (PCDATA)* because a comment
between PCDATA returns 2 PCDATA--ya need the loop not optional.
v1.0 Terence John Parr (version 2.5.0 of ANTLR required)
Fixed how whitespace is handled, removing some ambiguities; some
because of ANTLR lexical filtering in 2.5.0.
Changed (PCDATA)* loops to (PCDATA)? general since PCDATA matches
everything between valid tags (how could there be more than one
between tags?)
Made the DOCTYPE optional.
Reduced lookahead from k=5 to k=1 on the parser and number
of parser ambiguities to 2. Reduced lexer lookahead from 6
to 4; had to left factor a bunch of stuff.
List items couldn't contain nested lists...fixed it.
Fixed def of WORD so it can't be an INT. Removed '-' from WORD.
Fixed HEXNUM so it will allow letters A..F.
KNOWN ISSUES:
1. Does not handle "staggered" tags, eg: <p> <i> <p> <i>
2. Adheres somewhat strictly to the html spec, so many pages
won't parse without errors.
3. Doesn't convert &(a signifier) to its proper single char
representation
4. Checks only the syntax of element attributes, not the semantics,
e.g. won't verify that a base element's attribute is actually
called "href"
5. Tags split across lines, for example, <A (NEWLINE) some text >
won't be properly recognized. TJP: I think I fixed this.
7. Lines not counted properly due to the def'n of PCDATA - see the
alternate def'n for a possible fix. TJP: I think I fixed this.
*/
// Parser half of the grammar: consumes the tag/text tokens named in the
// rules below and checks document structure.
class HTML_PARSER extends Parser;
options {
	exportVocab = HTML;	// token vocabulary is exported under the name HTML
	k = 1;			// a single token of lookahead
}
// Start rule. Every piece is optional: leading PCDATA, a DOCTYPE, the
// OHTML/CHTML tokens, the head and the body. Each tag token may be
// followed by at most one PCDATA token.
document
	:	( PCDATA )?
		( DOCTYPE ( PCDATA )? )?
		( OHTML   ( PCDATA )? )?
		( head )?
		( body )?
		( CHTML   ( PCDATA )? )?
	;
// Document head: an optional OHEAD, one mandatory head_element, then any
// mix of PCDATA and further head_elements, and finally an optional CHEAD.
head
	:	( OHEAD ( PCDATA )? )?
		head_element
		( PCDATA | head_element )*
		( CHEAD ( PCDATA )? )?
	;
// One item allowed inside the head: a title/script/style sub-rule or one
// of the single-token elements ISINDEX, BASE, META, LINK.
head_element
	:	title		//bug need at least a title, rest optional
	|	script
	|	style
	|	ISINDEX
	|	BASE
	|	META
	|	LINK
	;
// Title element: OTITLE, at most one PCDATA token, CTITLE.
title
	:	OTITLE ( PCDATA )? CTITLE
	;
// Script element: OSCRIPT, then one or more tokens of any type other than
// CSCRIPT, then CSCRIPT.
script
	:	OSCRIPT ( ~CSCRIPT )+ CSCRIPT
	;

// Style element: same shape as script, delimited by OSTYLE / CSTYLE.
style
	:	OSTYLE ( ~CSTYLE )+ CSTYLE
	;
// Document body. OBODY and CBODY are both optional and may each be
// followed by PCDATA. The first content item must not be bare PCDATA
// (body_content_no_PCDATA); it is followed by one or more body_content.
body
	:	( OBODY ( PCDATA )* )?
		body_content_no_PCDATA
		( body_content )+
		( CBODY ( PCDATA )* )?
	;
// Body content that starts with a tag: either a body_tag or a text_tag
// (i.e. body_content without the plain-PCDATA alternative).
body_content_no_PCDATA
	:	body_tag
	|	text_tag
	;
// Block-level items that may appear directly in the body: a heading, a
// block, or an address element.
// The third alternative invokes the parser rule `address`
// (OADDRESS ... CADDRESS). It previously named a token ADDRESS; the lexer
// rules for this element are OADDRESS and CADDRESS, so that alternative
// could not match the address element and left rule `address` unused.
// NOTE(review): confirm no lexer rule named ADDRESS exists elsewhere.
body_tag
	:	heading
	|	block
	|	address
	;
// General body content: a body_tag or any text (PCDATA or text_tag).
body_content
	:	body_tag
	|	text
	;
/*revised*/
// Any of the six heading levels.
heading
	:	h1
	|	h2
	|	h3
	|	h4
	|	h5
	|	h6
	;

// Block-level elements; HR is a single token, the rest are sub-rules.
block
	:	paragraph
	|	list
	|	preformatted
	|	div
	|	center
	|	blockquote
	|	HR
	|	table
	;	//bug - ?FORM v %form, ISINDEX here too?
// Font-style elements.
font
	:	teletype
	|	italic
	|	bold
	|	underline
	|	strike
	|	big
	|	small
	|	subscript
	|	superscript
	;

// Phrase elements.
phrase
	:	emphasize
	|	strong
	|	definition
	|	code
	|	sample_output
	|	keyboard_text
	|	variable
	|	citation
	;

// Special text-level elements; IMG, BFONT and BR are single tokens.
special
	:	anchor
	|	IMG
	|	applet
	|	font_dfn
	|	BFONT
	|	map
	|	BR
	;

// Any tagged text-level element (note that form is included here).
text_tag
	:	font
	|	phrase
	|	special
	|	form
	;

// Text: plain PCDATA or a tagged text-level element.
text
	:	PCDATA
	|	text_tag
	;
/*end*/
/*BLOCK ELEMENTS*/
// Headings h1..h6. Each is its open token, any mix of block and text
// items (possibly none), and the matching close token.
h1	:	OH1 ( block | text )* CH1 ;

h2	:	OH2 ( block | text )* CH2 ;

h3	:	OH3 ( block | text )* CH3 ;

h4	:	OH4 ( block | text )* CH4 ;

h5	:	OH5 ( block | text )* CH5 ;

h6	:	OH6 ( block | text )* CH6 ;
// Address element: OADDRESS, zero or more PCDATA tokens, CADDRESS.
address
	:	OADDRESS ( PCDATA )* CADDRESS
	;
//NOTE: according to the standard, paragraphs can't contain block elements
//like HR. Netscape may insert these elements into paragraphs.
//We adhere strictly here.
// Paragraph: OPARA, zero or more text items, and an optional CPARA.
paragraph
	:	OPARA
		(
			/*	Rule body_content may also be just plain text because HTML is
				so loose.  When body puts body_content in a loop, ANTLR
				doesn't know whether you want it to match all the text as part
				of this paragraph (in the case where the </p> is missing) or
				if the body rule should scarf it.  This is analogous to the
				dangling-else clause.  I shut off the warning.
			*/
			options {
				generateAmbigWarnings=false;
			}
		:	text
		)*
		( CPARA )?
	;
// Any of the three list kinds.
list
	:	unordered_list
	|	ordered_list
	|	def_list
	;

// Unordered list: OULIST, optional PCDATA, one or more list_items, CULIST.
unordered_list
	:	OULIST ( PCDATA )* ( list_item )+ CULIST
	;

// Ordered list: OOLIST, optional PCDATA, one or more list_items, COLIST.
ordered_list
	:	OOLIST ( PCDATA )* ( list_item )+ COLIST
	;

// Definition list: ODLIST, optional PCDATA, one or more def_list_items,
// CDLIST.
def_list
	:	ODLIST ( PCDATA )* ( def_list_item )+ CDLIST
	;
// List item: OLITEM, one or more text items or nested lists, then an
// optional CLITEM which may itself be followed by PCDATA.
list_item
	:	OLITEM
		( text | list )+
		( CLITEM ( PCDATA )* )?
	;
// One entry of a definition list: a term (dt) or a description (dd).
def_list_item
	:	dt
	|	dd
	;

// Definition term: ODTERM, one or more text items, CDTERM, then any
// trailing PCDATA.
dt
	:	ODTERM ( text )+ CDTERM ( PCDATA )*
	;
// Definition description: ODDEF, one or more text or block items, the
// closing token, then any trailing PCDATA.
// The element is now closed with CDDEF instead of CDTERM. CDTERM is the
// close token used by rule dt, so a description opened with ODDEF could
// only be "closed" by the term's end tag.
// NOTE(review): CDDEF follows this grammar's O.../C... token naming; confirm
// the lexer defines it (its definition is not in the visible part of the file).
dd
	:	ODDEF ( text | block )+ CDDEF ( PCDATA )*
	;
// Directory list: ODIR, one or more list_items, CDIR.
dir
	:	ODIR ( list_item )+ CDIR
	;

// Menu list: OMENU, one or more list_items, CMENU.
menu
	:	OMENU ( list_item )+ CMENU
	;
// Preformatted text: OPRE, one or more text items, CPRE.
preformatted
	:	OPRE ( text )+ CPRE
	;

// Division: ODIV, any amount of body_content, CDIV.
div
	:	ODIV ( body_content )* CDIV		//semi-revised
	;

// Centered region: OCENTER, any amount of body_content, CCENTER.
center
	:	OCENTER ( body_content )* CCENTER	//semi-revised
	;

// Block quote: exactly one PCDATA token between OBQUOTE and CBQUOTE.
blockquote
	:	OBQUOTE PCDATA CBQUOTE
	;
// Form: OFORM, any mix of form fields and body_content, CFORM.
form
	:	OFORM
		( form_field | body_content )*
		CFORM
	;
// Table: OTABLE, an optional caption, optional PCDATA, one or more rows,
// CTABLE.
table
	:	OTABLE
		( caption )?
		( PCDATA )*
		( tr )+
		CTABLE
	;

// Table caption: OCAP, zero or more text items, CCAP.
caption
	:	OCAP ( text )* CCAP
	;

// Table row: O_TR, optional PCDATA, zero or more cells, and an optional
// C_TR that may be followed by PCDATA.
tr
	:	O_TR
		( PCDATA )*
		( th_or_td )*
		( C_TR ( PCDATA )* )?
	;

// Table cell (header or data share one token pair): O_TH_OR_TD, any amount
// of body_content, and an optional C_TH_OR_TD that may be followed by PCDATA.
th_or_td
	:	O_TH_OR_TD
		( body_content )*
		( C_TH_OR_TD ( PCDATA )* )?
	;
/*TEXT ELEMENTS*/
/*font style*/
// Font-style elements. Every rule has the same shape: its open token,
// one or more text items, and the matching close token.
teletype	:	OTTYPE  ( text )+ CTTYPE  ;

italic		:	OITALIC ( text )+ CITALIC ;

bold		:	OBOLD   ( text )+ CBOLD   ;

underline	:	OUNDER  ( text )+ CUNDER  ;

strike		:	OSTRIKE ( text )+ CSTRIKE ;

big		:	OBIG    ( text )+ CBIG    ;

small		:	OSMALL  ( text )+ CSMALL  ;

subscript	:	OSUB    ( text )+ CSUB    ;

superscript	:	OSUP    ( text )+ CSUP    ;
/*phrase elements*/
// Phrase elements. Every rule has the same shape: its open token, one or
// more text items, and the matching close token.
emphasize	:	OEM     ( text )+ CEM     ;

strong		:	OSTRONG ( text )+ CSTRONG ;

definition	:	ODEF    ( text )+ CDEF    ;

code		:	OCODE   ( text )+ CCODE   ;

sample_output	:	OSAMP   ( text )+ CSAMP   ;

keyboard_text	:	OKBD    ( text )+ CKBD    ;

variable	:	OVAR    ( text )+ CVAR    ;

citation	:	OCITE   ( text )+ CCITE   ;
/* form fields (combined with body_content elsewhere so no PCDATA on end) */
// A form control: the single-token INPUT, or a select / textarea element.
form_field
	:	INPUT
	|	select
	|	textarea
	;

// Select: OSELECT, optional PCDATA, one or more options, CSELECT.
select
	:	OSELECT ( PCDATA )* ( select_option )+ CSELECT
	;

// One option of a select: the SELOPT token followed by optional PCDATA.
select_option
	:	SELOPT ( PCDATA )*
	;

// Text area: OTAREA, zero or more PCDATA tokens, CTAREA.
textarea
	:	OTAREA ( PCDATA )* CTAREA
	;
/* special text level elements*/
// Anchor: OANCHOR, zero or more text items, CANCHOR.
anchor
	:	OANCHOR ( text )* CANCHOR
	;

// Applet: OAPPLET, at most one APARAM token, optional PCDATA, CAPPLET.
applet
	:	OAPPLET ( APARAM )? ( PCDATA )* CAPPLET
	;

//not w3-no blocks allowed; www.microsoft.com uses
// Font element: OFONT, zero or more text items, CFONT.
font_dfn
	:	OFONT ( text )* CFONT
	;

// Image map: OMAP, one or more AREA tokens, CMAP.
map
	:	OMAP ( AREA )+ CMAP
	;
// Lexer half of the grammar: turns HTML source characters into the tokens
// of the HTML vocabulary. The braced section after the options is target
// (Sather) code: printerr appends its argument and a newline to #ERR.
class HTML_LEXER extends Lexer;
options {
	k = 4;					// four characters of lookahead
	exportVocab = HTML;
	charVocabulary = '\3'..'\377';		// accepted input character range
	caseSensitive = false;
	filter = UNDEFINED_TOKEN;		// filter rule; defined outside this part of the file
}

{
	printerr( str : STR ) is
		#ERR + str + "\n";
	end;
}
/* STRUCTURAL tags
*/
// Document type declaration: the literal pieces "<!doctype", "html",
// "public", then a STRING and a closing '>'. The rule-level ignore=WS
// option names WS as the rule to skip between these elements.
DOCTYPE
options {
	ignore = WS;
}
	:	"<!doctype"
		"html"
		"public"
		STRING
		'>'
	;
// Open/close tokens for the html, head and body elements. Only the body
// open tag accepts attributes: optional whitespace followed by any number
// of ATTRs before the closing '>'.
OHTML	:	"<html>" ;

CHTML	:	"</html>" ;

OHEAD	:	"<head>" ;

CHEAD	:	"</head>" ;

OBODY	:	"<body" ( WS ( ATTR )* )? '>' ;

CBODY	:	"</body>" ;
/* HEAD ELEMENTS
*/
// Title and script open/close tags (no attributes accepted).
OTITLE	:	"<title>" ;

CTITLE	:	"</title>" ;

OSCRIPT	:	"<script>" ;

CSCRIPT	:	"</script>" ;

// Single-tag head elements. ISINDEX takes exactly one ATTR after the
// whitespace; META and LINK take one or more.
ISINDEX	:	"<isindex" WS ATTR '>' ;

META	:	"<meta" WS ( ATTR )+ '>' ;

LINK	:	"<link" WS ( ATTR )+ '>' ;
/* headings */
// Heading tags h1..h6. Each open tag accepts at most one attribute
// (whitespace followed by a single ATTR); close tags are fixed literals.
OH1
	:	"<h1" ( WS ATTR )? '>'
	;

CH1
	:	"</h1>"
	;

OH2
	:	"<h2" ( WS ATTR )? '>'
	;

CH2
	:	"</h2>"
	;

OH3
	:	"<h3" ( WS ATTR )? '>'
	;

CH3
	:	"</h3>"
	;

OH4
	:	"<h4" ( WS ATTR )? '>'
	;

CH4
	:	"</h4>"
	;

OH5
	:	"<h5" ( WS ATTR )? '>'
	;

CH5
	:	"</h5>"
	;

OH6
	:	"<h6" ( WS ATTR )? '>'
	;

CH6
	:	"</h6>"
	;
// Address open/close tags (no attributes accepted).
OADDRESS	:	"<address>" ;

CADDRESS	:	"</address>" ;

// Paragraph open tag, with at most one attribute.
OPARA		:	"<p" ( WS ATTR )? '>' ;

// Paragraph close tag.
CPARA		:	"</p>" ;	//it's optional
/*UNORDERED LIST*/
// List tags. Every open tag below accepts at most one attribute
// (whitespace followed by a single ATTR); close tags are fixed literals.
OULIST	:	"<ul" ( WS ATTR )? '>' ;

CULIST	:	"</ul>" ;

/*ORDERED LIST*/
OOLIST	:	"<ol" ( WS ATTR )? '>' ;

COLIST	:	"</ol>" ;

/*LIST ITEM*/
OLITEM	:	"<li" ( WS ATTR )? '>' ;

CLITEM	:	"</li>" ;
/*DEFINITION LIST*/
ODLIST
: "<dl" (WS ATTR)? '>'
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -