📄 html.g

📁 Java写的词法/语法分析器。可生成JAVA语言或者是C++的词法和语法分析器。并可产生语法分析树和对该树进行遍历
💻 G
📖 第 1 页 / 共 2 页
字号:
12 下一页
// File-level option section: sets the ANTLR `language` option to "Sather".
options {
	language = "Sather";
}

/*	
	Based on the HTML 3.2 spec. by the W3 (http://www.w3.org)
	Alexander Hinds & Terence Parr
	Magelang Institute, Ltd.
	Send comments to:  parrt@jguru.com

	v1.1	Terence Parr (updated to 2.6.0)

	Fixed CCYTE->CCITE
	Fixed def of COMMENT_DATA so it scarfs stuff correctly.
	Also, fixed refs to (PCDATA)? -> (PCDATA)* because a comment
		between PCDATA returns 2 PCDATA--ya need the loop not optional.

	v1.0	Terence John Parr (version 2.5.0 of ANTLR required)

	Fixed how whitespace is handled, removing some ambiguities; some
	because of ANTLR lexical filtering in 2.5.0.

	Changed (PCDATA)* loops to (PCDATA)? general since PCDATA matches
	everything between valid tags (how could there be more than one
	between tags?)

	Made the DOCTYPE optional.

	Reduced lookahead from k=5 to k=1 on the parser and number
	of parser ambiguities to 2.  Reduced lexer lookahead from 6
	to 4; had to left factor a bunch of stuff.

	List items couldn't contain nested lists...fixed it.

	Fixed def of WORD so it can't be an INT.  Removed '-' from WORD.

	Fixed HEXNUM so it will allow letters A..F.

	KNOWN ISSUES:

	1.  Does not handle "staggered" tags, eg: <p> <i> <p> <i>

	2.  Adheres somewhat strictly to the html spec, so many pages
	won't parse without errors.

	3.  Doesn't convert &(a signifier) to its proper single char 
	representation

	4.  Checks only the syntax of element attributes, not the semantics,
	e.g. won't verify that a base element's attribute is actually
	called "href" 

	5.  Tags split across lines, for example, <A (NEWLINE) some text >
	won't be properly recognized.  TJP: I think I fixed this.

	7.  Lines not counted properly due to the def'n of PCDATA - see the
	alternate def'n for a possible fix.  TJP: I think I fixed this.

*/

// Parser grammar for HTML documents.
// exportVocab names the exported token vocabulary "HTML"; k = 1 restricts
// the parser to a single token of lookahead.
class HTML_PARSER extends Parser;
options {
	exportVocab=HTML;
	k = 1;
}


// Start rule: optional DOCTYPE, optional <html> open tag, optional head,
// optional body and optional </html> close tag, in that order.
// Character data between these pieces is matched with (PCDATA)* rather than
// (PCDATA)?: as the v1.1 note in the file header explains, a comment in the
// middle of text makes the lexer deliver two PCDATA tokens, so an optional
// single token is not enough.
document
	: 	(PCDATA)* (DOCTYPE (PCDATA)*)?
		(OHTML (PCDATA)*)?
		(head)?
		(body)?
		(CHTML (PCDATA)*)?
	;

// Document head: optional <head> open tag, at least one head_element,
// then any mix of character data and further head elements, and an
// optional </head> close tag.
// The PCDATA after the open and close tags is looped ((PCDATA)*) because a
// comment splitting a text run yields more than one PCDATA token (see the
// v1.1 note in the file header).
head: 	(OHEAD (PCDATA)*)?
		head_element
		(PCDATA | head_element)*
		(CHEAD (PCDATA)*)?
	;

// A single element that may appear inside the head: a title, script or
// style sub-rule, or one of the single-token elements ISINDEX, BASE, META
// and LINK.
head_element
	:	title	//bug: a title should be mandatory and the rest optional; this rule does not enforce that
	|	script
	|	style
	|	ISINDEX
	|	BASE
	|	META
	|	LINK
	;

// Title element: open tag, any number of PCDATA tokens, close tag.
// (PCDATA)* instead of (PCDATA)? so that a title whose text is split into
// several PCDATA tokens by an embedded comment still parses (see the v1.1
// note in the file header).
title
	:	OTITLE (PCDATA)* CTITLE
	;

// Script element: the open tag, one or more tokens of any type other than
// CSCRIPT, then the close tag.
script
	:	OSCRIPT
		( ~CSCRIPT )+
		CSCRIPT
	;

// Style element: the open tag, one or more tokens of any type other than
// CSTYLE, then the close tag.
style
	:	OSTYLE
		( ~CSTYLE )+
		CSTYLE
	;

// Document body: optional <body> open tag (followed by any character
// data), one leading piece of non-PCDATA content, then zero or more further
// pieces of body content, and an optional </body> close tag.
// The trailing loop is (body_content)* rather than (body_content)+: with
// '+' a body holding exactly one element (for example a lone heading
// directly followed by </body>) was rejected because a second content item
// was mandatory.
body: 	( OBODY (PCDATA)* )?
		body_content_no_PCDATA
		( body_content )*
		( CBODY (PCDATA)* )?
	;

// Body content that begins with a tag: either a body-level tag or a
// text-level tag, but never bare PCDATA.
body_content_no_PCDATA
	:	body_tag
	|	text_tag
	;

// Body-level elements: headings, block elements and the address element.
// The third alternative invokes the parser rule `address`
// (OADDRESS ... CADDRESS). It previously named a token ADDRESS, for which
// no lexer rule is visible (only OADDRESS and CADDRESS are defined), which
// also left the `address` rule unreachable.
body_tag
	: 	heading | block | address
	;

// General body content: a body-level tag or any text (PCDATA or a
// text-level tag).
body_content
	:	body_tag
	|	text
	;


/*revised*/
// Any one of the six heading levels.
heading
	:	h1
	|	h2
	|	h3
	|	h4
	|	h5
	|	h6
	;

// Block-level elements: paragraph, the list family, preformatted text,
// div, center, blockquote, the HR token and table.
block
	:	paragraph | list | preformatted | div |
		center | blockquote | HR | table
	;	//bug - ?FORM v %form, ISINDEX here too?

// Font-style elements.
font
	:	teletype
	|	italic
	|	bold
	|	underline
	|	strike
	|	big
	|	small
	|	subscript
	|	superscript
	;

// Phrase-level elements.
phrase
	:	emphasize
	|	strong
	|	definition
	|	code
	|	sample_output
	|	keyboard_text
	|	variable
	|	citation
	;
	
// Special text-level elements: sub-rules anchor, applet, font_dfn and map,
// plus the single-token elements IMG, BFONT and BR.
special
	:	anchor
	|	IMG
	|	applet
	|	font_dfn
	|	BFONT
	|	map
	|	BR
	;

// Any tagged text-level construct: a font, phrase or special element, or
// a form.
text_tag
	:	font
	|	phrase
	|	special
	|	form
	;

// Text: either raw character data or a tagged text-level construct.
text
	:	PCDATA
	|	text_tag
	;

/*end*/


/*BLOCK ELEMENTS*/

// Heading rules h1..h6. Each one matches its own opening token, then any
// mix of block elements and text, then the closing token of the same level.
h1	:	OH1 (block | text)* CH1
	;
h2	:	OH2 (block | text)* CH2
	;
h3	:	OH3 (block | text)* CH3
	;
h4	:	OH4 (block | text)* CH4
	;
h5	:	OH5 (block | text)* CH5
	;
h6	:	OH6 (block | text)* CH6
	;

// Address element: open tag, any number of PCDATA tokens, close tag.
address
	:	OADDRESS (PCDATA)* CADDRESS
	;

//NOTE:  according to the standard, paragraphs can't contain block elements
//like HR.  Netscape may insert these elements into paragraphs.
//We adhere strictly here.

// Paragraph: the open tag, any amount of text, and an optional close tag.
paragraph
	:	OPARA
		(
			/*	Rule body_content may also be just plain text because HTML is
				so loose.  When body puts body_content in a loop, ANTLR
				doesn't know whether you want it to match all the text as part
				of this paragraph (in the case where the </p> is missing) or
				if the body rule should scarf it.  This is analogous to the
				dangling-else clause.  I shut off the warning.
			*/
			options {
				generateAmbigWarnings=false;
			}
		:	text
		)*
		(CPARA)?	// the closing </p> may be omitted
	;

// The list family: unordered, ordered and definition lists, plus the
// directory (dir) and menu lists.
// dir and menu are included because both rules are defined in this grammar
// but were not reachable from any other rule; HTML 3.2 groups DIR and MENU
// with UL and OL as list elements. Their first tokens (ODIR, OMENU) are
// distinct from those of the other alternatives.
list:	unordered_list
	|	ordered_list
	|	def_list
	|	dir
	|	menu
	;

// Unordered list: open tag, optional character data, one or more list
// items, close tag.
unordered_list
	:	OULIST (PCDATA)* (list_item)+ CULIST
	;

// Ordered list: same shape as unordered_list with the OOLIST/COLIST tokens.
ordered_list
	:	OOLIST (PCDATA)* (list_item)+ COLIST
	;

// Definition list: open tag, optional character data, one or more
// definition-list items (dt or dd), close tag.
def_list
	:	ODLIST (PCDATA)* (def_list_item)+ CDLIST 
	;

// List item: open tag, one or more pieces of text or nested lists, and an
// optional close tag that may be followed by character data.
list_item
	:	OLITEM ( text | list )+ (CLITEM (PCDATA)*)?
	;
	
// One entry of a definition list: a term (dt) or a definition (dd).
def_list_item
	:	dt
	|	dd
	;

// Definition term: open tag, one or more pieces of text, a mandatory close
// tag, then optional character data.
dt	:	ODTERM (text)+ CDTERM (PCDATA)*
	;

// Definition description: open tag, one or more pieces of text or block
// elements, the close tag, then optional character data.
// The close token is CDDEF, the partner of ODDEF under the O*/C* naming
// used for every other element in this grammar. The rule previously ended
// with CDTERM, the closer belonging to dt.
// NOTE(review): the CDDEF lexer rule is expected in the part of the lexer
// not shown here - confirm it exists.
dd	:	ODDEF (text | block)+ CDDEF (PCDATA)*
	;

// Directory list: open tag, one or more list items, close tag.
dir	:	ODIR (list_item)+ CDIR
	;

// Menu list: open tag, one or more list items, close tag.
menu:	OMENU (list_item)+ CMENU
	;

// Preformatted text: open tag, one or more pieces of text, close tag.
preformatted
	:	OPRE (text)+ CPRE
	;

// Div element: open tag, any amount of body content, close tag.
div	:	ODIV (body_content)* CDIV		//semi-revised
	;

// Center element: open tag, any amount of body content, close tag.
center
	:	OCENTER (body_content)* CCENTER //semi-revised
	;

// Block quote: open tag, exactly one PCDATA token, close tag.
// NOTE(review): unlike most rules here, which use (PCDATA)*, this requires
// exactly one PCDATA token, so an empty quote, a quote containing tags, or
// text split in two by a comment will not match - confirm this is intended.
blockquote
	:	OBQUOTE PCDATA CBQUOTE
	;

// Form: open tag, any mix of form fields and body content, close tag.
form:	OFORM (form_field | body_content)* CFORM
	;

// Table: open tag, optional caption, optional character data, one or more
// rows, close tag.
table
	:	OTABLE (caption)? (PCDATA)* (tr)+ CTABLE
	;

// Table caption: open tag, any amount of text, close tag.
caption
	:	OCAP (text)* CCAP	
	;

// Table row: open tag, optional character data, any number of cells, and
// an optional close tag that may be followed by character data.
tr	:	O_TR (PCDATA)* (th_or_td)* (C_TR (PCDATA)*)? 
	;

// Table cell (header or data share one token pair): open tag, any amount
// of body content, and an optional close tag that may be followed by
// character data.
th_or_td
	:	O_TH_OR_TD (body_content)* (C_TH_OR_TD (PCDATA)*)?
	;

/*TEXT ELEMENTS*/

/*font style*/

// Font-style element rules. Every rule below has the same shape: its own
// opening token, one or more pieces of text, and the matching closing token.
teletype
	:	OTTYPE ( text )+ CTTYPE
	;

italic
	:	OITALIC ( text )+ CITALIC
	;

bold:	OBOLD ( text )+ CBOLD
	;

underline
	:	OUNDER ( text )+ CUNDER
	;

strike
	:	OSTRIKE ( text )+ CSTRIKE
	;

big	:	OBIG ( text )+ CBIG
	;

small
	:	OSMALL ( text )+ CSMALL
	;

subscript
	:	OSUB ( text )+ CSUB
	;

superscript
	:	OSUP ( text )+ CSUP
	;

	/*phrase elements*/

// Phrase element rules. Every rule below has the same shape: its own
// opening token, one or more pieces of text, and the matching closing token.
emphasize
	:	OEM ( text )+ CEM
	;

strong
	:	OSTRONG ( text )+ CSTRONG
	;

definition
	:	ODEF ( text )+ CDEF
	;

code
	:	OCODE ( text )+ CCODE
	;

sample_output
	:	OSAMP ( text )+ CSAMP
	;

keyboard_text
	:	OKBD ( text )+ CKBD
	;

variable
	:	OVAR ( text )+ CVAR
	;

citation
	:	OCITE ( text )+ CCITE
	;

/*	form fields (combined with body_content elsewhere so no PCDATA on end) */
// A form field: the INPUT token, a select element or a textarea element.
form_field
	:	INPUT | select | textarea
	;

// Select element: open tag, optional character data, one or more options,
// close tag.
select
	:	OSELECT (PCDATA)* (select_option)+ CSELECT
	;

// One option of a select: the SELOPT token followed by optional character
// data (no closing token is matched).
select_option
	:	SELOPT (PCDATA)*
	;

// Textarea element: open tag, any number of PCDATA tokens, close tag.
textarea
	:	OTAREA (PCDATA)* CTAREA
	;

/*	special text level elements*/
// Anchor element: open tag, any amount of text, close tag.
anchor
	:	OANCHOR (text)* CANCHOR
	;

// Applet element: open tag, at most one APARAM token, any number of PCDATA
// tokens, close tag.
applet
	:	OAPPLET (APARAM)? (PCDATA)* CAPPLET
	;

//not w3-no blocks allowed; www.microsoft.com uses
// Font element: open tag, any amount of text, close tag.
font_dfn
	:	OFONT (text)* CFONT	
	;

// Image map: open tag, one or more AREA tokens, close tag.
map	:	OMAP (AREA)+ CMAP
	;

// Lexer grammar producing the tokens consumed by the parser.
class HTML_LEXER extends Lexer;
options {	
	k = 4;							// four characters of lookahead
	exportVocab=HTML;				// same vocabulary name as the parser exports
	charVocabulary = '\3'..'\377';	// character range the lexer accepts
	caseSensitive=false;			// tag literals match regardless of case
	filter=UNDEFINED_TOKEN;			// filter rule; its definition is not in this part of the file
}
// Action block in the target language: defines a printerr helper that
// takes a string argument.
{
  printerr( str : STR ) is 
    #ERR + str + "\n";
  end; 
}

/*	STRUCTURAL tags
*/

// DOCTYPE declaration: the literals "<!doctype", "html" and "public", a
// STRING, then '>'. The rule-level option ignore=WS is set so the pieces
// may be separated by whitespace. STRING and WS are defined outside this
// part of the file.
DOCTYPE 
options {
ignore=WS;
}
	: "<!doctype" "html" "public" STRING '>'
	;

// Opening <html> tag (no attributes accepted).
OHTML
 	: 	"<html>"
	; 

// Closing </html> tag.
CHTML
	: 	"</html>"
	;

// Opening <head> tag (no attributes accepted).
OHEAD
	: 	"<head>"
	;

// Closing </head> tag.
CHEAD
	: 	"</head>"
	;

// Opening <body> tag: optionally whitespace followed by any number of
// attributes before the closing '>'. WS and ATTR are defined outside this
// part of the file.
OBODY
	:	"<body" (WS (ATTR )*)? '>' 
	;

// Closing </body> tag.
CBODY
	:	"</body>"
	;


/*	HEAD ELEMENTS
*/

// Opening <title> tag.
OTITLE
	: "<title>"
	;

// Closing </title> tag.
CTITLE
	: "</title>"
	;


// Opening <script> tag (matched only in this exact, attribute-free form).
OSCRIPT
	: 	"<script>" 
	;

// Closing </script> tag.
CSCRIPT
	:	"</script>"
	;

// <isindex ...> tag: whitespace and exactly one attribute are required.
ISINDEX
 	: 	"<isindex" WS ATTR '>'
	;

// <meta ...> tag: whitespace and one or more attributes are required.
META
	: 	"<meta" WS (ATTR)+ '>'
	;

// <link ...> tag: whitespace and one or more attributes are required.
LINK
	:	"<link" WS (ATTR)+ '>'	
	;


/* headings */

// Heading tags for levels 1-6. Each opening tag accepts at most one
// attribute (whitespace followed by ATTR) before '>'; each closing tag is a
// fixed literal.
OH1	:	"<h1" (WS ATTR)? '>' 
	;

CH1	:	"</h1>" 
	;

OH2	:	"<h2" (WS ATTR)?'>' 
	;

CH2	:	"</h2>" 
	;

OH3	:	"<h3" (WS ATTR)? '>' 
	;

CH3	:	"</h3>" 
	;

OH4	:	"<h4" (WS ATTR)? '>' 
	;

CH4	:	"</h4>" 
	;

OH5	:	"<h5" (WS ATTR)? '>' 
	;

CH5	:	"</h5>" 
	;

OH6	:	"<h6" (WS ATTR)? '>' 
	;

CH6	:	"</h6>" 
	;

// Opening <address> tag (no attributes accepted).
OADDRESS
	:	"<address>" 
	;

// Closing </address> tag.
CADDRESS
	:	"</address>"
	;

// Opening <p> tag with at most one attribute.
OPARA
	:	"<p" (WS ATTR)? '>' 
	;

// Closing </p> tag.
CPARA
	: 	"</p>"		//it's optional
	;

		/*UNORDERED LIST*/
// Opening <ul> tag with at most one attribute.
OULIST
	:	"<ul" (WS ATTR)? '>' 
	;

// Closing </ul> tag.
CULIST
	:	"</ul>"
	;

		/*ORDERED LIST*/
// Opening <ol> tag with at most one attribute.
OOLIST
	:	"<ol" (WS ATTR)? '>'
	;

// Closing </ol> tag.
COLIST
	:	"</ol>"
	;

		/*LIST ITEM*/

// Opening <li> tag with at most one attribute.
OLITEM
	:	"<li" (WS ATTR)? '>'
	;

// Closing </li> tag.
CLITEM
	:	"</li>"
	;

		/*DEFINITION LIST*/ 

ODLIST 
	:	"<dl" (WS ATTR)? '>'
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -