sgmlparser.tcl

来自「Linux下的MSN聊天程序源码」· TCL 代码 · 共 2,131 行 · 第 1/5 页

TCL
2,131
字号
# sgmlparser.tcl --##	This file provides the generic part of a parser for SGML-based#	languages, namely HTML and XML.##	NB.  It is a misnomer.  There is no support for parsing#	arbitrary SGML as such.##	See sgml.tcl for variable definitions.## Copyright (c) 1998-2003 Zveno Pty Ltd# http://www.zveno.com/## See the file "LICENSE" in this distribution for information on usage and# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.## $Id: sgmlparser.tcl 6581 2006-05-13 08:26:49Z tjikkun $package require sgml 1.9package require uri 1.1package provide sgmlparser 1.1.1namespace eval sgml {    namespace export tokenise parseEvent    namespace export parseDTD    # NB. Most namespace variables are defined in sgml-8.[01].tcl    # to account for differences between versions of Tcl.    # This especially includes the regular expressions used.    variable ParseEventNum    if {![info exists ParseEventNum]} {	set ParseEventNum 0    }    variable ParseDTDnum    if {![info exists ParseDTDNum]} {	set ParseDTDNum 0    }    variable declExpr [cl $::sgml::Wsp]*([cl ^$::sgml::Wsp]+)[cl $::sgml::Wsp]*([cl ^>]*)    variable EntityExpr [cl $::sgml::Wsp]*(%[cl $::sgml::Wsp])?[cl $::sgml::Wsp]*($::sgml::Name)[cl $::sgml::Wsp]+(.*)    #variable MarkupDeclExpr <([cl ^$::sgml::Wsp>]+)[cl $::sgml::Wsp]*([cl ^$::sgml::Wsp]+)[cl $::sgml::Wsp]*([cl ^>]*)>    #variable MarkupDeclSub "} {\\1} {\\2} {\\3} {"    variable MarkupDeclExpr <[cl $::sgml::Wsp]*([cl ^$::sgml::Wsp>]+)[cl $::sgml::Wsp]*([cl ^>]*)>    variable MarkupDeclSub "\} {\\1} {\\2} \{"    variable ExternalEntityExpr ^(PUBLIC|SYSTEM)[cl $::sgml::Wsp]+("|')(.*?)\\2([cl $::sgml::Wsp]+("|')(.*?)\\2)?([cl $::sgml::Wsp]+NDATA[cl $::sgml::Wsp]+($::xml::Name))?\$    variable StdOptions    array set StdOptions [list \	-elementstartcommand		[namespace current]::noop	\	-elementendcommand		[namespace current]::noop	\	-characterdatacommand		[namespace current]::noop	\	-processinginstructioncommand	[namespace current]::noop	\	-externalentitycommand		{}				\	-xmldeclcommand			[namespace current]::noop	\	-doctypecommand			[namespace current]::noop	\	-commentcommand			[namespace current]::noop	\	-entitydeclcommand		[namespace current]::noop	\	-unparsedentitydeclcommand	[namespace current]::noop	\	-parameterentitydeclcommand	[namespace current]::noop	\	-notationdeclcommand		[namespace current]::noop	\	-elementdeclcommand		[namespace current]::noop	\	-attlistdeclcommand		[namespace current]::noop	\	-paramentityparsing		1				\	-defaultexpandinternalentities	1				\	-startdoctypedeclcommand	[namespace current]::noop	\	-enddoctypedeclcommand		[namespace current]::noop	\	-entityreferencecommand		{}				\	-warningcommand			[namespace current]::noop	\	-errorcommand			[namespace current]::Error	\	-final				1				\	-validate			0				\	-baseuri			{}				\	-name				{}				\	-cmd				{}				\	-emptyelement			[namespace current]::EmptyElement	\	-parseattributelistcommand	[namespace current]::noop	\	-parseentitydeclcommand		[namespace current]::noop	\	-normalize			1				\	-internaldtd			{}				\	-reportempty			0				\	-ignorewhitespace		0				\    ]}# sgml::tokenise --##	Transform the given HTML/XML text into a Tcl list.## Arguments:#	sgml		text to tokenize#	elemExpr	RE to recognise tags#	elemSub		transform for matched tags#	args		options## Valid Options:#       -internaldtdvariable#	-final		boolean		True if no more data is to be supplied#	-statevariable	varName		Name of a variable used to store info## Results:#	Returns a Tcl list representing the document.proc sgml::tokenise {sgml elemExpr elemSub args} {    array set options {-final 1}    array set options $args    set options(-final) [Boolean $options(-final)]    # If the data is not final then there must be a variable to store    # unused data.    if {!$options(-final) && ![info exists options(-statevariable)]} {	return -code error {option "-statevariable" required if not final}    }    # Pre-process stage    #    # Extract the internal DTD subset, if any    catch {upvar #0 $options(-internaldtdvariable) dtd}    if {[regexp {<!DOCTYPE[^[<]+\[([^]]+)\]} $sgml discard dtd]} {	regsub {(<!DOCTYPE[^[<]+)(\[[^]]+\])} $sgml {\1\&xml:intdtd;} sgml    }    # Protect Tcl special characters    regsub -all {([{}\\])} $sgml {\\\1} sgml    # Do the translation    if {[info exists options(-statevariable)]} {	# Mats: Several rewrites here to handle -final 0 option.	# If any cached unparsed xml (state(leftover)), prepend it.	upvar #0 $options(-statevariable) state	if {[string length $state(leftover)]} {	    regsub -all $elemExpr $state(leftover)$sgml $elemSub sgml	    set state(leftover) {}	} else {	    regsub -all $elemExpr $sgml $elemSub sgml	}	set sgml "{} {} {} \{$sgml\}"	# Performance note (Tcl 8.0):	#	Use of lindex, lreplace will cause parsing to list object	# This RE only fixes chopped inside tags, not chopped text.	if {[regexp {^([^<]*)(<[^>]*$)} [lindex $sgml end] x text rest]} {	    set sgml [lreplace $sgml end end $text]	    # Mats: unmatched stuff means that it is chopped off. Cache it for next round.	    set state(leftover) $rest	}	# Patch from bug report #596959, Marshall Rose	if {[string compare [lindex $sgml 4] ""]} {	    set sgml [linsert $sgml 0 {} {} {} {} {}]	}    } else {	# Performance note (Tcl 8.0):	#	In this case, no conversion to list object is performed	# Mats: This fails if not -final and $sgml is chopped off right in a tag.		regsub -all $elemExpr $sgml $elemSub sgml	set sgml "{} {} {} \{$sgml\}"    }    return $sgml}# sgml::parseEvent --##	Produces an event stream for a XML/HTML document,#	given the Tcl list format returned by tokenise.##	This procedure checks that the document is well-formed,#	and throws an error if the document is found to be not#	well formed.  Warnings are passed via the -warningcommand script.##	The procedure only check for well-formedness,#	no DTD is required.  However, facilities are provided for entity expansion.## Arguments:#	sgml		Instance data, as a Tcl list.#	args		option/value pairs## Valid Options:#	-final			Indicates end of document data#	-validate		Boolean to enable validation#	-baseuri		URL for resolving relative URLs#	-elementstartcommand	Called when an element starts#	-elementendcommand	Called when an element ends#	-characterdatacommand	Called when character data occurs#	-entityreferencecommand	Called when an entity reference occurs#	-processinginstructioncommand	Called when a PI occurs#	-externalentitycommand	Called for an external entity reference##	-xmldeclcommand		Called when the XML declaration occurs#	-doctypecommand		Called when the document type declaration occurs#	-commentcommand		Called when a comment occurs#	-entitydeclcommand	Called when a parsed entity is declared#	-unparsedentitydeclcommand	Called when an unparsed external entity is declared#	-parameterentitydeclcommand	Called when a parameter entity is declared#	-notationdeclcommand	Called when a notation is declared#	-elementdeclcommand	Called when an element is declared#	-attlistdeclcommand	Called when an attribute list is declared#	-paramentityparsing	Boolean to enable/disable parameter entity substitution#	-defaultexpandinternalentities	Boolean to enable/disable expansion of entities declared in internal DTD subset##	-startdoctypedeclcommand	Called when the Doc Type declaration starts (see also -doctypecommand)#	-enddoctypedeclcommand	Called when the Doc Type declaration ends (see also -doctypecommand)##	-errorcommand		Script to evaluate for a fatal error#	-warningcommand		Script to evaluate for a reportable warning#	-statevariable		global state variable#	-normalize		whether to normalize names#	-reportempty		whether to include an indication of empty elements#	-ignorewhitespace	whether to automatically strip whitespace## Results:#	The various callback scripts are invoked.#	Returns empty string.## BUGS:#	If command options are set to empty string then they should not be invoked.proc sgml::parseEvent {sgml args} {    variable Wsp    variable noWsp    variable Nmtoken    variable Name    variable ParseEventNum    variable StdOptions    array set options [array get StdOptions]    catch {array set options $args}    # Mats:    # If the data is not final then there must be a variable to persistently store the parse state.    if {!$options(-final) && ![info exists options(-statevariable)]} {	return -code error {option "-statevariable" required if not final}    }        foreach {opt value} [array get options *command] {	if {[string compare $opt "-externalentitycommand"] && ![string length $value]} {	    set options($opt) [namespace current]::noop	}    }    if {![info exists options(-statevariable)]} {	set options(-statevariable) [namespace current]::ParseEvent[incr ParseEventNum]    }    if {![info exists options(entities)]} {	set options(entities) [namespace current]::Entities$ParseEventNum	array set $options(entities) [array get [namespace current]::EntityPredef]    }    if {![info exists options(extentities)]} {	set options(extentities) [namespace current]::ExtEntities$ParseEventNum    }    if {![info exists options(parameterentities)]} {	set options(parameterentities) [namespace current]::ParamEntities$ParseEventNum    }    if {![info exists options(externalparameterentities)]} {	set options(externalparameterentities) [namespace current]::ExtParamEntities$ParseEventNum    }    if {![info exists options(elementdecls)]} {	set options(elementdecls) [namespace current]::ElementDecls$ParseEventNum    }    if {![info exists options(attlistdecls)]} {	set options(attlistdecls) [namespace current]::AttListDecls$ParseEventNum    }    if {![info exists options(notationdecls)]} {	set options(notationdecls) [namespace current]::NotationDecls$ParseEventNum    }    if {![info exists options(namespaces)]} {	set options(namespaces) [namespace current]::Namespaces$ParseEventNum    }    # For backward-compatibility    catch {set options(-baseuri) $options(-baseurl)}    # Choose an external entity resolver    if {![string length $options(-externalentitycommand)]} {	if {$options(-validate)} {	    set options(-externalentitycommand) [namespace code ResolveEntity]	} else {	    set options(-externalentitycommand) [namespace code noop]	}    }    upvar #0 $options(-statevariable) state    upvar #0 $options(entities) entities    # Mats:    # The problem is that the state is not maintained when -final 0 !    # I've switched back to an older version here.         if {![info exists state(line)]} {	# Initialise the state variable	array set state {	    mode normal	    haveXMLDecl 0	    haveDocElement 0	    inDTD 0	    context {}	    stack {}	    line 0	    defaultNS {}	    defaultNSURI {}	}    }    foreach {tag close param text} $sgml {	# Keep track of lines in the input	incr state(line) [regsub -all \n $param {} discard]	incr state(line) [regsub -all \n $text {} discard]	# If the current mode is cdata or comment then we must undo what the	# regsub has done to reconstitute the data	set empty {}	switch $state(mode) {	    comment {		# This had "[string length $param] && " as a guard -		# can't remember why :-(		if {[regexp ([cl ^-]*)--\$ $tag discard comm1]} {		    # end of comment (in tag)		    set tag {}		    set close {}		    set state(mode) normal		    DeProtect1 $options(-commentcommand) $state(commentdata)<$comm1		    unset state(commentdata)		} elseif {[regexp ([cl ^-]*)--\$ $param discard comm1]} {		    # end of comment (in attributes)		    DeProtect1 $options(-commentcommand) $state(commentdata)<$close$tag>$comm1		    unset state(commentdata)		    set tag {}		    set param {}		    set close {}		    set state(mode) normal		} elseif {[regexp ([cl ^-]*)-->(.*) $text discard comm1 text]} {		    # end of comment (in text)		    DeProtect1 $options(-commentcommand) $state(commentdata)<$close$tag$param>$comm1		    unset state(commentdata)		    set tag {}		    set param {}		    set close {}		    set state(mode) normal		} else {		    # comment continues		    append state(commentdata) <$close$tag$param>$text		    continue		}	    }	    cdata {		if {[string length $param] && [regexp ([cl ^\]]*)\]\][cl $Wsp]*\$ $tag discard cdata1]} {		    # end of CDATA (in tag)		    PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$cdata1]		    set text [subst -novariable -nocommand $text]		    set tag {}		    unset state(cdata)		    set state(mode) normal		} elseif {[regexp ([cl ^\]]*)\]\][cl $Wsp]*\$ $param discard cdata1]} {		    # end of CDATA (in attributes)		    PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$tag$cdata1]		    set text [subst -novariable -nocommand $text]		    set tag {}		    set param {}		    unset state(cdata)		    set state(mode) normal		} elseif {[regexp (.*)\]\][cl $Wsp]*>(.*) $text discard cdata1 text]} {		    # end of CDATA (in text)		    PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$tag$param>$cdata1]		    set text [subst -novariable -nocommand $text]		    set tag {}		    set param {}		    set close {}		    unset state(cdata)		    set state(mode) normal		} else {		    # CDATA continues		    append state(cdata) [subst -nocommand -novariable <$close$tag$param>$text]		    continue		}	    }	    continue {		# We're skipping elements looking for the close tag		switch -glob -- [string length $tag],[regexp {^\?|!.*} $tag],$close {		    0,* {			continue		    }		    *,0, {			if {![string compare $tag $state(continue:tag)]} {			    set empty [uplevel #0 $options(-emptyelement) [list $tag $param $empty]]			    if {![string length $empty]} {				incr state(continue:level)			    }			}			continue		    }		    *,0,/ {			if {![string compare $tag $state(continue:tag)]} {			    incr state(continue:level) -1			}			if {!$state(continue:level)} {			    unset state(continue:tag)			    unset state(continue:level)			    set state(mode) {}			}		    }		    default {			continue		    }		}	    }	    default {		# The trailing slash on empty elements can't be automatically separated out		# in the RE, so we must do it here.		regexp (.*)(/)[cl $Wsp]*$ $param discard param empty	    }	}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?