sgmlparser.tcl
来自「Linux下的MSN聊天程序源码」· TCL 代码 · 共 2,131 行 · 第 1/5 页
TCL
2,131 行
# sgmlparser.tcl --## This file provides the generic part of a parser for SGML-based# languages, namely HTML and XML.## NB. It is a misnomer. There is no support for parsing# arbitrary SGML as such.## See sgml.tcl for variable definitions.## Copyright (c) 1998-2003 Zveno Pty Ltd# http://www.zveno.com/## See the file "LICENSE" in this distribution for information on usage and# redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.## $Id: sgmlparser.tcl 6581 2006-05-13 08:26:49Z tjikkun $package require sgml 1.9package require uri 1.1package provide sgmlparser 1.1.1namespace eval sgml { namespace export tokenise parseEvent namespace export parseDTD # NB. Most namespace variables are defined in sgml-8.[01].tcl # to account for differences between versions of Tcl. # This especially includes the regular expressions used. variable ParseEventNum if {![info exists ParseEventNum]} { set ParseEventNum 0 } variable ParseDTDnum if {![info exists ParseDTDNum]} { set ParseDTDNum 0 } variable declExpr [cl $::sgml::Wsp]*([cl ^$::sgml::Wsp]+)[cl $::sgml::Wsp]*([cl ^>]*) variable EntityExpr [cl $::sgml::Wsp]*(%[cl $::sgml::Wsp])?[cl $::sgml::Wsp]*($::sgml::Name)[cl $::sgml::Wsp]+(.*) #variable MarkupDeclExpr <([cl ^$::sgml::Wsp>]+)[cl $::sgml::Wsp]*([cl ^$::sgml::Wsp]+)[cl $::sgml::Wsp]*([cl ^>]*)> #variable MarkupDeclSub "} {\\1} {\\2} {\\3} {" variable MarkupDeclExpr <[cl $::sgml::Wsp]*([cl ^$::sgml::Wsp>]+)[cl $::sgml::Wsp]*([cl ^>]*)> variable MarkupDeclSub "\} {\\1} {\\2} \{" variable ExternalEntityExpr ^(PUBLIC|SYSTEM)[cl $::sgml::Wsp]+("|')(.*?)\\2([cl $::sgml::Wsp]+("|')(.*?)\\2)?([cl $::sgml::Wsp]+NDATA[cl $::sgml::Wsp]+($::xml::Name))?\$ variable StdOptions array set StdOptions [list \ -elementstartcommand [namespace current]::noop \ -elementendcommand [namespace current]::noop \ -characterdatacommand [namespace current]::noop \ -processinginstructioncommand [namespace current]::noop \ -externalentitycommand {} \ -xmldeclcommand [namespace current]::noop \ -doctypecommand [namespace current]::noop \ -commentcommand [namespace current]::noop \ -entitydeclcommand [namespace current]::noop \ -unparsedentitydeclcommand [namespace current]::noop \ -parameterentitydeclcommand [namespace current]::noop \ -notationdeclcommand [namespace current]::noop \ -elementdeclcommand [namespace current]::noop \ -attlistdeclcommand [namespace current]::noop \ -paramentityparsing 1 \ -defaultexpandinternalentities 1 \ -startdoctypedeclcommand [namespace current]::noop \ -enddoctypedeclcommand [namespace current]::noop \ -entityreferencecommand {} \ -warningcommand [namespace current]::noop \ -errorcommand [namespace current]::Error \ -final 1 \ -validate 0 \ -baseuri {} \ -name {} \ -cmd {} \ -emptyelement [namespace current]::EmptyElement \ -parseattributelistcommand [namespace current]::noop \ -parseentitydeclcommand [namespace current]::noop \ -normalize 1 \ -internaldtd {} \ -reportempty 0 \ -ignorewhitespace 0 \ ]}# sgml::tokenise --## Transform the given HTML/XML text into a Tcl list.## Arguments:# sgml text to tokenize# elemExpr RE to recognise tags# elemSub transform for matched tags# args options## Valid Options:# -internaldtdvariable# -final boolean True if no more data is to be supplied# -statevariable varName Name of a variable used to store info## Results:# Returns a Tcl list representing the document.proc sgml::tokenise {sgml elemExpr elemSub args} { array set options {-final 1} array set options $args set options(-final) [Boolean $options(-final)] # If the data is not final then there must be a variable to store # unused data. if {!$options(-final) && ![info exists options(-statevariable)]} { return -code error {option "-statevariable" required if not final} } # Pre-process stage # # Extract the internal DTD subset, if any catch {upvar #0 $options(-internaldtdvariable) dtd} if {[regexp {<!DOCTYPE[^[<]+\[([^]]+)\]} $sgml discard dtd]} { regsub {(<!DOCTYPE[^[<]+)(\[[^]]+\])} $sgml {\1\&xml:intdtd;} sgml } # Protect Tcl special characters regsub -all {([{}\\])} $sgml {\\\1} sgml # Do the translation if {[info exists options(-statevariable)]} { # Mats: Several rewrites here to handle -final 0 option. # If any cached unparsed xml (state(leftover)), prepend it. upvar #0 $options(-statevariable) state if {[string length $state(leftover)]} { regsub -all $elemExpr $state(leftover)$sgml $elemSub sgml set state(leftover) {} } else { regsub -all $elemExpr $sgml $elemSub sgml } set sgml "{} {} {} \{$sgml\}" # Performance note (Tcl 8.0): # Use of lindex, lreplace will cause parsing to list object # This RE only fixes chopped inside tags, not chopped text. if {[regexp {^([^<]*)(<[^>]*$)} [lindex $sgml end] x text rest]} { set sgml [lreplace $sgml end end $text] # Mats: unmatched stuff means that it is chopped off. Cache it for next round. set state(leftover) $rest } # Patch from bug report #596959, Marshall Rose if {[string compare [lindex $sgml 4] ""]} { set sgml [linsert $sgml 0 {} {} {} {} {}] } } else { # Performance note (Tcl 8.0): # In this case, no conversion to list object is performed # Mats: This fails if not -final and $sgml is chopped off right in a tag. regsub -all $elemExpr $sgml $elemSub sgml set sgml "{} {} {} \{$sgml\}" } return $sgml}# sgml::parseEvent --## Produces an event stream for a XML/HTML document,# given the Tcl list format returned by tokenise.## This procedure checks that the document is well-formed,# and throws an error if the document is found to be not# well formed. Warnings are passed via the -warningcommand script.## The procedure only check for well-formedness,# no DTD is required. However, facilities are provided for entity expansion.## Arguments:# sgml Instance data, as a Tcl list.# args option/value pairs## Valid Options:# -final Indicates end of document data# -validate Boolean to enable validation# -baseuri URL for resolving relative URLs# -elementstartcommand Called when an element starts# -elementendcommand Called when an element ends# -characterdatacommand Called when character data occurs# -entityreferencecommand Called when an entity reference occurs# -processinginstructioncommand Called when a PI occurs# -externalentitycommand Called for an external entity reference## -xmldeclcommand Called when the XML declaration occurs# -doctypecommand Called when the document type declaration occurs# -commentcommand Called when a comment occurs# -entitydeclcommand Called when a parsed entity is declared# -unparsedentitydeclcommand Called when an unparsed external entity is declared# -parameterentitydeclcommand Called when a parameter entity is declared# -notationdeclcommand Called when a notation is declared# -elementdeclcommand Called when an element is declared# -attlistdeclcommand Called when an attribute list is declared# -paramentityparsing Boolean to enable/disable parameter entity substitution# -defaultexpandinternalentities Boolean to enable/disable expansion of entities declared in internal DTD subset## -startdoctypedeclcommand Called when the Doc Type declaration starts (see also -doctypecommand)# -enddoctypedeclcommand Called when the Doc Type declaration ends (see also -doctypecommand)## -errorcommand Script to evaluate for a fatal error# -warningcommand Script to evaluate for a reportable warning# -statevariable global state variable# -normalize whether to normalize names# -reportempty whether to include an indication of empty elements# -ignorewhitespace whether to automatically strip whitespace## Results:# The various callback scripts are invoked.# Returns empty string.## BUGS:# If command options are set to empty string then they should not be invoked.proc sgml::parseEvent {sgml args} { variable Wsp variable noWsp variable Nmtoken variable Name variable ParseEventNum variable StdOptions array set options [array get StdOptions] catch {array set options $args} # Mats: # If the data is not final then there must be a variable to persistently store the parse state. if {!$options(-final) && ![info exists options(-statevariable)]} { return -code error {option "-statevariable" required if not final} } foreach {opt value} [array get options *command] { if {[string compare $opt "-externalentitycommand"] && ![string length $value]} { set options($opt) [namespace current]::noop } } if {![info exists options(-statevariable)]} { set options(-statevariable) [namespace current]::ParseEvent[incr ParseEventNum] } if {![info exists options(entities)]} { set options(entities) [namespace current]::Entities$ParseEventNum array set $options(entities) [array get [namespace current]::EntityPredef] } if {![info exists options(extentities)]} { set options(extentities) [namespace current]::ExtEntities$ParseEventNum } if {![info exists options(parameterentities)]} { set options(parameterentities) [namespace current]::ParamEntities$ParseEventNum } if {![info exists options(externalparameterentities)]} { set options(externalparameterentities) [namespace current]::ExtParamEntities$ParseEventNum } if {![info exists options(elementdecls)]} { set options(elementdecls) [namespace current]::ElementDecls$ParseEventNum } if {![info exists options(attlistdecls)]} { set options(attlistdecls) [namespace current]::AttListDecls$ParseEventNum } if {![info exists options(notationdecls)]} { set options(notationdecls) [namespace current]::NotationDecls$ParseEventNum } if {![info exists options(namespaces)]} { set options(namespaces) [namespace current]::Namespaces$ParseEventNum } # For backward-compatibility catch {set options(-baseuri) $options(-baseurl)} # Choose an external entity resolver if {![string length $options(-externalentitycommand)]} { if {$options(-validate)} { set options(-externalentitycommand) [namespace code ResolveEntity] } else { set options(-externalentitycommand) [namespace code noop] } } upvar #0 $options(-statevariable) state upvar #0 $options(entities) entities # Mats: # The problem is that the state is not maintained when -final 0 ! # I've switched back to an older version here. if {![info exists state(line)]} { # Initialise the state variable array set state { mode normal haveXMLDecl 0 haveDocElement 0 inDTD 0 context {} stack {} line 0 defaultNS {} defaultNSURI {} } } foreach {tag close param text} $sgml { # Keep track of lines in the input incr state(line) [regsub -all \n $param {} discard] incr state(line) [regsub -all \n $text {} discard] # If the current mode is cdata or comment then we must undo what the # regsub has done to reconstitute the data set empty {} switch $state(mode) { comment { # This had "[string length $param] && " as a guard - # can't remember why :-( if {[regexp ([cl ^-]*)--\$ $tag discard comm1]} { # end of comment (in tag) set tag {} set close {} set state(mode) normal DeProtect1 $options(-commentcommand) $state(commentdata)<$comm1 unset state(commentdata) } elseif {[regexp ([cl ^-]*)--\$ $param discard comm1]} { # end of comment (in attributes) DeProtect1 $options(-commentcommand) $state(commentdata)<$close$tag>$comm1 unset state(commentdata) set tag {} set param {} set close {} set state(mode) normal } elseif {[regexp ([cl ^-]*)-->(.*) $text discard comm1 text]} { # end of comment (in text) DeProtect1 $options(-commentcommand) $state(commentdata)<$close$tag$param>$comm1 unset state(commentdata) set tag {} set param {} set close {} set state(mode) normal } else { # comment continues append state(commentdata) <$close$tag$param>$text continue } } cdata { if {[string length $param] && [regexp ([cl ^\]]*)\]\][cl $Wsp]*\$ $tag discard cdata1]} { # end of CDATA (in tag) PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$cdata1] set text [subst -novariable -nocommand $text] set tag {} unset state(cdata) set state(mode) normal } elseif {[regexp ([cl ^\]]*)\]\][cl $Wsp]*\$ $param discard cdata1]} { # end of CDATA (in attributes) PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$tag$cdata1] set text [subst -novariable -nocommand $text] set tag {} set param {} unset state(cdata) set state(mode) normal } elseif {[regexp (.*)\]\][cl $Wsp]*>(.*) $text discard cdata1 text]} { # end of CDATA (in text) PCDATA [array get options] $state(cdata)<[subst -nocommand -novariable $close$tag$param>$cdata1] set text [subst -novariable -nocommand $text] set tag {} set param {} set close {} unset state(cdata) set state(mode) normal } else { # CDATA continues append state(cdata) [subst -nocommand -novariable <$close$tag$param>$text] continue } } continue { # We're skipping elements looking for the close tag switch -glob -- [string length $tag],[regexp {^\?|!.*} $tag],$close { 0,* { continue } *,0, { if {![string compare $tag $state(continue:tag)]} { set empty [uplevel #0 $options(-emptyelement) [list $tag $param $empty]] if {![string length $empty]} { incr state(continue:level) } } continue } *,0,/ { if {![string compare $tag $state(continue:tag)]} { incr state(continue:level) -1 } if {!$state(continue:level)} { unset state(continue:tag) unset state(continue:level) set state(mode) {} } } default { continue } } } default { # The trailing slash on empty elements can't be automatically separated out # in the RE, so we must do it here. regexp (.*)(/)[cl $Wsp]*$ $param discard param empty } }
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?