📄 xmlproc.py
字号:
"""
The main module of the parser. All other modules are imported into this
one, so this is the only module that needs to be imported. For validating
parsing, import xmlval instead.
"""
# $Id$
import re,string,sys,urllib,urlparse
# Module-level aliases for functions of the string module, so that callers
# use a plain global name instead of repeating the attribute lookup on
# `string` at every call.
string_translate=string.translate # optimization. made 10% difference!
string_find =string.find
from dtdparser import *
from xmlutils import *
from xmlapp import *
from xmldtd import *
# Release number of this module.
version="0.70"
# Revision keyword placeholder (presumably expanded by the version-control
# system on checkout -- confirm).
revision="$Revision$"
# ==============================
# A full well-formedness parser
# ==============================
class XMLProcessor(XMLCommonParser):
"A parser that performs a complete well-formedness check."
def __init__(self):
    """Initialises the parser with its default collaborators.

    Runs the EntityParser initialiser, then installs a fresh Application
    as the receiver of data events and a WFCDTD bound to this parser.
    The same WFCDTD object is also used as the entity source (self.ent).
    """
    EntityParser.__init__(self)

    # Various handlers
    self.app = Application()
    self.dtd = self.ent = WFCDTD(self)
    self.dtd_listener = None

    # By default data events stop after an error numbered above 2999
    # (see report_error).
    self.stop_on_wf = 1
def set_application(self, app):
    """Registers app as the object that receives data events.

    The application is also handed this parser through its
    set_locator method.
    """
    self.app = app
    app.set_locator(self)
def set_dtd_listener(self, listener):
    """Stores listener as the object that listens for DTD parse events."""
    self.dtd_listener = listener
def set_data_after_wf_error(self, stop_on_wf=0):
    """Sets the parser policy on well-formedness errors.

    With stop_on_wf set to 0, data events are still delivered even after
    well-formedness errors. Otherwise no more data events reach the
    application after such errors.
    """
    self.stop_on_wf = stop_on_wf
def set_read_external_subset(self, read_it):
    """Records whether the external subset of documents should be read.

    The flag is stored in self.read_external_subset.
    """
    self.read_external_subset = read_it
def report_error(self, number, args=None):
    """Reports error `number`, silencing the application first if required.

    When self.stop_on_wf is true and the error number is above 2999
    (presumably the well-formedness error range -- confirm against the
    error tables), self.app is replaced by a fresh Application so the
    previously registered application receives no further data events.
    The error is then passed on to EntityParser.report_error.
    """
    silence_app = self.stop_on_wf and number > 2999
    if silence_app:
        self.app = Application()  # No more data events reported
    EntityParser.report_error(self, number, args)
def reset(self):
    """Returns the parser to its initial state.

    Resets the EntityParser state and the DTD object (when one exists),
    then clears the element stack and the seen-root/doctype/xmldecl
    flags. Note that the stop_on_wf and read_external_subset policies
    are also put back to their defaults (1 and 0).
    """
    EntityParser.reset(self)

    # self.dtd may not exist yet (presumably when reset is invoked from
    # the base-class initialiser before __init__ has created it --
    # confirm).
    if hasattr(self, "dtd"):
        self.dtd.reset()

    # State vars
    self.stack = []
    self.seen_root = self.seen_doctype = self.seen_xmldecl = 0
    self.stop_on_wf = 1
    self.read_external_subset = 0
def deref(self):
    """Deletes circular references.

    Sets the dtd, ent, err, app and pubres attributes to None.
    """
    for attr in ("dtd", "ent", "err", "app", "pubres"):
        setattr(self, attr, None)
def do_parse(self):
    """Does the actual parsing of the buffered data.

    Loops while self.pos is below self.datasize and dispatches on the
    character at the current position:

      '<'  markup: end tag, start tag, comment, processing instruction,
           CDATA section or DOCTYPE declaration (error 3013 otherwise)
      '&'  character reference or entity reference
      else character data

    If a construct runs out of data (OutOfDataException) and self.final
    is true, the exception is re-raised with its original traceback.
    Otherwise self.pos is rewound to self.prepos, the position where the
    incomplete construct started.
    """
    try:
        while self.pos < self.datasize:
            # Remember where this construct starts so it can be retried
            self.prepos = self.pos
            ch = self.data[self.pos]

            if ch == "<":
                try:
                    t = self.data[self.pos + 1]  # Optimization
                except IndexError:
                    # Only the '<' is available so far
                    raise OutOfDataException()

                if t == "/":
                    self.parse_end_tag()
                elif t != "!" and t != "?":
                    self.parse_start_tag()
                elif self.now_at("<!--"):
                    self.parse_comment(self.app)
                elif self.now_at("<?"):  # FIXME: use t and modify self.pos?
                    self.parse_pi(self.app, 1)
                elif self.now_at("<![CDATA["):
                    self.parse_cdata()
                elif self.now_at("<!DOCTYPE"):
                    self.parse_doctype()
                else:
                    self.report_error(3013)
                    self.scan_to(">")  # Avoid endless loops
            elif ch == "&":
                if self.now_at("&#"):
                    self.parse_charref()
                else:
                    self.pos = self.pos + 1  # Skipping the '&'
                    self.parse_ent_ref()
            else:
                self.parse_data()
    except OutOfDataException:
        if self.final:
            # Bare raise keeps the traceback of the original failure
            raise
        self.pos = self.prepos  # Didn't complete the construct
def parseStart(self):
    """Must be called before parsing starts.

    Notifies the application by calling its doc_start method.
    """
    self.app.doc_start()
def parseEnd(self):
    """Must be called when parsing is finished. (Does some checks and
    notifies the application.)

    Reports error 3014 with the innermost open element name when the
    element stack is non-empty while the entity stack is empty,
    otherwise error 3015 when no root element was seen. The
    application's doc_end method is called in every case.
    """
    unclosed_element = self.stack != [] and self.ent_stack == []
    if unclosed_element:
        self.report_error(3014, self.stack[-1])
    elif not self.seen_root:
        self.report_error(3015)

    self.app.doc_end()
def parse_start_tag(self):
    """Parses a start tag (or empty-element tag) and reports it.

    The attribute dictionary starts as a copy of the entry stored in
    self.dtd.attrinfo for the element name, if there is one, and is then
    updated with the attributes found in the tag. Errors reported here:

      3005  no '=' after an attribute name
      3016  the same attribute name appears twice in the tag
      2000  the value differs from the entry in the element's `fixeds`
      3017  an element starts while the stack is empty and a root
            element has already been seen
      3004  the tag ends with neither '>' nor '/>'

    On '>' the application gets handle_start_tag and the name is pushed
    on self.stack; on '/>' it gets handle_start_tag followed by
    handle_end_tag.
    """
    self.pos += 1  # Skips the '<'
    name = self._get_name()
    self.skip_ws()

    # Attribute information known for this element name, if any
    try:
        (attrs, fixeds) = self.dtd.attrinfo[name]
        attrs = attrs.copy()
    except KeyError:
        attrs = {}
        fixeds = {}

    if self.data[self.pos] not in (">", "/"):
        seen = {}
        while not (self.test_str(">") or self.test_str("/>")):
            attr_name = self._get_name()
            self.skip_ws()
            if not self.now_at("="):
                self.report_error(3005, "=")
                self.scan_to(">")  ## Panic! Get out of the tag!
                break
            self.skip_ws()

            attr_value = self.parse_att_val()
            if attr_value == -1:
                # WF error, we've skipped the rest of the tag
                self.pos -= 1  # Lets us find the '>'
                if self.data[self.pos - 1] == "/":
                    self.pos -= 1  # Gets the '/>' cases right
                break

            if seen.has_key(attr_name):
                self.report_error(3016, attr_name)
            else:
                seen[attr_name] = 1

            attrs[attr_name] = attr_value
            if fixeds.has_key(attr_name) and fixeds[attr_name] != attr_value:
                self.report_error(2000, attr_name)
            self.skip_ws()

    # --- Take care of the tag
    if self.stack == [] and self.seen_root:
        self.report_error(3017)
    self.seen_root = 1

    is_open_tag = self.now_at(">")
    if is_open_tag or self.now_at("/>"):
        self.app.handle_start_tag(name, attrs)
        if is_open_tag:
            self.stack.append(name)
        else:
            self.app.handle_end_tag(name)
    else:
        self.report_error(3004, ("'>'", "/>"))
def parse_att_val(self):
    """Parses an attribute value and resolves all entity references in it.

    Returns the attribute value as a string. Literal text is passed
    through string_translate with ws_trans; character references and
    internal general entities are expanded in place.

    Returns -1 when the value does not start with a quote character: in
    that case error 3004 is reported and scan_to(">") has been called.

    A reference to an entity that is already being expanded (its name is
    in self.open_ents) is reported as error 3019 and contributes nothing
    to the value; parsing then continues up to the closing quote.
    """
    val = ""
    if self.now_at('"'):
        delim = '"'
        reg_attval_stop = reg_attval_stop_quote
    elif self.now_at("'"):
        delim = "'"
        reg_attval_stop = reg_attval_stop_sing
    else:
        self.report_error(3004, ("'", "\""))
        self.scan_to(">")
        return -1  # FIXME: Ugly. Should throw an exception instead

    while 1:
        piece = self.find_reg(reg_attval_stop)
        val = val + string_translate(piece, ws_trans)

        if self.now_at(delim):
            break

        if self.now_at("&#"):
            val = val + self._read_char_ref()
        elif self.now_at("&"):
            name = self._get_name()

            if name in self.open_ents:
                # Recursive reference: report it and expand it to nothing,
                # then fall through to the ';' check below so the rest of
                # the value is still parsed and a string is returned.
                self.report_error(3019)
            else:
                self.open_ents.append(name)

                try:
                    ent = self.ent.resolve_ge(name)
                    if ent.is_internal():
                        # Doing all this here sucks a bit, but...
                        self.push_entity(self.get_current_sysid(),
                                         ent.value, name)

                        self.final = 1  # Only one block

                        val = val + self.parse_literal_entval()
                        if not self.pos == self.datasize:
                            self.report_error(3001)  # Thing started, not compl

                        self.pop_entity()
                    else:
                        self.report_error(3020)
                except KeyError:
                    self.report_error(3021, name)  ## FIXME: Check standalone dcl

                del self.open_ents[-1]
        elif self.now_at("<"):
            self.report_error(3022)
            continue
        else:
            self.report_error(4001)
            self.pos = self.pos + 1  # Avoid endless loop
            continue

        # Both kinds of reference must be terminated by ';'
        if not self.now_at(";"):
            self.report_error(3005, ";")

    return val
def parse_literal_entval(self):
"Parses a literal entity value for insertion in an attribute value."
val=""
reg_stop=re.compile("&")
while 1:
try:
piece=self.find_reg(reg_stop)
except OutOfDataException,e:
# Only character data left
val=val+string_translate(self.data[self.pos:],ws_trans)
self.pos=self.datasize
break
val=val+string_translate(piece,ws_trans)
if self.now_at("&#"):
val=val+self._read_char_ref()
elif self.now_at("&"):
name=self._get_name()
if name in self.open_ents:
self.report_error(3019)
return ""
else:
self.open_ents.append(name)
try:
ent=self.ent.resolve_ge(name)
if ent.is_internal():
# Doing all this here sucks a bit, but...
self.push_entity(self.get_current_sysid(),\
ent.value,name)
self.final=1 # Only one block
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -