📄 xmlutils.py
字号:
msg="%s'%s', " % (msg,wrap)
self.report_error(3004,(msg[:-2],wraps[-1][0]))
data=self.get_match(regexp)
if not self.now_at(wrap):
self.report_error(3005,wrap)
return data
#--- ERROR HANDLING
def report_error(self,number,args=None):
    """Looks up the message for error code 'number' and reports it.

    If 'args' is given it is interpolated into the message template with
    the % operator. A KeyError raised while building the message (for
    instance because 'number' has no entry in self.errors) makes the
    message stored under code 4002 be used instead.

    The code also selects the severity: codes below 2000 go to
    self.err.warning, codes below 3000 to self.err.error and all other
    codes to self.err.fatal."""
    try:
        msg=self.errors[number]
        # Identity test: only a missing argument skips the interpolation
        if args is not None:
            msg=msg % args
    except KeyError:
        msg=self.errors[4002] # Unknown err msg :-)

    if number<2000:
        self.err.warning(msg)
    elif number<3000:
        self.err.error(msg)
    else:
        self.err.fatal(msg)
#--- USEFUL METHODS
def get_current_sysid(self):
    """Gives back the system identifier stored for the entity that is
    being read at the moment."""
    sysid=self.current_sysID
    return sysid
def set_sysid(self,sysID):
    """Overwrites the current system identifier with 'sysID'. The
    previous identifier is discarded, not saved."""
    self.current_sysID=sysID
def get_offset(self):
    """Gives the current offset from the start of the stream: the offset
    of the current block plus the position inside it."""
    block_start=self.block_offset
    return block_start+self.pos
def get_line(self):
    """Gives the current line number, after first bringing the position
    bookkeeping up to date."""
    self.update_pos()
    current_line=self.line
    return current_line
def get_column(self):
    """Gives the current column: the distance between the current
    position and the last recorded line break, after first bringing the
    position bookkeeping up to date."""
    self.update_pos()
    here=self.pos
    return here-self.last_break
def is_root_entity(self):
    """Returns true if the current entity is the root entity, i.e. when
    the entity stack holds no saved states."""
    # Truthiness test instead of comparing against a fresh empty list
    return not self.ent_stack
def is_external(self):
    """Tells whether the current entity is an external entity. The root
    (or document) entity never counts as external; otherwise the entity
    is external when the system identifier saved in the bottom entry of
    the entity stack differs from the current one."""
    if self.ent_stack==[]:
        # Empty stack: we are reading the root entity
        return False

    bottom_sysid=self.ent_stack[0][0]
    return bottom_sysid!=self.get_current_sysid()
# --- Internal methods
def _push_ent_stack(self,name="None"):
    """Saves the current reading state as one tuple on top of the entity
    stack. The tuple holds the current sysid, the data buffer, the
    position, line and line-break bookkeeping, the buffer size, the block
    offset, the final flag and, last, 'name'."""
    saved_state=(self.get_current_sysid(),self.data,self.pos,
                 self.line,self.last_break,self.datasize,
                 self.last_upd_pos,self.block_offset,self.final,
                 name)
    self.ent_stack.append(saved_state)
def _pop_ent_stack(self):
    """Restores the reading state from the top entry of the entity stack
    and then removes that entry. The last field of the entry (the name)
    is ignored."""
    saved_state=self.ent_stack[-1]
    (self.current_sysID,self.data,self.pos,self.line,self.last_break,
     self.datasize,self.last_upd_pos,self.block_offset,self.final,
     unused_name)=saved_state
    # Drop the entry only after it has been unpacked successfully
    self.ent_stack.pop()
# ==============================
# Common code for some parsers
# ==============================
class XMLCommonParser(EntityParser):
    """Parsing routines shared by several parsers: external identifiers,
    the XML/text declaration, processing instructions, comments,
    character references and names."""

    def parse_external_id(self,required=0,sysidreq=1):
        """Parses an external ID declaration and returns a tuple consisting
        of (pubid,sysid). If the required attribute is false neither SYSTEM
        nor PUBLIC identifiers are required. If sysidreq is false a SYSTEM
        identifier is not required after a PUBLIC one."""
        pub_id=None
        sys_id=None

        if self.now_at("SYSTEM"):
            self.skip_ws(1)
            sys_id=self.get_wrapped_match([("\"",reg_sysid_quote),
                                           ("'",reg_sysid_apo)])
        elif self.now_at("PUBLIC"):
            self.skip_ws(1)
            pub_id=self.get_wrapped_match([("\"",reg_pubid_quote),
                                           ("'",reg_pubid_apo)])
            # Collapse every run of whitespace in the public identifier
            # into a single space and strip both ends
            pub_id=" ".join(pub_id.split())

            if sysidreq:
                self.skip_ws(1)
                sys_id=self.get_wrapped_match([("\"",reg_sysid_quote),
                                               ("'",reg_sysid_apo)])
            else:
                # A quote directly after the public identifier is reported
                # as error 3002 (presumably: missing whitespace)
                if self.test_str("'") or self.test_str('"'):
                    self.report_error(3002)
                self.skip_ws()
                # The system identifier is optional here; read it only if
                # a quoted string follows
                if self.test_str("'") or self.test_str('"'):
                    sys_id=self.get_wrapped_match([("\"",reg_sysid_quote),
                                                   ("'",reg_sysid_apo)])
        else:
            if required: self.report_error(3006)

        return (pub_id,sys_id)

    def __get_quoted_string(self):
        """Returns the contents of a quoted string at current position.

        Raises OutOfDataException if the current position is beyond the
        end of the buffer. If no quote character is found, error 3004 is
        reported, the input is scanned to the next '>' and the empty
        string is returned."""
        try:
            quo=self.data[self.pos]
        except IndexError:
            raise OutOfDataException()

        if not (self.now_at('"') or self.now_at("'")):
            self.report_error(3004,("'\"'","'"))
            self.scan_to(">")
            return ""

        # Scan up to the same kind of quote that opened the string
        return self.scan_to(quo)

    def parse_xml_decl(self,handler=None):
        """Parses the contents of the XML declaration from after the '<?xml'.

        The version, encoding and standalone parts are read in that order
        and problems are reported through report_error. When an encoding
        is given, a charset converter to self.data_charset is installed if
        one is available. If 'handler' is given, its set_entity_info
        method is called with (version,encoding,standalone); parts that
        were not present are passed as None."""

        textdecl=self.is_external() # If this is an external entity, then this
                                    # is a text declaration, not an XML decl

        self.update_pos()
        # The declaration must be at line 1, column 5 (right after
        # '<?xml') and, unless it is a text declaration, in the root entity
        if self.get_column()!=5 or self.get_line()!=1 or \
           (self.ent_stack!=[] and not textdecl):
            if textdecl:
                self.report_error(3007)
            else:
                self.report_error(3008)

        if self.seen_xmldecl: # Set in parse_pi, to avoid block problems
            if textdecl:
                self.report_error(3009)
            else:
                self.report_error(3010)

        enc=None
        sddecl=None
        ver=None
        self.skip_ws()

        if self.now_at("version"):
            self.skip_ws()
            if not self.now_at("="): self.report_error(3005,"=")
            self.skip_ws()
            ver=self.__get_quoted_string()

            # The whole string must match the version pattern, and the
            # only version accepted without complaint is 1.0
            m=reg_ver.match(ver)
            if m is None or m.end()!=len(ver):
                self.report_error(3901,ver)
            elif ver!="1.0":
                self.report_error(3046)

            # The next keyword directly after the closing quote is
            # reported as error 3002 (presumably: missing whitespace)
            if self.test_str("encoding") or self.test_str("standalone"):
                self.report_error(3002)

            self.skip_ws()
        elif not textdecl:
            # Only text declarations may leave out the version
            self.report_error(3011)

        if self.now_at("encoding"):
            self.skip_ws()
            if not self.now_at("="): self.report_error(3005,"=")
            self.skip_ws()
            enc=self.__get_quoted_string()
            if reg_enc_name.match(enc) is None:
                self.report_error(3902)

            # Setting up correct conversion
            if charconv.convdb.can_convert(enc,self.data_charset):
                self.charset_converter=\
                    charconv.convdb.get_converter(enc,self.data_charset)
            else:
                self.report_error(1002,enc)

            self.skip_ws()

        if self.now_at("standalone"):
            if textdecl:
                # Reported as error 3012 in a text declaration; the value
                # is not parsed and "yes" is passed on to the handler
                self.report_error(3012)
                sddecl="yes"
            else:
                self.skip_ws()
                if not self.now_at("="): self.report_error(3005,"=")
                self.skip_ws()
                sddecl=self.__get_quoted_string()
                if reg_std_alone.match(sddecl) is None:
                    self.report_error(3911)

                self.standalone= sddecl=="yes"

            self.skip_ws()

        self.skip_ws()

        if handler is not None:
            handler.set_entity_info(ver,enc,sddecl)

    def parse_pi(self,handler,report_xml_decl=0):
        """Parses a processing instruction from after the '<?' to beyond
        the '?>'.

        A target of exactly "xml" is parsed as an XML declaration; the
        handler is passed along to parse_xml_decl only if report_xml_decl
        is true. Any other target is passed with the remaining PI text to
        handler.handle_pi."""
        trgt=self._get_name()

        if trgt=="xml":
            if report_xml_decl:
                self.parse_xml_decl(handler)
            else:
                self.parse_xml_decl()

            if not self.now_at("?>"):
                self.report_error(3005,"?>")
            self.seen_xmldecl=1
        else:
            if self.now_at("?>"):
                rem=""
            else:
                self.skip_ws(1)
                rem=self.scan_to("?>") # OutOfDataException if not found

            # Targets matching reg_res_pi get a report unless the target
            # is "xml-stylesheet"; "xml:namespace" has its own code (1003)
            if reg_res_pi.match(trgt) is not None:
                if trgt=="xml:namespace":
                    self.report_error(1003)
                elif trgt!="xml-stylesheet":
                    self.report_error(3045)

            handler.handle_pi(trgt,rem)

    def parse_comment(self,handler):
        """Parses the comment from after '<!--' to beyond '-->'.

        The text up to the first '--' is passed to handler.handle_comment;
        error 3005 is reported if that '--' is not the start of '-->'."""
        new_pos = self.get_index("--")
        handler.handle_comment(self.data[self.pos : new_pos])
        self.pos = new_pos
        if not self.now_at("-->"):
            self.report_error(3005,"-->")

    def _read_char_ref(self):
        """Parses a character reference and returns the character.

        A leading 'x' selects hexadecimal digits, otherwise decimal digits
        are read. Only the codes 9, 10, 13 and 32-255 are converted; for
        any other code an error is reported (1005 above 255, 3018
        otherwise) and the empty string is returned."""
        if self.now_at("x"):
            digs=unhex(self.get_match(reg_hex_digits))
        else:
            digs=int(self.get_match(reg_digits))

        if not (digs==9 or digs==10 or digs==13 or \
                (digs>=32 and digs<=255)):
            if digs>255:
                self.report_error(1005,digs)
            else:
                self.report_error(3018,digs)
            return ""
        else:
            return chr(digs)

    def _get_name(self):
        """Parses the name at the current position and returns it. An error
        is reported if no name is present.

        Raises OutOfDataException if the buffer is not final and either
        the position is closer than five characters to its end
        (pos > datasize-5) or the name runs up to the end of the buffer.
        On a final buffer a name running to the end is returned as is."""
        if self.pos>self.datasize-5 and not self.final:
            raise OutOfDataException()

        data=self.data
        pos=self.pos
        if data[pos] in namestart:
            start=pos
            pos=pos+1

            try:
                while data[pos] in namechars:
                    pos=pos+1

                self.pos=pos
                return intern(data[start:pos])
            except IndexError:
                # Ran off the end of the buffer while still inside the name
                self.pos=pos
                if self.final:
                    return intern(data[start:])
                else:
                    raise OutOfDataException()
        else:
            self.report_error(3900)
            return ""
# --- A collection of useful functions
# Utility functions
def unhex(hex_value):
    """Turns a string of hexadecimal digits into the integer it denotes.

    Both upper- and lower-case letters are accepted. A character that is
    not a hexadecimal digit still shifts the result by one hex place but
    contributes the value 0; an empty string gives 0."""
    total=0
    for digit in hex_value:
        code=ord(digit)
        if 48<=code<58:        # '0'-'9'
            value=code-48
        elif 97<=code<=102:    # 'a'-'f'
            value=code-87
        elif 65<=code<=70:     # 'A'-'F'
            value=code-55
        else:
            # Not expected with the callers' input; counts as zero
            value=0
        total=total*16+value

    return total
def matches(regexp,str):
    """Tells whether the compiled pattern 'regexp' matches all of 'str':
    a match must start at the beginning and be as long as the string."""
    found=regexp.match(str)
    if found is None:
        return False

    return len(found.group(0))==len(str)
def join_sysids_general(base,url):
    """Combines the system identifier 'url' with the identifier 'base' it
    is relative to.

    If 'base' has a URL scheme the two are joined as URLs. Otherwise
    'url' is returned unchanged if it has a scheme of its own, and is
    joined as a file name onto the directory part of 'base' if not."""
    if urlparse.urlparse(base)[0]!="":
        return urlparse.urljoin(base,url)

    if urlparse.urlparse(url)[0]!="":
        return url

    base_dir=os.path.split(base)[0]
    return os.path.join(base_dir,url)
def join_sysids_win32(base,url):
    """Combines the system identifier 'url' with the identifier 'base' it
    is relative to, treating schemes shorter than two characters as
    drive identifiers rather than URL schemes.

    If 'base' has a real scheme the two are joined as URLs. Otherwise
    'url' is returned unchanged if it has a real scheme of its own, and
    is joined as a file name onto the directory part of 'base' if not."""
    if len(urlparse.urlparse(base)[0])>=2:
        return urlparse.urljoin(base,url)

    if len(urlparse.urlparse(url)[0])>=2:
        return url

    base_dir=os.path.split(base)[0]
    return os.path.join(base_dir,url)
# join_sysids(base,url) is bound here to the implementation that suits the current platform
# Default to the general implementation; only win32 needs the variant
# that copes with drive identifiers
join_sysids=join_sysids_general
if sys.platform=="win32":
    join_sysids=join_sysids_win32
# --- Some useful regexps
namestart="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_:"+\
"懒旅呐魄壬仕掏蜗醒矣哉重仝圮蒉哙徕沅彐玷殛腱眍镳耱篝貊
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -