xmllib.py
来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Python 代码 · 共 930 行 · 第 1/3 页
PY
930 行
"""A parser for XML, using the derived class as static DTD."""# Author: Sjoerd Mullender.import reimport stringversion = '0.3'class Error(RuntimeError): pass# Regular expressions used for parsing_S = '[ \t\r\n]+' # white space_opS = '[ \t\r\n]*' # optional white space_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*' # valid XML name_QStr = "(?:'[^']*'|\"[^\"]*\")" # quoted XML stringillegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in contentinteresting = re.compile('[]&<]')amp = re.compile('&')ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')space = re.compile(_S + '$')newline = re.compile('\n')attrfind = re.compile( _S + '(?P<name>' + _Name + ')' '(' + _opS + '=' + _opS + '(?P<value>'+_QStr+'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')starttagopen = re.compile('<' + _Name)starttagend = re.compile(_opS + '(?P<slash>/?)>')starttagmatch = re.compile('<(?P<tagname>'+_Name+')' '(?P<attrs>(?:'+attrfind.pattern+')*)'+ starttagend.pattern)endtagopen = re.compile('</')endbracket = re.compile(_opS + '>')endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')tagfind = re.compile(_Name)cdataopen = re.compile(r'<!\[CDATA\[')cdataclose = re.compile(r'\]\]>')# this matches one of the following:# SYSTEM SystemLiteral# PUBLIC PubidLiteral SystemLiteral_SystemLiteral = '(?P<%s>'+_QStr+')'_PublicLiteral = '(?P<%s>"[-\'\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \ "'[-\(\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"_ExternalId = '(?:SYSTEM|' \ 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \ ')'+_S+_SystemLiteral%'syslit'doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')' '(?:'+_S+_ExternalId+')?'+_opS)xmldecl = re.compile('<\?xml'+_S+ 'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+ '(?:'+_S+'encoding'+_opS+'='+_opS+ "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|" '"[A-Za-z][-A-Za-z0-9._]*"))?' '(?:'+_S+'standalone'+_opS+'='+_opS+ '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+ _opS+'\?>')procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)procclose = re.compile(_opS + r'\?>')commentopen = re.compile('<!--')commentclose = re.compile('-->')doubledash = re.compile('--')attrtrans = string.maketrans(' \r\n\t', ' ')# definitions for XML namespaces_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*' # XML Name, minus the ":"ncname = re.compile(_NCName + '$')qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix '(?P<local>' + _NCName + ')$')xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')# XML parser base class -- find tags and call handler functions.# Usage: p = XMLParser(); p.feed(data); ...; p.close().# The dtd is defined by deriving a class which defines methods with# special names to handle tags: start_foo and end_foo to handle <foo># and </foo>, respectively. The data between tags is passed to the# parser by calling self.handle_data() with some data as argument (the# data may be split up in arbitrary chunks).class XMLParser: attributes = {} # default, to be overridden elements = {} # default, to be overridden # parsing options, settable using keyword args in __init__ __accept_unquoted_attributes = 0 __accept_missing_endtag_name = 0 __map_case = 0 __accept_utf8 = 0 __translate_attribute_references = 1 # Interface -- initialize and reset this instance def __init__(self, **kw): self.__fixed = 0 if kw.has_key('accept_unquoted_attributes'): self.__accept_unquoted_attributes = kw['accept_unquoted_attributes'] if kw.has_key('accept_missing_endtag_name'): self.__accept_missing_endtag_name = kw['accept_missing_endtag_name'] if kw.has_key('map_case'): self.__map_case = kw['map_case'] if kw.has_key('accept_utf8'): self.__accept_utf8 = kw['accept_utf8'] if kw.has_key('translate_attribute_references'): self.__translate_attribute_references = kw['translate_attribute_references'] self.reset() def __fixelements(self): self.__fixed = 1 self.elements = {} self.__fixdict(self.__dict__) self.__fixclass(self.__class__) def __fixclass(self, kl): self.__fixdict(kl.__dict__) for k in kl.__bases__: self.__fixclass(k) def __fixdict(self, dict): for key in dict.keys(): if key[:6] == 'start_': tag = key[6:] start, end = self.elements.get(tag, (None, None)) if start is None: self.elements[tag] = getattr(self, key), end elif key[:4] == 'end_': tag = key[4:] start, end = self.elements.get(tag, (None, None)) if end is None: self.elements[tag] = start, getattr(self, key) # Interface -- reset this instance. Loses all unprocessed data def reset(self): self.rawdata = '' self.stack = [] self.nomoretags = 0 self.literal = 0 self.lineno = 1 self.__at_start = 1 self.__seen_doctype = None self.__seen_starttag = 0 self.__use_namespaces = 0 self.__namespaces = {'xml':None} # xml is implicitly declared # backward compatibility hack: if elements not overridden, # fill it in ourselves if self.elements is XMLParser.elements: self.__fixelements() # For derived classes only -- enter literal mode (CDATA) till EOF def setnomoretags(self): self.nomoretags = self.literal = 1 # For derived classes only -- enter literal mode (CDATA) def setliteral(self, *args): self.literal = 1 # Interface -- feed some data to the parser. Call this as # often as you want, with as little or as much text as you # want (may include '\n'). (This just saves the text, all the # processing is done by goahead().) def feed(self, data): self.rawdata = self.rawdata + data self.goahead(0) # Interface -- handle the remaining data def close(self): self.goahead(1) if self.__fixed: self.__fixed = 0 # remove self.elements so that we don't leak del self.elements # Interface -- translate references def translate_references(self, data, all = 1): if not self.__translate_attribute_references: return data i = 0 while 1: res = amp.search(data, i) if res is None: return data s = res.start(0) res = ref.match(data, s) if res is None: self.syntax_error("bogus `&'") i = s+1 continue i = res.end(0) str = res.group(1) rescan = 0 if str[0] == '#': if str[1] == 'x': str = chr(int(str[2:], 16)) else: str = chr(int(str[1:])) if data[i - 1] != ';': self.syntax_error("`;' missing after char reference") i = i-1 elif all: if self.entitydefs.has_key(str): str = self.entitydefs[str] rescan = 1 elif data[i - 1] != ';': self.syntax_error("bogus `&'") i = s + 1 # just past the & continue else: self.syntax_error("reference to unknown entity `&%s;'" % str) str = '&' + str + ';' elif data[i - 1] != ';': self.syntax_error("bogus `&'") i = s + 1 # just past the & continue # when we get here, str contains the translated text and i points # to the end of the string that is to be replaced data = data[:s] + str + data[i:] if rescan: i = s else: i = s + len(str) # Interface - return a dictionary of all namespaces currently valid def getnamespace(self): nsdict = {} for t, d, nst in self.stack: nsdict.update(d) return nsdict # Internal -- handle data as far as reasonable. May leave state # and data to be processed by a subsequent call. If 'end' is # true, force handling all data as if followed by EOF marker. def goahead(self, end): rawdata = self.rawdata i = 0 n = len(rawdata) while i < n: if i > 0: self.__at_start = 0 if self.nomoretags: data = rawdata[i:n] self.handle_data(data) self.lineno = self.lineno + data.count('\n') i = n break res = interesting.search(rawdata, i) if res: j = res.start(0) else: j = n if i < j: data = rawdata[i:j] if self.__at_start and space.match(data) is None: self.syntax_error('illegal data at start of file') self.__at_start = 0 if not self.stack and space.match(data) is None: self.syntax_error('data not in content') if not self.__accept_utf8 and illegal.search(data): self.syntax_error('illegal character in content') self.handle_data(data) self.lineno = self.lineno + data.count('\n') i = j if i == n: break if rawdata[i] == '<': if starttagopen.match(rawdata, i): if self.literal: data = rawdata[i] self.handle_data(data) self.lineno = self.lineno + data.count('\n') i = i+1 continue k = self.parse_starttag(i) if k < 0: break self.__seen_starttag = 1 self.lineno = self.lineno + rawdata[i:k].count('\n') i = k continue if endtagopen.match(rawdata, i): k = self.parse_endtag(i) if k < 0: break self.lineno = self.lineno + rawdata[i:k].count('\n') i = k continue if commentopen.match(rawdata, i): if self.literal: data = rawdata[i] self.handle_data(data) self.lineno = self.lineno + data.count('\n') i = i+1 continue k = self.parse_comment(i) if k < 0: break self.lineno = self.lineno + rawdata[i:k].count('\n') i = k continue if cdataopen.match(rawdata, i): k = self.parse_cdata(i) if k < 0: break self.lineno = self.lineno + rawdata[i:k].count('\n') i = k continue res = xmldecl.match(rawdata, i) if res: if not self.__at_start: self.syntax_error("<?xml?> declaration not at start of document")
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?