xmllib.py
来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Python 代码 · 共 930 行 · 第 1/3 页
PY
930 行
version, encoding, standalone = res.group('version', 'encoding', 'standalone') if version[1:-1] != '1.0': raise Error('only XML version 1.0 supported') if encoding: encoding = encoding[1:-1] if standalone: standalone = standalone[1:-1] self.handle_xml(encoding, standalone) i = res.end(0) continue res = procopen.match(rawdata, i) if res: k = self.parse_proc(i) if k < 0: break self.lineno = self.lineno + rawdata[i:k].count('\n') i = k continue res = doctype.match(rawdata, i) if res: if self.literal: data = rawdata[i] self.handle_data(data) self.lineno = self.lineno + data.count('\n') i = i+1 continue if self.__seen_doctype: self.syntax_error('multiple DOCTYPE elements') if self.__seen_starttag: self.syntax_error('DOCTYPE not at beginning of document') k = self.parse_doctype(res) if k < 0: break self.__seen_doctype = res.group('name') if self.__map_case: self.__seen_doctype = self.__seen_doctype.lower() self.lineno = self.lineno + rawdata[i:k].count('\n') i = k continue elif rawdata[i] == '&': if self.literal: data = rawdata[i] self.handle_data(data) i = i+1 continue res = charref.match(rawdata, i) if res is not None: i = res.end(0) if rawdata[i-1] != ';': self.syntax_error("`;' missing in charref") i = i-1 if not self.stack: self.syntax_error('data not in content') self.handle_charref(res.group('char')[:-1]) self.lineno = self.lineno + res.group(0).count('\n') continue res = entityref.match(rawdata, i) if res is not None: i = res.end(0) if rawdata[i-1] != ';': self.syntax_error("`;' missing in entityref") i = i-1 name = res.group('name') if self.__map_case: name = name.lower() if self.entitydefs.has_key(name): self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:] n = len(rawdata) i = res.start(0) else: self.unknown_entityref(name) self.lineno = self.lineno + res.group(0).count('\n') continue elif rawdata[i] == ']': if self.literal: data = rawdata[i] self.handle_data(data) i = i+1 continue if n-i < 3: break if cdataclose.match(rawdata, i): self.syntax_error("bogus `]]>'") self.handle_data(rawdata[i]) i = i+1 continue else: raise Error('neither < nor & ??') # We get here only if incomplete matches but # nothing else break # end while if i > 0: self.__at_start = 0 if end and i < n: data = rawdata[i] self.syntax_error("bogus `%s'" % data) if not self.__accept_utf8 and illegal.search(data): self.syntax_error('illegal character in content') self.handle_data(data) self.lineno = self.lineno + data.count('\n') self.rawdata = rawdata[i+1:] return self.goahead(end) self.rawdata = rawdata[i:] if end: if not self.__seen_starttag: self.syntax_error('no elements in file') if self.stack: self.syntax_error('missing end tags') while self.stack: self.finish_endtag(self.stack[-1][0]) # Internal -- parse comment, return length or -1 if not terminated def parse_comment(self, i): rawdata = self.rawdata if rawdata[i:i+4] != '<!--': raise Error('unexpected call to handle_comment') res = commentclose.search(rawdata, i+4) if res is None: return -1 if doubledash.search(rawdata, i+4, res.start(0)): self.syntax_error("`--' inside comment") if rawdata[res.start(0)-1] == '-': self.syntax_error('comment cannot end in three dashes') if not self.__accept_utf8 and \ illegal.search(rawdata, i+4, res.start(0)): self.syntax_error('illegal character in comment') self.handle_comment(rawdata[i+4: res.start(0)]) return res.end(0) # Internal -- handle DOCTYPE tag, return length or -1 if not terminated def parse_doctype(self, res): rawdata = self.rawdata n = len(rawdata) name = res.group('name') if self.__map_case: name = name.lower() pubid, syslit = res.group('pubid', 'syslit') if pubid is not None: pubid = pubid[1:-1] # remove quotes pubid = ' '.join(pubid.split()) # normalize if syslit is not None: syslit = syslit[1:-1] # remove quotes j = k = res.end(0) if k >= n: return -1 if rawdata[k] == '[': level = 0 k = k+1 dq = sq = 0 while k < n: c = rawdata[k] if not sq and c == '"': dq = not dq elif not dq and c == "'": sq = not sq elif sq or dq: pass elif level <= 0 and c == ']': res = endbracket.match(rawdata, k+1) if res is None: return -1 self.handle_doctype(name, pubid, syslit, rawdata[j+1:k]) return res.end(0) elif c == '<': level = level + 1 elif c == '>': level = level - 1 if level < 0: self.syntax_error("bogus `>' in DOCTYPE") k = k+1 res = endbracketfind.match(rawdata, k) if res is None: return -1 if endbracket.match(rawdata, k) is None: self.syntax_error('garbage in DOCTYPE') self.handle_doctype(name, pubid, syslit, None) return res.end(0) # Internal -- handle CDATA tag, return length or -1 if not terminated def parse_cdata(self, i): rawdata = self.rawdata if rawdata[i:i+9] != '<![CDATA[': raise Error('unexpected call to parse_cdata') res = cdataclose.search(rawdata, i+9) if res is None: return -1 if not self.__accept_utf8 and \ illegal.search(rawdata, i+9, res.start(0)): self.syntax_error('illegal character in CDATA') if not self.stack: self.syntax_error('CDATA not in content') self.handle_cdata(rawdata[i+9:res.start(0)]) return res.end(0) __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None} # Internal -- handle a processing instruction tag def parse_proc(self, i): rawdata = self.rawdata end = procclose.search(rawdata, i) if end is None: return -1 j = end.start(0) if not self.__accept_utf8 and illegal.search(rawdata, i+2, j): self.syntax_error('illegal character in processing instruction') res = tagfind.match(rawdata, i+2) if res is None: raise Error('unexpected call to parse_proc') k = res.end(0) name = res.group(0) if self.__map_case: name = name.lower() if name == 'xml:namespace': self.syntax_error('old-fashioned namespace declaration') self.__use_namespaces = -1 # namespace declaration # this must come after the <?xml?> declaration (if any) # and before the <!DOCTYPE> (if any). if self.__seen_doctype or self.__seen_starttag: self.syntax_error('xml:namespace declaration too late in document') attrdict, namespace, k = self.parse_attributes(name, k, j) if namespace: self.syntax_error('namespace declaration inside namespace declaration') for attrname in attrdict.keys(): if not self.__xml_namespace_attributes.has_key(attrname): self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname) if not attrdict.has_key('ns') or not attrdict.has_key('prefix'): self.syntax_error('xml:namespace without required attributes') prefix = attrdict.get('prefix') if ncname.match(prefix) is None: self.syntax_error('xml:namespace illegal prefix value') return end.end(0) if self.__namespaces.has_key(prefix): self.syntax_error('xml:namespace prefix not unique') self.__namespaces[prefix] = attrdict['ns'] else: if name.lower() == 'xml': self.syntax_error('illegal processing instruction target name') self.handle_proc(name, rawdata[k:j]) return end.end(0) # Internal -- parse attributes between i and j def parse_attributes(self, tag, i, j): rawdata = self.rawdata attrdict = {} namespace = {} while i < j: res = attrfind.match(rawdata, i) if res is None: break attrname, attrvalue = res.group('name', 'value') if self.__map_case: attrname = attrname.lower() i = res.end(0) if attrvalue is None: self.syntax_error("no value specified for attribute `%s'" % attrname) attrvalue = attrname elif attrvalue[:1] == "'" == attrvalue[-1:] or \ attrvalue[:1] == '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] elif not self.__accept_unquoted_attributes: self.syntax_error("attribute `%s' value not quoted" % attrname) res = xmlns.match(attrname) if res is not None: # namespace declaration ncname = res.group('ncname') namespace[ncname or ''] = attrvalue or None if not self.__use_namespaces: self.__use_namespaces = len(self.stack)+1 continue if '<' in attrvalue: self.syntax_error("`<' illegal in attribute value") if attrdict.has_key(attrname): self.syntax_error("attribute `%s' specified twice" % attrname) attrvalue = attrvalue.translate(attrtrans) attrdict[attrname] = self.translate_references(attrvalue) return attrdict, namespace, i # Internal -- handle starttag, return length or -1 if not terminated def parse_starttag(self, i): rawdata = self.rawdata # i points to start of tag end = endbracketfind.match(rawdata, i+1) if end is None: return -1 tag = starttagmatch.match(rawdata, i) if tag is None or tag.end(0) != end.end(0): self.syntax_error('garbage in starttag') return end.end(0) nstag = tagname = tag.group('tagname') if self.__map_case: nstag = tagname = nstag.lower() if not self.__seen_starttag and self.__seen_doctype and \ tagname != self.__seen_doctype: self.syntax_error('starttag does not match DOCTYPE') if self.__seen_starttag and not self.stack: self.syntax_error('multiple elements on top level') k, j = tag.span('attrs') attrdict, nsdict, k = self.parse_attributes(tagname, k, j) self.stack.append((tagname, nsdict, nstag)) if self.__use_namespaces: res = qname.match(tagname) else: res = None if res is not None: prefix, nstag = res.group('prefix', 'local') if prefix is None: prefix = '' ns = None for t, d, nst in self.stack: if d.has_key(prefix): ns = d[prefix] if ns is None and prefix != '':
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?