xmllib.py

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Python 代码 · 共 930 行 · 第 1/3 页

PY
930
字号
"""A parser for XML, using the derived class as static DTD."""

# Author: Sjoerd Mullender.

import re
import string

version = '0.3'


class Error(RuntimeError):
    """Exception type defined by this module."""
    # NOTE(review): the code that raises this is not visible in this part
    # of the file -- presumably the parser's error handler; confirm.
    pass


# Regular expressions used for parsing
#
# The pattern fragments (_S, _opS, _Name, _QStr, ...) are plain strings that
# are concatenated into the compiled patterns below.  Literals containing
# regex escapes such as \( \) \? are written as raw strings (or with doubled
# backslashes) so that they are not invalid string escape sequences; the
# resulting pattern text is unchanged.

_S = '[ \t\r\n]+'                       # white space
_opS = '[ \t\r\n]*'                     # optional white space
_Name = '[a-zA-Z_:][-a-zA-Z0-9._:]*'    # valid XML name
_QStr = "(?:'[^']*'|\"[^\"]*\")"        # quoted XML string
illegal = re.compile('[^\t\r\n -\176\240-\377]') # illegal chars in content
interesting = re.compile('[]&<]')

amp = re.compile('&')
# A reference is "&" followed by a name, a decimal char ref or a hex char
# ref, plus exactly one following character (normally the ";").
ref = re.compile('&(' + _Name + '|#[0-9]+|#x[0-9a-fA-F]+)[^-a-zA-Z0-9._:]')
entityref = re.compile('&(?P<name>' + _Name + ')[^-a-zA-Z0-9._:]')
charref = re.compile('&#(?P<char>[0-9]+[^0-9]|x[0-9a-fA-F]+[^0-9a-fA-F])')
space = re.compile(_S + '$')
newline = re.compile('\n')

# One attribute: white space, a name, and optionally "=" followed by a
# quoted string or an unquoted token.
attrfind = re.compile(
    _S + '(?P<name>' + _Name + ')'
    '(' + _opS + '=' + _opS +
    '(?P<value>'+_QStr+r'|[-a-zA-Z0-9.:+*%?!\(\)_#=~]+))?')
starttagopen = re.compile('<' + _Name)
starttagend = re.compile(_opS + '(?P<slash>/?)>')
# A complete start tag; reuses the attrfind and starttagend pattern text.
starttagmatch = re.compile('<(?P<tagname>'+_Name+')'
                      '(?P<attrs>(?:'+attrfind.pattern+')*)'+
                      starttagend.pattern)
endtagopen = re.compile('</')
endbracket = re.compile(_opS + '>')
endbracketfind = re.compile('(?:[^>\'"]|'+_QStr+')*>')
tagfind = re.compile(_Name)
cdataopen = re.compile(r'<!\[CDATA\[')
cdataclose = re.compile(r'\]\]>')
# this matches one of the following:
# SYSTEM SystemLiteral
# PUBLIC PubidLiteral SystemLiteral
# (both literal templates take the group name via the % operator)
_SystemLiteral = '(?P<%s>'+_QStr+')'
_PublicLiteral = '(?P<%s>"[-\'\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*"|' \
                        "'[-\\(\\)+,./:=?;!*#@$_%% \n\ra-zA-Z0-9]*')"
_ExternalId = '(?:SYSTEM|' \
                 'PUBLIC'+_S+_PublicLiteral%'pubid'+ \
              ')'+_S+_SystemLiteral%'syslit'
doctype = re.compile('<!DOCTYPE'+_S+'(?P<name>'+_Name+')'
                     '(?:'+_S+_ExternalId+')?'+_opS)

# The <?xml ...?> declaration: mandatory version, optional encoding and
# standalone pseudo-attributes (captured including their quotes).
xmldecl = re.compile(r'<\?xml'+_S+
                     'version'+_opS+'='+_opS+'(?P<version>'+_QStr+')'+
                     '(?:'+_S+'encoding'+_opS+'='+_opS+
                        "(?P<encoding>'[A-Za-z][-A-Za-z0-9._]*'|"
                        '"[A-Za-z][-A-Za-z0-9._]*"))?'
                     '(?:'+_S+'standalone'+_opS+'='+_opS+
                        '(?P<standalone>\'(?:yes|no)\'|"(?:yes|no)"))?'+
                     _opS+r'\?>')
procopen = re.compile(r'<\?(?P<proc>' + _Name + ')' + _opS)
procclose = re.compile(_opS + r'\?>')
commentopen = re.compile('<!--')
commentclose = re.compile('-->')
doubledash = re.compile('--')

# Translation table mapping each of space, CR, LF and TAB to a space.
try:
    attrtrans = string.maketrans(' \r\n\t', '    ')
except AttributeError:
    # string.maketrans does not exist on Python 3; str.maketrans builds the
    # equivalent table for str.translate().
    attrtrans = str.maketrans(' \r\n\t', '    ')

# definitions for XML namespaces
_NCName = '[a-zA-Z_][-a-zA-Z0-9._]*'    # XML Name, minus the ":"
ncname = re.compile(_NCName + '$')
qname = re.compile('(?:(?P<prefix>' + _NCName + '):)?' # optional prefix
                   '(?P<local>' + _NCName + ')$')

xmlns = re.compile('xmlns(?::(?P<ncname>'+_NCName+'))?$')

# XML parser base class -- find tags and call handler functions.
# Usage: p = XMLParser(); p.feed(data); ...; p.close().
# The dtd is defined by deriving a class which defines methods with
# special names to handle tags: start_foo and end_foo to handle <foo>
# and </foo>, respectively.
# The data between tags is passed to the
# parser by calling self.handle_data() with some data as argument (the
# data may be split up in arbitrary chunks).

class XMLParser:
    attributes = {}                     # default, to be overridden
    elements = {}                       # default, to be overridden

    # parsing options, settable using keyword args in __init__
    __accept_unquoted_attributes = 0
    __accept_missing_endtag_name = 0
    __map_case = 0
    __accept_utf8 = 0
    __translate_attribute_references = 1

    # Interface -- initialize and reset this instance
    # Recognised keyword arguments override the class-level option defaults
    # above; unrecognised keywords are ignored.
    def __init__(self, **kw):
        self.__fixed = 0
        # `in` instead of dict.has_key(): has_key() does not exist on
        # Python 3, while `in` works on both.
        if 'accept_unquoted_attributes' in kw:
            self.__accept_unquoted_attributes = kw['accept_unquoted_attributes']
        if 'accept_missing_endtag_name' in kw:
            self.__accept_missing_endtag_name = kw['accept_missing_endtag_name']
        if 'map_case' in kw:
            self.__map_case = kw['map_case']
        if 'accept_utf8' in kw:
            self.__accept_utf8 = kw['accept_utf8']
        if 'translate_attribute_references' in kw:
            self.__translate_attribute_references = kw['translate_attribute_references']
        self.reset()

    # Internal -- build self.elements from the start_*/end_* attributes
    # found on the instance, its class and all base classes
    def __fixelements(self):
        self.__fixed = 1
        self.elements = {}
        self.__fixdict(self.__dict__)
        self.__fixclass(self.__class__)

    # Internal -- scan a class and, recursively, its bases
    def __fixclass(self, kl):
        self.__fixdict(kl.__dict__)
        for k in kl.__bases__:
            self.__fixclass(k)

    # Internal -- register every start_<tag> / end_<tag> name found in
    # dict as the (start, end) handler pair for <tag>.  A slot that is
    # already filled is kept, so the dict scanned first wins.
    def __fixdict(self, dict):
        for key in dict.keys():
            if key[:6] == 'start_':
                tag = key[6:]
                start, end = self.elements.get(tag, (None, None))
                if start is None:
                    self.elements[tag] = getattr(self, key), end
            elif key[:4] == 'end_':
                tag = key[4:]
                start, end = self.elements.get(tag, (None, None))
                if end is None:
                    self.elements[tag] = start, getattr(self, key)

    # Interface -- reset this instance.  Loses all unprocessed data
    def reset(self):
        self.rawdata = ''
        self.stack = []
        self.nomoretags = 0
        self.literal = 0
        self.lineno = 1
        self.__at_start = 1
        self.__seen_doctype = None
        self.__seen_starttag = 0
        self.__use_namespaces = 0
        self.__namespaces = {'xml':None}   # xml is implicitly declared
        # backward compatibility hack: if elements not overridden,
        # fill it in ourselves
        if self.elements is XMLParser.elements:
            self.__fixelements()

    # For derived classes only -- enter literal mode (CDATA) till EOF
    def setnomoretags(self):
        self.nomoretags = self.literal = 1

    # For derived classes only -- enter literal mode (CDATA)
    def setliteral(self, *args):
        self.literal = 1

    # Interface -- feed some data to the parser.  Call this as
    # often as you want, with as little or as much text as you
    # want (may include '\n').  (This just saves the text, all the
    # processing is done by goahead().)
    def feed(self, data):
        self.rawdata = self.rawdata + data
        self.goahead(0)

    # Interface -- handle the remaining data
    def close(self):
        self.goahead(1)
        if self.__fixed:
            self.__fixed = 0
            # remove self.elements so that we don't leak
            del self.elements

    # Interface -- translate references
    # Returns data with character references (&#N; / &#xN;) replaced by
    # the character and, when `all` is true, entity references replaced
    # via self.entitydefs.  Returns data unchanged when the
    # translate_attribute_references option is off.  Problems are
    # reported through self.syntax_error().
    # NOTE(review): with `all` false, a well-formed "&name;" falls through
    # to the replacement step and becomes the bare name -- confirm that is
    # intended.
    def translate_references(self, data, all = 1):
        if not self.__translate_attribute_references:
            return data
        i = 0
        while 1:
            res = amp.search(data, i)
            if res is None:
                return data
            s = res.start(0)
            res = ref.match(data, s)
            if res is None:
                self.syntax_error("bogus `&'")
                i = s+1
                continue
            # ref consumes one character after the reference, so
            # data[i - 1] is that terminator (normally ';')
            i = res.end(0)
            repl = res.group(1)
            rescan = 0
            if repl[0] == '#':
                if repl[1] == 'x':
                    repl = chr(int(repl[2:], 16))
                else:
                    repl = chr(int(repl[1:]))
                if data[i - 1] != ';':
                    self.syntax_error("`;' missing after char reference")
                    # keep the non-';' terminator in the output
                    i = i-1
            elif all:
                if repl in self.entitydefs:
                    repl = self.entitydefs[repl]
                    # the replacement text may itself contain references
                    rescan = 1
                elif data[i - 1] != ';':
                    self.syntax_error("bogus `&'")
                    i = s + 1 # just past the &
                    continue
                else:
                    self.syntax_error("reference to unknown entity `&%s;'" % repl)
                    repl = '&' + repl + ';'
            elif data[i - 1] != ';':
                self.syntax_error("bogus `&'")
                i = s + 1 # just past the &
                continue

            # when we get here, repl contains the translated text and i points
            # to the end of the string that is to be replaced
            data = data[:s] + repl + data[i:]
            if rescan:
                i = s
            else:
                i = s + len(repl)

    # Interface - return a dictionary of all namespaces currently valid
    # (the second item of each stack entry, merged in stack order so later
    # entries override earlier ones)
    def getnamespace(self):
        nsdict = {}
        for t, d, nst in self.stack:
            nsdict.update(d)
        return nsdict

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if i > 0:
                self.__at_start = 0
            if self.nomoretags:
                # everything that is left is plain data
                data = rawdata[i:n]
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
                i = n
                break
            # find the next ']', '&' or '<'; text before it is plain data
            res = interesting.search(rawdata, i)
            if res:
                j = res.start(0)
            else:
                j = n
            if i < j:
                data = rawdata[i:j]
                if self.__at_start and space.match(data) is None:
                    self.syntax_error('illegal data at start of file')
                self.__at_start = 0
                if not self.stack and space.match(data) is None:
                    self.syntax_error('data not in content')
                if not self.__accept_utf8 and illegal.search(data):
                    self.syntax_error('illegal character in content')
                self.handle_data(data)
                self.lineno = self.lineno + data.count('\n')
            i = j
            if i == n: break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):
                    if self.literal:
                        # in literal mode '<' is passed through as data
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_starttag(i)
                    # a negative result stops processing for this call
                    if k < 0: break
                    self.__seen_starttag = 1
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if endtagopen.match(rawdata, i):
                    k = self.parse_endtag(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i =  k
                    continue
                if commentopen.match(rawdata, i):
                    if self.literal:
                        data = rawdata[i]
                        self.handle_data(data)
                        self.lineno = self.lineno + data.count('\n')
                        i = i+1
                        continue
                    k = self.parse_comment(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                if cdataopen.match(rawdata, i):
                    k = self.parse_cdata(i)
                    if k < 0: break
                    self.lineno = self.lineno + rawdata[i:k].count('\n')
                    i = k
                    continue
                res = xmldecl.match(rawdata, i)
                if res:
                    if not self.__at_start:
                        self.syntax_error("<?xml?> declaration not at start of document")
                    # NOTE(review): the visible source ends here (page 1 of 3).
                    # The rest of goahead() and of the class is not shown and
                    # has deliberately not been reconstructed.

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?