xmllib.py

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Python 代码 · 共 930 行 · 第 1/3 页

PY
930
字号
                    version, encoding, standalone = res.group('version',                                                              'encoding',                                                              'standalone')                    if version[1:-1] != '1.0':                        raise Error('only XML version 1.0 supported')                    if encoding: encoding = encoding[1:-1]                    if standalone: standalone = standalone[1:-1]                    self.handle_xml(encoding, standalone)                    i = res.end(0)                    continue                res = procopen.match(rawdata, i)                if res:                    k = self.parse_proc(i)                    if k < 0: break                    self.lineno = self.lineno + rawdata[i:k].count('\n')                    i = k                    continue                res = doctype.match(rawdata, i)                if res:                    if self.literal:                        data = rawdata[i]                        self.handle_data(data)                        self.lineno = self.lineno + data.count('\n')                        i = i+1                        continue                    if self.__seen_doctype:                        self.syntax_error('multiple DOCTYPE elements')                    if self.__seen_starttag:                        self.syntax_error('DOCTYPE not at beginning of document')                    k = self.parse_doctype(res)                    if k < 0: break                    self.__seen_doctype = res.group('name')                    if self.__map_case:                        self.__seen_doctype = self.__seen_doctype.lower()                    self.lineno = self.lineno + rawdata[i:k].count('\n')                    i = k                    continue            elif rawdata[i] == '&':                if self.literal:                    data = rawdata[i]                    self.handle_data(data)                    i = i+1                    continue                res = charref.match(rawdata, i)                if res is not None:                    i = res.end(0)                    if rawdata[i-1] != ';':                        self.syntax_error("`;' missing in charref")                        i = i-1                    if not self.stack:                        self.syntax_error('data not in content')                    self.handle_charref(res.group('char')[:-1])                    self.lineno = self.lineno + res.group(0).count('\n')                    continue                res = entityref.match(rawdata, i)                if res is not None:                    i = res.end(0)                    if rawdata[i-1] != ';':                        self.syntax_error("`;' missing in entityref")                        i = i-1                    name = res.group('name')                    if self.__map_case:                        name = name.lower()                    if self.entitydefs.has_key(name):                        self.rawdata = rawdata = rawdata[:res.start(0)] + self.entitydefs[name] + rawdata[i:]                        n = len(rawdata)                        i = res.start(0)                    else:                        self.unknown_entityref(name)                    self.lineno = self.lineno + res.group(0).count('\n')                    continue            elif rawdata[i] == ']':                if self.literal:                    data = rawdata[i]                    self.handle_data(data)                    i = i+1                    continue                if n-i < 3:                    break                if cdataclose.match(rawdata, i):                    self.syntax_error("bogus `]]>'")                self.handle_data(rawdata[i])                i = i+1                continue            else:                raise Error('neither < nor & ??')            # We get here only if incomplete matches but            # nothing else            break        # end while        if i > 0:            self.__at_start = 0        if end and i < n:            data = rawdata[i]            self.syntax_error("bogus `%s'" % data)            if not self.__accept_utf8 and illegal.search(data):                self.syntax_error('illegal character in content')            self.handle_data(data)            self.lineno = self.lineno + data.count('\n')            self.rawdata = rawdata[i+1:]            return self.goahead(end)        self.rawdata = rawdata[i:]        if end:            if not self.__seen_starttag:                self.syntax_error('no elements in file')            if self.stack:                self.syntax_error('missing end tags')                while self.stack:                    self.finish_endtag(self.stack[-1][0])    # Internal -- parse comment, return length or -1 if not terminated    def parse_comment(self, i):        rawdata = self.rawdata        if rawdata[i:i+4] != '<!--':            raise Error('unexpected call to handle_comment')        res = commentclose.search(rawdata, i+4)        if res is None:            return -1        if doubledash.search(rawdata, i+4, res.start(0)):            self.syntax_error("`--' inside comment")        if rawdata[res.start(0)-1] == '-':            self.syntax_error('comment cannot end in three dashes')        if not self.__accept_utf8 and \           illegal.search(rawdata, i+4, res.start(0)):            self.syntax_error('illegal character in comment')        self.handle_comment(rawdata[i+4: res.start(0)])        return res.end(0)    # Internal -- handle DOCTYPE tag, return length or -1 if not terminated    def parse_doctype(self, res):        rawdata = self.rawdata        n = len(rawdata)        name = res.group('name')        if self.__map_case:            name = name.lower()        pubid, syslit = res.group('pubid', 'syslit')        if pubid is not None:            pubid = pubid[1:-1]         # remove quotes            pubid = ' '.join(pubid.split()) # normalize        if syslit is not None: syslit = syslit[1:-1] # remove quotes        j = k = res.end(0)        if k >= n:            return -1        if rawdata[k] == '[':            level = 0            k = k+1            dq = sq = 0            while k < n:                c = rawdata[k]                if not sq and c == '"':                    dq = not dq                elif not dq and c == "'":                    sq = not sq                elif sq or dq:                    pass                elif level <= 0 and c == ']':                    res = endbracket.match(rawdata, k+1)                    if res is None:                        return -1                    self.handle_doctype(name, pubid, syslit, rawdata[j+1:k])                    return res.end(0)                elif c == '<':                    level = level + 1                elif c == '>':                    level = level - 1                    if level < 0:                        self.syntax_error("bogus `>' in DOCTYPE")                k = k+1        res = endbracketfind.match(rawdata, k)        if res is None:            return -1        if endbracket.match(rawdata, k) is None:            self.syntax_error('garbage in DOCTYPE')        self.handle_doctype(name, pubid, syslit, None)        return res.end(0)    # Internal -- handle CDATA tag, return length or -1 if not terminated    def parse_cdata(self, i):        rawdata = self.rawdata        if rawdata[i:i+9] != '<![CDATA[':            raise Error('unexpected call to parse_cdata')        res = cdataclose.search(rawdata, i+9)        if res is None:            return -1        if not self.__accept_utf8 and \           illegal.search(rawdata, i+9, res.start(0)):            self.syntax_error('illegal character in CDATA')        if not self.stack:            self.syntax_error('CDATA not in content')        self.handle_cdata(rawdata[i+9:res.start(0)])        return res.end(0)    __xml_namespace_attributes = {'ns':None, 'src':None, 'prefix':None}    # Internal -- handle a processing instruction tag    def parse_proc(self, i):        rawdata = self.rawdata        end = procclose.search(rawdata, i)        if end is None:            return -1        j = end.start(0)        if not self.__accept_utf8 and illegal.search(rawdata, i+2, j):            self.syntax_error('illegal character in processing instruction')        res = tagfind.match(rawdata, i+2)        if res is None:            raise Error('unexpected call to parse_proc')        k = res.end(0)        name = res.group(0)        if self.__map_case:            name = name.lower()        if name == 'xml:namespace':            self.syntax_error('old-fashioned namespace declaration')            self.__use_namespaces = -1            # namespace declaration            # this must come after the <?xml?> declaration (if any)            # and before the <!DOCTYPE> (if any).            if self.__seen_doctype or self.__seen_starttag:                self.syntax_error('xml:namespace declaration too late in document')            attrdict, namespace, k = self.parse_attributes(name, k, j)            if namespace:                self.syntax_error('namespace declaration inside namespace declaration')            for attrname in attrdict.keys():                if not self.__xml_namespace_attributes.has_key(attrname):                    self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)            if not attrdict.has_key('ns') or not attrdict.has_key('prefix'):                self.syntax_error('xml:namespace without required attributes')            prefix = attrdict.get('prefix')            if ncname.match(prefix) is None:                self.syntax_error('xml:namespace illegal prefix value')                return end.end(0)            if self.__namespaces.has_key(prefix):                self.syntax_error('xml:namespace prefix not unique')            self.__namespaces[prefix] = attrdict['ns']        else:            if name.lower() == 'xml':                self.syntax_error('illegal processing instruction target name')            self.handle_proc(name, rawdata[k:j])        return end.end(0)    # Internal -- parse attributes between i and j    def parse_attributes(self, tag, i, j):        rawdata = self.rawdata        attrdict = {}        namespace = {}        while i < j:            res = attrfind.match(rawdata, i)            if res is None:                break            attrname, attrvalue = res.group('name', 'value')            if self.__map_case:                attrname = attrname.lower()            i = res.end(0)            if attrvalue is None:                self.syntax_error("no value specified for attribute `%s'" % attrname)                attrvalue = attrname            elif attrvalue[:1] == "'" == attrvalue[-1:] or \                 attrvalue[:1] == '"' == attrvalue[-1:]:                attrvalue = attrvalue[1:-1]            elif not self.__accept_unquoted_attributes:                self.syntax_error("attribute `%s' value not quoted" % attrname)            res = xmlns.match(attrname)            if res is not None:                # namespace declaration                ncname = res.group('ncname')                namespace[ncname or ''] = attrvalue or None                if not self.__use_namespaces:                    self.__use_namespaces = len(self.stack)+1                continue            if '<' in attrvalue:                self.syntax_error("`<' illegal in attribute value")            if attrdict.has_key(attrname):                self.syntax_error("attribute `%s' specified twice" % attrname)            attrvalue = attrvalue.translate(attrtrans)            attrdict[attrname] = self.translate_references(attrvalue)        return attrdict, namespace, i    # Internal -- handle starttag, return length or -1 if not terminated    def parse_starttag(self, i):        rawdata = self.rawdata        # i points to start of tag        end = endbracketfind.match(rawdata, i+1)        if end is None:            return -1        tag = starttagmatch.match(rawdata, i)        if tag is None or tag.end(0) != end.end(0):            self.syntax_error('garbage in starttag')            return end.end(0)        nstag = tagname = tag.group('tagname')        if self.__map_case:            nstag = tagname = nstag.lower()        if not self.__seen_starttag and self.__seen_doctype and \           tagname != self.__seen_doctype:            self.syntax_error('starttag does not match DOCTYPE')        if self.__seen_starttag and not self.stack:            self.syntax_error('multiple elements on top level')        k, j = tag.span('attrs')        attrdict, nsdict, k = self.parse_attributes(tagname, k, j)        self.stack.append((tagname, nsdict, nstag))        if self.__use_namespaces:            res = qname.match(tagname)        else:            res = None        if res is not None:            prefix, nstag = res.group('prefix', 'local')            if prefix is None:                prefix = ''            ns = None            for t, d, nst in self.stack:                if d.has_key(prefix):                    ns = d[prefix]            if ns is None and prefix != '':

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?