markupbase.py

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Python 代码 · 共 318 行
318 行
"""Shared support for scanning document type declarations in HTML and XHTML."""import reimport string_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').matchdel reclass ParserBase:    """Parser base class which provides some common support methods used    by the SGML/HTML and XHTML parsers."""    def __init__(self):        if self.__class__ is ParserBase:            raise RuntimeError(                "markupbase.ParserBase must be subclassed")    def error(self, message):        raise NotImplementedError(            "subclasses of ParserBase must override error()")    def reset(self):        self.lineno = 1        self.offset = 0    def getpos(self):        """Return current line number and offset."""        return self.lineno, self.offset    # Internal -- update line number and offset.  This should be    # called for each piece of data exactly once, in order -- in other    # words the concatenation of all the input strings to this    # function should be exactly the entire input.    def updatepos(self, i, j):        if i >= j:            return j        rawdata = self.rawdata        nlines = string.count(rawdata, "\n", i, j)        if nlines:            self.lineno = self.lineno + nlines            pos = string.rindex(rawdata, "\n", i, j) # Should not fail            self.offset = j-(pos+1)        else:            self.offset = self.offset + j-i        return j    _decl_otherchars = ''    # Internal -- parse declaration (for use by subclasses).    def parse_declaration(self, i):        # This is some sort of declaration; in "HTML as        # deployed," this should only be the document type        # declaration ("<!DOCTYPE html...>").        rawdata = self.rawdata        j = i + 2        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"        if rawdata[j:j+1] in ("-", ""):            # Start of comment followed by buffer boundary,            # or just a buffer boundary.            return -1        # in practice, this should look like: ((name|stringlit) S*)+ '>'        n = len(rawdata)        decltype, j = self._scan_name(j, i)        if j < 0:            return j        if decltype == "doctype":            self._decl_otherchars = ''        while j < n:            c = rawdata[j]            if c == ">":                # end of declaration syntax                data = rawdata[i+2:j]                if decltype == "doctype":                    self.handle_decl(data)                else:                    self.unknown_decl(data)                return j + 1            if c in "\"'":                m = _declstringlit_match(rawdata, j)                if not m:                    return -1 # incomplete                j = m.end()            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":                name, j = self._scan_name(j, i)            elif c in self._decl_otherchars:                j = j + 1            elif c == "[":                if decltype == "doctype":                    j = self._parse_doctype_subset(j + 1, i)                else:                    self.error("unexpected '[' char in declaration")            else:                self.error(                    "unexpected %s char in declaration" % `rawdata[j]`)            if j < 0:                return j        return -1 # incomplete    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,    # returning the index just past any whitespace following the trailing ']'.    def _parse_doctype_subset(self, i, declstartpos):        rawdata = self.rawdata        n = len(rawdata)        j = i        while j < n:            c = rawdata[j]            if c == "<":                s = rawdata[j:j+2]                if s == "<":                    # end of buffer; incomplete                    return -1                if s != "<!":                    self.updatepos(declstartpos, j + 1)                    self.error("unexpected char in internal subset (in %s)"                               % `s`)                if (j + 2) == n:                    # end of buffer; incomplete                    return -1                if (j + 4) > n:                    # end of buffer; incomplete                    return -1                if rawdata[j:j+4] == "<!--":                    j = self.parse_comment(j, report=0)                    if j < 0:                        return j                    continue                name, j = self._scan_name(j + 2, declstartpos)                if j == -1:                    return -1                if name not in ("attlist", "element", "entity", "notation"):                    self.updatepos(declstartpos, j + 2)                    self.error(                        "unknown declaration %s in internal subset" % `name`)                # handle the individual names                meth = getattr(self, "_parse_doctype_" + name)                j = meth(j, declstartpos)                if j < 0:                    return j            elif c == "%":                # parameter entity reference                if (j + 1) == n:                    # end of buffer; incomplete                    return -1                s, j = self._scan_name(j + 1, declstartpos)                if j < 0:                    return j                if rawdata[j] == ";":                    j = j + 1            elif c == "]":                j = j + 1                while j < n and rawdata[j] in string.whitespace:                    j = j + 1                if j < n:                    if rawdata[j] == ">":                        return j                    self.updatepos(declstartpos, j)                    self.error("unexpected char after internal subset")                else:                    return -1            elif c in string.whitespace:                j = j + 1            else:                self.updatepos(declstartpos, j)                self.error("unexpected char %s in internal subset" % `c`)        # end of buffer reached        return -1    # Internal -- scan past <!ELEMENT declarations    def _parse_doctype_element(self, i, declstartpos):        name, j = self._scan_name(i, declstartpos)        if j == -1:            return -1        # style content model; just skip until '>'        rawdata = self.rawdata        if '>' in rawdata[j:]:            return string.find(rawdata, ">", j) + 1        return -1    # Internal -- scan past <!ATTLIST declarations    def _parse_doctype_attlist(self, i, declstartpos):        rawdata = self.rawdata        name, j = self._scan_name(i, declstartpos)        c = rawdata[j:j+1]        if c == "":            return -1        if c == ">":            return j + 1        while 1:            # scan a series of attribute descriptions; simplified:            #   name type [value] [#constraint]            name, j = self._scan_name(j, declstartpos)            if j < 0:                return j            c = rawdata[j:j+1]            if c == "":                return -1            if c == "(":                # an enumerated type; look for ')'                if ")" in rawdata[j:]:                    j = string.find(rawdata, ")", j) + 1                else:                    return -1                while rawdata[j:j+1] in string.whitespace:                    j = j + 1                if not rawdata[j:]:                    # end of buffer, incomplete                    return -1            else:                name, j = self._scan_name(j, declstartpos)            c = rawdata[j:j+1]            if not c:                return -1            if c in "'\"":                m = _declstringlit_match(rawdata, j)                if m:                    j = m.end()                else:                    return -1                c = rawdata[j:j+1]                if not c:                    return -1            if c == "#":                if rawdata[j:] == "#":                    # end of buffer                    return -1                name, j = self._scan_name(j + 1, declstartpos)                if j < 0:                    return j                c = rawdata[j:j+1]                if not c:                    return -1            if c == '>':                # all done                return j + 1    # Internal -- scan past <!NOTATION declarations    def _parse_doctype_notation(self, i, declstartpos):        name, j = self._scan_name(i, declstartpos)        if j < 0:            return j        rawdata = self.rawdata        while 1:            c = rawdata[j:j+1]            if not c:                # end of buffer; incomplete                return -1            if c == '>':                return j + 1            if c in "'\"":                m = _declstringlit_match(rawdata, j)                if not m:                    return -1                j = m.end()            else:                name, j = self._scan_name(j, declstartpos)                if j < 0:                    return j    # Internal -- scan past <!ENTITY declarations    def _parse_doctype_entity(self, i, declstartpos):        rawdata = self.rawdata        if rawdata[i:i+1] == "%":            j = i + 1            while 1:                c = rawdata[j:j+1]                if not c:                    return -1                if c in string.whitespace:                    j = j + 1                else:                    break        else:            j = i        name, j = self._scan_name(j, declstartpos)        if j < 0:            return j        while 1:            c = self.rawdata[j:j+1]            if not c:                return -1            if c in "'\"":                m = _declstringlit_match(rawdata, j)                if m:                    j = m.end()                else:                    return -1    # incomplete            elif c == ">":                return j + 1            else:                name, j = self._scan_name(j, declstartpos)                if j < 0:                    return j    # Internal -- scan a name token and the new position and the token, or    # return -1 if we've reached the end of the buffer.    def _scan_name(self, i, declstartpos):        rawdata = self.rawdata        n = len(rawdata)        if i == n:            return None, -1        m = _declname_match(rawdata, i)        if m:            s = m.group()            name = s.strip()            if (i + len(s)) == n:                return None, -1  # end of buffer            return string.lower(name), m.end()        else:            self.updatepos(declstartpos, i)            self.error("expected name token")    # To be overridden -- handlers for unknown objects    def unknown_decl(self, data):        pass
markupbase.py - 源码说明

本页面展示了「mallet是自然语言处理、机器学习领域的一个开源项目。」中的 markupbase.py 源码文件，采用 Python 编程语言编写，共 318 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与mallet相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?