sre_parse.py

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Python 代码 · 共 739 行 · 第 1/2 页

PY
739
字号
## Secret Labs' Regular Expression Engine## convert re-style regular expression to sre pattern## Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.## See the sre.py file for information on usage and redistribution.#"""Internal support module for sre"""# XXX: show string offset and offending character for all errors# this module works under 1.5.2 and later.  don't use string methodsimport string, sysfrom sre_constants import *SPECIAL_CHARS = ".\\[{()*+?^$|"REPEAT_CHARS = "*+?{"DIGITS = tuple("0123456789")OCTDIGITS = tuple("01234567")HEXDIGITS = tuple("0123456789abcdefABCDEF")WHITESPACE = tuple(" \t\n\r\v\f")ESCAPES = {    r"\a": (LITERAL, ord("\a")),    r"\b": (LITERAL, ord("\b")),    r"\f": (LITERAL, ord("\f")),    r"\n": (LITERAL, ord("\n")),    r"\r": (LITERAL, ord("\r")),    r"\t": (LITERAL, ord("\t")),    r"\v": (LITERAL, ord("\v")),    r"\\": (LITERAL, ord("\\"))}CATEGORIES = {    r"\A": (AT, AT_BEGINNING_STRING), # start of string    r"\b": (AT, AT_BOUNDARY),    r"\B": (AT, AT_NON_BOUNDARY),    r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),    r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),    r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),    r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),    r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),    r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),    r"\Z": (AT, AT_END_STRING), # end of string}FLAGS = {    # standard flags    "i": SRE_FLAG_IGNORECASE,    "L": SRE_FLAG_LOCALE,    "m": SRE_FLAG_MULTILINE,    "s": SRE_FLAG_DOTALL,    "x": SRE_FLAG_VERBOSE,    # extensions    "t": SRE_FLAG_TEMPLATE,    "u": SRE_FLAG_UNICODE,}# figure out best way to convert hex/octal numbers to integerstry:    int("10", 8)    atoi = int # 2.0 and laterexcept TypeError:    atoi = string.atoi # 1.5.2class Pattern:    # master pattern object.  keeps track of global attributes    def __init__(self):        self.flags = 0        self.open = []        self.groups = 1        self.groupdict = {}    def opengroup(self, name=None):        gid = self.groups        self.groups = gid + 1        if name:            ogid = self.groupdict.get(name, None)            if ogid is not None:                raise error, ("redefinition of group name %s as group %d; "                              "was group %d" % (repr(name), gid,  ogid))            self.groupdict[name] = gid        self.open.append(gid)        return gid    def closegroup(self, gid):        self.open.remove(gid)    def checkgroup(self, gid):        return gid < self.groups and gid not in self.openclass SubPattern:    # a subpattern, in intermediate form    def __init__(self, pattern, data=None):        self.pattern = pattern        if not data:            data = []        self.data = data        self.width = None    def dump(self, level=0):        nl = 1        for op, av in self.data:            print level*"  " + op,; nl = 0            if op == "in":                # member sublanguage                print; nl = 1                for op, a in av:                    print (level+1)*"  " + op, a            elif op == "branch":                print; nl = 1                i = 0                for a in av[1]:                    if i > 0:                        print level*"  " + "or"                    a.dump(level+1); nl = 1                    i = i + 1            elif type(av) in (type(()), type([])):                for a in av:                    if isinstance(a, SubPattern):                        if not nl: print                        a.dump(level+1); nl = 1                    else:                        print a, ; nl = 0            else:                print av, ; nl = 0            if not nl: print    def __repr__(self):        return repr(self.data)    def __len__(self):        return len(self.data)    def __delitem__(self, index):        del self.data[index]    def __getitem__(self, index):        return self.data[index]    def __setitem__(self, index, code):        self.data[index] = code    def __getslice__(self, start, stop):        return SubPattern(self.pattern, self.data[start:stop])    def insert(self, index, code):        self.data.insert(index, code)    def append(self, code):        self.data.append(code)    def getwidth(self):        # determine the width (min, max) for this subpattern        if self.width:            return self.width        lo = hi = 0L        for op, av in self.data:            if op is BRANCH:                i = sys.maxint                j = 0                for av in av[1]:                    l, h = av.getwidth()                    i = min(i, l)                    j = max(j, h)                lo = lo + i                hi = hi + j            elif op is CALL:                i, j = av.getwidth()                lo = lo + i                hi = hi + j            elif op is SUBPATTERN:                i, j = av[1].getwidth()                lo = lo + i                hi = hi + j            elif op in (MIN_REPEAT, MAX_REPEAT):                i, j = av[2].getwidth()                lo = lo + long(i) * av[0]                hi = hi + long(j) * av[1]            elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):                lo = lo + 1                hi = hi + 1            elif op == SUCCESS:                break        self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))        return self.widthclass Tokenizer:    def __init__(self, string):        self.string = string        self.index = 0        self.__next()    def __next(self):        if self.index >= len(self.string):            self.next = None            return        char = self.string[self.index]        if char[0] == "\\":            try:                c = self.string[self.index + 1]            except IndexError:                raise error, "bogus escape (end of line)"            char = char + c        self.index = self.index + len(char)        self.next = char    def match(self, char, skip=1):        if char == self.next:            if skip:                self.__next()            return 1        return 0    def get(self):        this = self.next        self.__next()        return this    def tell(self):        return self.index, self.next    def seek(self, index):        self.index, self.next = indexdef isident(char):    return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"def isdigit(char):    return "0" <= char <= "9"def isname(name):    # check that group name is a valid string    if not isident(name[0]):        return 0    for char in name:        if not isident(char) and not isdigit(char):            return 0    return 1def _group(escape, groups):    # check if the escape string represents a valid group    try:        gid = atoi(escape[1:])        if gid and gid < groups:            return gid    except ValueError:        pass    return None # not a valid groupdef _class_escape(source, escape):    # handle escape code inside character class    code = ESCAPES.get(escape)    if code:        return code    code = CATEGORIES.get(escape)    if code:        return code    try:        if escape[1:2] == "x":            # hexadecimal escape (exactly two digits)            while source.next in HEXDIGITS and len(escape) < 4:                escape = escape + source.get()            escape = escape[2:]            if len(escape) != 2:                raise error, "bogus escape: %s" % repr("\\" + escape)            return LITERAL, atoi(escape, 16) & 0xff        elif escape[1:2] in OCTDIGITS:            # octal escape (up to three digits)            while source.next in OCTDIGITS and len(escape) < 5:                escape = escape + source.get()            escape = escape[1:]            return LITERAL, atoi(escape, 8) & 0xff        if len(escape) == 2:            return LITERAL, ord(escape[1])    except ValueError:        pass    raise error, "bogus escape: %s" % repr(escape)def _escape(source, escape, state):    # handle escape code in expression    code = CATEGORIES.get(escape)    if code:        return code    code = ESCAPES.get(escape)    if code:        return code    try:        if escape[1:2] == "x":            # hexadecimal escape            while source.next in HEXDIGITS and len(escape) < 4:                escape = escape + source.get()            if len(escape) != 4:                raise ValueError            return LITERAL, atoi(escape[2:], 16) & 0xff        elif escape[1:2] == "0":            # octal escape            while source.next in OCTDIGITS and len(escape) < 4:                escape = escape + source.get()            return LITERAL, atoi(escape[1:], 8) & 0xff        elif escape[1:2] in DIGITS:            # octal escape *or* decimal group reference (sigh)            here = source.tell()            if source.next in DIGITS:                escape = escape + source.get()                if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and                    source.next in OCTDIGITS):                    # got three octal digits; this is an octal escape                    escape = escape + source.get()                    return LITERAL, atoi(escape[1:], 8) & 0xff            # got at least one decimal digit; this is a group reference            group = _group(escape, state.groups)            if group:                if not state.checkgroup(group):                    raise error, "cannot refer to open group"                return GROUPREF, group            raise ValueError        if len(escape) == 2:            return LITERAL, ord(escape[1])    except ValueError:        pass    raise error, "bogus escape: %s" % repr(escape)def _parse_sub(source, state, nested=1):    # parse an alternation: a|b|c    items = []    while 1:        items.append(_parse(source, state))        if source.match("|"):            continue        if not nested:            break        if not source.next or source.match(")", 0):            break        else:            raise error, "pattern not properly closed"    if len(items) == 1:        return items[0]    subpattern = SubPattern(state)    # check if all items share a common prefix    while 1:        prefix = None        for item in items:            if not item:                break            if prefix is None:                prefix = item[0]            elif item[0] != prefix:                break        else:            # all subitems start with a common "prefix".            # move it out of the branch            for item in items:                del item[0]            subpattern.append(prefix)            continue # check next one        break    # check if the branch can be replaced by a character set    for item in items:        if len(item) != 1 or item[0][0] != LITERAL:            break    else:        # we can store this as a character set instead of a        # branch (the compiler may optimize this even more)        set = []        for item in items:            set.append(item[0])        subpattern.append((IN, set))        return subpattern    subpattern.append((BRANCH, (None, items)))    return subpatterndef _parse(source, state):    # parse a simple pattern

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?