sre_parse.py

来自「mallet是自然语言处理、机器学习领域的一个开源项目。」· Python 代码 · 共 739 行 · 第 1/2 页

PY
739
字号
    subpattern = SubPattern(state)    while 1:        if source.next in ("|", ")"):            break # end of subpattern        this = source.get()        if this is None:            break # end of pattern        if state.flags & SRE_FLAG_VERBOSE:            # skip whitespace and comments            if this in WHITESPACE:                continue            if this == "#":                while 1:                    this = source.get()                    if this in (None, "\n"):                        break                continue        if this and this[0] not in SPECIAL_CHARS:            subpattern.append((LITERAL, ord(this)))        elif this == "[":            # character set            set = []##          if source.match(":"):##              pass # handle character classes            if source.match("^"):                set.append((NEGATE, None))            # check remaining characters            start = set[:]            while 1:                this = source.get()                if this == "]" and set != start:                    break                elif this and this[0] == "\\":                    code1 = _class_escape(source, this)                elif this:                    code1 = LITERAL, ord(this)                else:                    raise error, "unexpected end of regular expression"                if source.match("-"):                    # potential range                    this = source.get()                    if this == "]":                        if code1[0] is IN:                            code1 = code1[1][0]                        set.append(code1)                        set.append((LITERAL, ord("-")))                        break                    else:                        if this[0] == "\\":                            code2 = _class_escape(source, this)                        else:                            code2 = LITERAL, ord(this)                        if code1[0] != LITERAL or code2[0] != LITERAL:                            raise error, "bad character range"                        lo = code1[1]                        hi = code2[1]                        if hi < lo:                            raise error, "bad character range"                        set.append((RANGE, (lo, hi)))                else:                    if code1[0] is IN:                        code1 = code1[1][0]                    set.append(code1)            # XXX: <fl> should move set optimization to compiler!            if len(set)==1 and set[0][0] is LITERAL:                subpattern.append(set[0]) # optimization            elif len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:                subpattern.append((NOT_LITERAL, set[1][1])) # optimization            else:                # XXX: <fl> should add charmap optimization here                subpattern.append((IN, set))        elif this and this[0] in REPEAT_CHARS:            # repeat previous item            if this == "?":                min, max = 0, 1            elif this == "*":                min, max = 0, MAXREPEAT            elif this == "+":                min, max = 1, MAXREPEAT            elif this == "{":                here = source.tell()                min, max = 0, MAXREPEAT                lo = hi = ""                while source.next in DIGITS:                    lo = lo + source.get()                if source.match(","):                    while source.next in DIGITS:                        hi = hi + source.get()                else:                    hi = lo                if not source.match("}"):                    subpattern.append((LITERAL, ord(this)))                    source.seek(here)                    continue                if lo:                    min = atoi(lo)                if hi:                    max = atoi(hi)                if max < min:                    raise error, "bad repeat interval"            else:                raise error, "not supported"            # figure out which item to repeat            if subpattern:                item = subpattern[-1:]            else:                item = None            if not item or (len(item) == 1 and item[0][0] == AT):                raise error, "nothing to repeat"            if item[0][0] in (MIN_REPEAT, MAX_REPEAT):                raise error, "multiple repeat"            if source.match("?"):                subpattern[-1] = (MIN_REPEAT, (min, max, item))            else:                subpattern[-1] = (MAX_REPEAT, (min, max, item))        elif this == ".":            subpattern.append((ANY, None))        elif this == "(":            group = 1            name = None            if source.match("?"):                group = 0                # options                if source.match("P"):                    # python extensions                    if source.match("<"):                        # named group: skip forward to end of name                        name = ""                        while 1:                            char = source.get()                            if char is None:                                raise error, "unterminated name"                            if char == ">":                                break                            name = name + char                        group = 1                        if not isname(name):                            raise error, "bad character in group name"                    elif source.match("="):                        # named backreference                        name = ""                        while 1:                            char = source.get()                            if char is None:                                raise error, "unterminated name"                            if char == ")":                                break                            name = name + char                        if not isname(name):                            raise error, "bad character in group name"                        gid = state.groupdict.get(name)                        if gid is None:                            raise error, "unknown group name"                        subpattern.append((GROUPREF, gid))                        continue                    else:                        char = source.get()                        if char is None:                            raise error, "unexpected end of pattern"                        raise error, "unknown specifier: ?P%s" % char                elif source.match(":"):                    # non-capturing group                    group = 2                elif source.match("#"):                    # comment                    while 1:                        if source.next is None or source.next == ")":                            break                        source.get()                    if not source.match(")"):                        raise error, "unbalanced parenthesis"                    continue                elif source.next in ("=", "!", "<"):                    # lookahead assertions                    char = source.get()                    dir = 1                    if char == "<":                        if source.next not in ("=", "!"):                            raise error, "syntax error"                        dir = -1 # lookbehind                        char = source.get()                    p = _parse_sub(source, state)                    if not source.match(")"):                        raise error, "unbalanced parenthesis"                    if char == "=":                        subpattern.append((ASSERT, (dir, p)))                    else:                        subpattern.append((ASSERT_NOT, (dir, p)))                    continue                else:                    # flags                    if not FLAGS.has_key(source.next):                        raise error, "unexpected end of pattern"                    while FLAGS.has_key(source.next):                        state.flags = state.flags | FLAGS[source.get()]            if group:                # parse group contents                if group == 2:                    # anonymous group                    group = None                else:                    group = state.opengroup(name)                p = _parse_sub(source, state)                if not source.match(")"):                    raise error, "unbalanced parenthesis"                if group is not None:                    state.closegroup(group)                subpattern.append((SUBPATTERN, (group, p)))            else:                while 1:                    char = source.get()                    if char is None:                        raise error, "unexpected end of pattern"                    if char == ")":                        break                    raise error, "unknown extension"        elif this == "^":            subpattern.append((AT, AT_BEGINNING))        elif this == "$":            subpattern.append((AT, AT_END))        elif this and this[0] == "\\":            code = _escape(source, this, state)            subpattern.append(code)        else:            raise error, "parser error"    return subpatterndef parse(str, flags=0, pattern=None):    # parse 're' pattern into list of (opcode, argument) tuples    source = Tokenizer(str)    if pattern is None:        pattern = Pattern()    pattern.flags = flags    pattern.str = str    p = _parse_sub(source, pattern, 0)    tail = source.get()    if tail == ")":        raise error, "unbalanced parenthesis"    elif tail:        raise error, "bogus characters at end of regular expression"    if flags & SRE_FLAG_DEBUG:        p.dump()    if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:        # the VERBOSE flag was switched on inside the pattern.  to be        # on the safe side, we'll parse the whole thing again...        return parse(str, p.pattern.flags)    return pdef parse_template(source, pattern):    # parse 're' replacement string into list of literals and    # group references    s = Tokenizer(source)    p = []    a = p.append    def literal(literal, p=p):        if p and p[-1][0] is LITERAL:            p[-1] = LITERAL, p[-1][1] + literal        else:            p.append((LITERAL, literal))    sep = source[:0]    if type(sep) is type(""):        makechar = chr    else:        makechar = unichr    while 1:        this = s.get()        if this is None:            break # end of replacement string        if this and this[0] == "\\":            # group            if this == "\\g":                name = ""                if s.match("<"):                    while 1:                        char = s.get()                        if char is None:                            raise error, "unterminated group name"                        if char == ">":                            break                        name = name + char                if not name:                    raise error, "bad group name"                try:                    index = atoi(name)                except ValueError:                    if not isname(name):                        raise error, "bad character in group name"                    try:                        index = pattern.groupindex[name]                    except KeyError:                        raise IndexError, "unknown group name"                a((MARK, index))            elif len(this) > 1 and this[1] in DIGITS:                code = None                while 1:                    group = _group(this, pattern.groups+1)                    if group:                        if (s.next not in DIGITS or                            not _group(this + s.next, pattern.groups+1)):                            code = MARK, group                            break                    elif s.next in OCTDIGITS:                        this = this + s.get()                    else:                        break                if not code:                    this = this[1:]                    code = LITERAL, makechar(atoi(this[-6:], 8) & 0xff)                if code[0] is LITERAL:                    literal(code[1])                else:                    a(code)            else:                try:                    this = makechar(ESCAPES[this][1])                except KeyError:                    pass                literal(this)        else:            literal(this)    # convert template to groups and literals lists    i = 0    groups = []    literals = []    for c, s in p:        if c is MARK:            groups.append((i, s))            literals.append(None)        else:            literals.append(s)        i = i + 1    return groups, literalsdef expand_template(template, match):    g = match.group    sep = match.string[:0]    groups, literals = template    literals = literals[:]    try:        for index, group in groups:            literals[index] = s = g(group)            if s is None:                raise IndexError    except IndexError:        raise error, "empty group"    return string.join(literals, sep)

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?