📄 difflib.py

📁 mallet是自然语言处理、机器学习领域的一个开源项目。
💻 PY
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
    __init__(linejunk=None, charjunk=None)        Construct a text differencer, with optional filters.    compare(a, b)        Compare two sequences of lines; generate the resulting delta.    """    def __init__(self, linejunk=None, charjunk=None):        """        Construct a text differencer, with optional filters.        The two optional keyword parameters are for filter functions:        - `linejunk`: A function that should accept a single string argument,          and return true iff the string is junk. The module-level function          `IS_LINE_JUNK` may be used to filter out lines without visible          characters, except for at most one splat ('#').        - `charjunk`: A function that should accept a string of length 1. The          module-level function `IS_CHARACTER_JUNK` may be used to filter out          whitespace characters (a blank or tab; **note**: bad idea to include          newline in this!).        """        self.linejunk = linejunk        self.charjunk = charjunk    def compare(self, a, b):        r"""        Compare two sequences of lines; generate the resulting delta.        Each sequence must contain individual single-line strings ending with        newlines. Such sequences can be obtained from the `readlines()` method        of file-like objects.  The delta generated also consists of newline-        terminated strings, ready to be printed as-is via the writeline()        method of a file-like object.        Example:        >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1),        ...                                'ore\ntree\nemu\n'.splitlines(1))),        - one        ?  ^        + ore        ?  ^        - two        - three        ?  -        + tree        + emu        """        cruncher = SequenceMatcher(self.linejunk, a, b)        for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():            if tag == 'replace':                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)            elif tag == 'delete':                g = self._dump('-', a, alo, ahi)            elif tag == 'insert':                g = self._dump('+', b, blo, bhi)            elif tag == 'equal':                g = self._dump(' ', a, alo, ahi)            else:                raise ValueError, 'unknown tag ' + `tag`            for line in g:                yield line    def _dump(self, tag, x, lo, hi):        """Generate comparison results for a same-tagged range."""        for i in xrange(lo, hi):            yield '%s %s' % (tag, x[i])    def _plain_replace(self, a, alo, ahi, b, blo, bhi):        assert alo < ahi and blo < bhi        # dump the shorter block first -- reduces the burden on short-term        # memory if the blocks are of very different sizes        if bhi - blo < ahi - alo:            first  = self._dump('+', b, blo, bhi)            second = self._dump('-', a, alo, ahi)        else:            first  = self._dump('-', a, alo, ahi)            second = self._dump('+', b, blo, bhi)        for g in first, second:            for line in g:                yield line    def _fancy_replace(self, a, alo, ahi, b, blo, bhi):        r"""        When replacing one block of lines with another, search the blocks        for *similar* lines; the best-matching pair (if any) is used as a        synch point, and intraline difference marking is done on the        similar pair. Lots of work, but often worth it.        Example:        >>> d = Differ()        >>> d._fancy_replace(['abcDefghiJkl\n'], 0, 1, ['abcdefGhijkl\n'], 0, 1)        >>> print ''.join(d.results),        - abcDefghiJkl        ?    ^  ^  ^        + abcdefGhijkl        ?    ^  ^  ^        """        # don't synch up unless the lines have a similarity score of at        # least cutoff; best_ratio tracks the best score seen so far        best_ratio, cutoff = 0.74, 0.75        cruncher = SequenceMatcher(self.charjunk)        eqi, eqj = None, None   # 1st indices of equal lines (if any)        # search for the pair that matches best without being identical        # (identical lines must be junk lines, & we don't want to synch up        # on junk -- unless we have to)        for j in xrange(blo, bhi):            bj = b[j]            cruncher.set_seq2(bj)            for i in xrange(alo, ahi):                ai = a[i]                if ai == bj:                    if eqi is None:                        eqi, eqj = i, j                    continue                cruncher.set_seq1(ai)                # computing similarity is expensive, so use the quick                # upper bounds first -- have seen this speed up messy                # compares by a factor of 3.                # note that ratio() is only expensive to compute the first                # time it's called on a sequence pair; the expensive part                # of the computation is cached by cruncher                if cruncher.real_quick_ratio() > best_ratio and \                      cruncher.quick_ratio() > best_ratio and \                      cruncher.ratio() > best_ratio:                    best_ratio, best_i, best_j = cruncher.ratio(), i, j        if best_ratio < cutoff:            # no non-identical "pretty close" pair            if eqi is None:                # no identical pair either -- treat it as a straight replace                for line in self._plain_replace(a, alo, ahi, b, blo, bhi):                    yield line                return            # no close pair, but an identical pair -- synch up on that            best_i, best_j, best_ratio = eqi, eqj, 1.0        else:            # there's a close pair, so forget the identical pair (if any)            eqi = None        # a[best_i] very similar to b[best_j]; eqi is None iff they're not        # identical        # pump out diffs from before the synch point        for line in self._fancy_helper(a, alo, best_i, b, blo, best_j):            yield line        # do intraline marking on the synch pair        aelt, belt = a[best_i], b[best_j]        if eqi is None:            # pump out a '-', '?', '+', '?' quad for the synched lines            atags = btags = ""            cruncher.set_seqs(aelt, belt)            for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes():                la, lb = ai2 - ai1, bj2 - bj1                if tag == 'replace':                    atags += '^' * la                    btags += '^' * lb                elif tag == 'delete':                    atags += '-' * la                elif tag == 'insert':                    btags += '+' * lb                elif tag == 'equal':                    atags += ' ' * la                    btags += ' ' * lb                else:                    raise ValueError, 'unknown tag ' + `tag`            for line in self._qformat(aelt, belt, atags, btags):                yield line        else:            # the synch pair is identical            yield '  ' + aelt        # pump out diffs from after the synch point        for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi):            yield line    def _fancy_helper(self, a, alo, ahi, b, blo, bhi):        g = []        if alo < ahi:            if blo < bhi:                g = self._fancy_replace(a, alo, ahi, b, blo, bhi)            else:                g = self._dump('-', a, alo, ahi)        elif blo < bhi:            g = self._dump('+', b, blo, bhi)        for line in g:            yield line    def _qformat(self, aline, bline, atags, btags):        r"""        Format "?" output and deal with leading tabs.        Example:        >>> d = Differ()        >>> d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n',        ...            '  ^ ^  ^      ', '+  ^ ^  ^      ')        >>> for line in d.results: print repr(line)        ...        '- \tabcDefghiJkl\n'        '? \t ^ ^  ^\n'        '+ \t\tabcdefGhijkl\n'        '? \t  ^ ^  ^\n'        """        # Can hurt, but will probably help most of the time.        common = min(_count_leading(aline, "\t"),                     _count_leading(bline, "\t"))        common = min(common, _count_leading(atags[:common], " "))        atags = atags[common:].rstrip()        btags = btags[common:].rstrip()        yield "- " + aline        if atags:            yield "? %s%s\n" % ("\t" * common, atags)        yield "+ " + bline        if btags:            yield "? %s%s\n" % ("\t" * common, btags)# With respect to junk, an earlier version of ndiff simply refused to# *start* a match with a junk element.  The result was cases like this:#     before: private Thread currentThread;#     after:  private volatile Thread currentThread;# If you consider whitespace to be junk, the longest contiguous match# not starting with junk is "e Thread currentThread".  So ndiff reported# that "e volatil" was inserted between the 't' and the 'e' in "private".# While an accurate view, to people that's absurd.  The current version# looks for matching blocks that are entirely junk-free, then extends the# longest one of those as far as possible but only with matching junk.# So now "currentThread" is matched, then extended to suck up the# preceding blank; then "private" is matched, and extended to suck up the# following blank; then "Thread" is matched; and finally ndiff reports# that "volatile " was inserted before "Thread".  The only quibble# remaining is that perhaps it was really the case that " volatile"# was inserted after "private".  I can live with that <wink>.import redef IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match):    r"""    Return 1 for ignorable line: iff `line` is blank or contains a single '#'.    Examples:    >>> IS_LINE_JUNK('\n')    1    >>> IS_LINE_JUNK('  #   \n')    1    >>> IS_LINE_JUNK('hello\n')    0    """    return pat(line) is not Nonedef IS_CHARACTER_JUNK(ch, ws=" \t"):    r"""    Return 1 for ignorable character: iff `ch` is a space or tab.    Examples:    >>> IS_CHARACTER_JUNK(' ')    1    >>> IS_CHARACTER_JUNK('\t')    1    >>> IS_CHARACTER_JUNK('\n')    0    >>> IS_CHARACTER_JUNK('x')    0    """    return ch in wsdel redef ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK):    r"""    Compare `a` and `b` (lists of strings); return a `Differ`-style delta.    Optional keyword parameters `linejunk` and `charjunk` are for filter    functions (or None):    - linejunk: A function that should accept a single string argument, and      return true iff the string is junk. The default is module-level function      IS_LINE_JUNK, which filters out lines without visible characters, except      for at most one splat ('#').    - charjunk: A function that should accept a string of length 1. The      default is module-level function IS_CHARACTER_JUNK, which filters out      whitespace characters (a blank or tab; note: bad idea to include newline      in this!).    Tools/scripts/ndiff.py is a command-line front-end to this function.    Example:    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),    ...              'ore\ntree\nemu\n'.splitlines(1))    >>> print ''.join(diff),    - one    ?  ^    + ore    ?  ^    - two    - three    ?  -    + tree    + emu    """    return Differ(linejunk, charjunk).compare(a, b)def restore(delta, which):    r"""    Generate one of the two sequences that generated a delta.    Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract    lines originating from file 1 or 2 (parameter `which`), stripping off line    prefixes.    Examples:    >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1),    ...              'ore\ntree\nemu\n'.splitlines(1))    >>> diff = list(diff)    >>> print ''.join(restore(diff, 1)),    one    two    three    >>> print ''.join(restore(diff, 2)),    ore    tree    emu    """    try:        tag = {1: "- ", 2: "+ "}[int(which)]    except KeyError:        raise ValueError, ('unknown delta choice (must be 1 or 2): %r'                           % which)    prefixes = ("  ", tag)    for line in delta:        if line[:2] in prefixes:            yield line[2:]def _test():    import doctest, difflib    return doctest.testmod(difflib)if __name__ == "__main__":    _test()
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -