📄 difflib.py
字号:
__init__(linejunk=None, charjunk=None) Construct a text differencer, with optional filters. compare(a, b) Compare two sequences of lines; generate the resulting delta. """ def __init__(self, linejunk=None, charjunk=None): """ Construct a text differencer, with optional filters. The two optional keyword parameters are for filter functions: - `linejunk`: A function that should accept a single string argument, and return true iff the string is junk. The module-level function `IS_LINE_JUNK` may be used to filter out lines without visible characters, except for at most one splat ('#'). - `charjunk`: A function that should accept a string of length 1. The module-level function `IS_CHARACTER_JUNK` may be used to filter out whitespace characters (a blank or tab; **note**: bad idea to include newline in this!). """ self.linejunk = linejunk self.charjunk = charjunk def compare(self, a, b): r""" Compare two sequences of lines; generate the resulting delta. Each sequence must contain individual single-line strings ending with newlines. Such sequences can be obtained from the `readlines()` method of file-like objects. The delta generated also consists of newline- terminated strings, ready to be printed as-is via the writeline() method of a file-like object. Example: >>> print ''.join(Differ().compare('one\ntwo\nthree\n'.splitlines(1), ... 'ore\ntree\nemu\n'.splitlines(1))), - one ? ^ + ore ? ^ - two - three ? - + tree + emu """ cruncher = SequenceMatcher(self.linejunk, a, b) for tag, alo, ahi, blo, bhi in cruncher.get_opcodes(): if tag == 'replace': g = self._fancy_replace(a, alo, ahi, b, blo, bhi) elif tag == 'delete': g = self._dump('-', a, alo, ahi) elif tag == 'insert': g = self._dump('+', b, blo, bhi) elif tag == 'equal': g = self._dump(' ', a, alo, ahi) else: raise ValueError, 'unknown tag ' + `tag` for line in g: yield line def _dump(self, tag, x, lo, hi): """Generate comparison results for a same-tagged range.""" for i in xrange(lo, hi): yield '%s %s' % (tag, x[i]) def _plain_replace(self, a, alo, ahi, b, blo, bhi): assert alo < ahi and blo < bhi # dump the shorter block first -- reduces the burden on short-term # memory if the blocks are of very different sizes if bhi - blo < ahi - alo: first = self._dump('+', b, blo, bhi) second = self._dump('-', a, alo, ahi) else: first = self._dump('-', a, alo, ahi) second = self._dump('+', b, blo, bhi) for g in first, second: for line in g: yield line def _fancy_replace(self, a, alo, ahi, b, blo, bhi): r""" When replacing one block of lines with another, search the blocks for *similar* lines; the best-matching pair (if any) is used as a synch point, and intraline difference marking is done on the similar pair. Lots of work, but often worth it. Example: >>> d = Differ() >>> d._fancy_replace(['abcDefghiJkl\n'], 0, 1, ['abcdefGhijkl\n'], 0, 1) >>> print ''.join(d.results), - abcDefghiJkl ? ^ ^ ^ + abcdefGhijkl ? ^ ^ ^ """ # don't synch up unless the lines have a similarity score of at # least cutoff; best_ratio tracks the best score seen so far best_ratio, cutoff = 0.74, 0.75 cruncher = SequenceMatcher(self.charjunk) eqi, eqj = None, None # 1st indices of equal lines (if any) # search for the pair that matches best without being identical # (identical lines must be junk lines, & we don't want to synch up # on junk -- unless we have to) for j in xrange(blo, bhi): bj = b[j] cruncher.set_seq2(bj) for i in xrange(alo, ahi): ai = a[i] if ai == bj: if eqi is None: eqi, eqj = i, j continue cruncher.set_seq1(ai) # computing similarity is expensive, so use the quick # upper bounds first -- have seen this speed up messy # compares by a factor of 3. # note that ratio() is only expensive to compute the first # time it's called on a sequence pair; the expensive part # of the computation is cached by cruncher if cruncher.real_quick_ratio() > best_ratio and \ cruncher.quick_ratio() > best_ratio and \ cruncher.ratio() > best_ratio: best_ratio, best_i, best_j = cruncher.ratio(), i, j if best_ratio < cutoff: # no non-identical "pretty close" pair if eqi is None: # no identical pair either -- treat it as a straight replace for line in self._plain_replace(a, alo, ahi, b, blo, bhi): yield line return # no close pair, but an identical pair -- synch up on that best_i, best_j, best_ratio = eqi, eqj, 1.0 else: # there's a close pair, so forget the identical pair (if any) eqi = None # a[best_i] very similar to b[best_j]; eqi is None iff they're not # identical # pump out diffs from before the synch point for line in self._fancy_helper(a, alo, best_i, b, blo, best_j): yield line # do intraline marking on the synch pair aelt, belt = a[best_i], b[best_j] if eqi is None: # pump out a '-', '?', '+', '?' quad for the synched lines atags = btags = "" cruncher.set_seqs(aelt, belt) for tag, ai1, ai2, bj1, bj2 in cruncher.get_opcodes(): la, lb = ai2 - ai1, bj2 - bj1 if tag == 'replace': atags += '^' * la btags += '^' * lb elif tag == 'delete': atags += '-' * la elif tag == 'insert': btags += '+' * lb elif tag == 'equal': atags += ' ' * la btags += ' ' * lb else: raise ValueError, 'unknown tag ' + `tag` for line in self._qformat(aelt, belt, atags, btags): yield line else: # the synch pair is identical yield ' ' + aelt # pump out diffs from after the synch point for line in self._fancy_helper(a, best_i+1, ahi, b, best_j+1, bhi): yield line def _fancy_helper(self, a, alo, ahi, b, blo, bhi): g = [] if alo < ahi: if blo < bhi: g = self._fancy_replace(a, alo, ahi, b, blo, bhi) else: g = self._dump('-', a, alo, ahi) elif blo < bhi: g = self._dump('+', b, blo, bhi) for line in g: yield line def _qformat(self, aline, bline, atags, btags): r""" Format "?" output and deal with leading tabs. Example: >>> d = Differ() >>> d._qformat('\tabcDefghiJkl\n', '\t\tabcdefGhijkl\n', ... ' ^ ^ ^ ', '+ ^ ^ ^ ') >>> for line in d.results: print repr(line) ... '- \tabcDefghiJkl\n' '? \t ^ ^ ^\n' '+ \t\tabcdefGhijkl\n' '? \t ^ ^ ^\n' """ # Can hurt, but will probably help most of the time. common = min(_count_leading(aline, "\t"), _count_leading(bline, "\t")) common = min(common, _count_leading(atags[:common], " ")) atags = atags[common:].rstrip() btags = btags[common:].rstrip() yield "- " + aline if atags: yield "? %s%s\n" % ("\t" * common, atags) yield "+ " + bline if btags: yield "? %s%s\n" % ("\t" * common, btags)# With respect to junk, an earlier version of ndiff simply refused to# *start* a match with a junk element. The result was cases like this:# before: private Thread currentThread;# after: private volatile Thread currentThread;# If you consider whitespace to be junk, the longest contiguous match# not starting with junk is "e Thread currentThread". So ndiff reported# that "e volatil" was inserted between the 't' and the 'e' in "private".# While an accurate view, to people that's absurd. The current version# looks for matching blocks that are entirely junk-free, then extends the# longest one of those as far as possible but only with matching junk.# So now "currentThread" is matched, then extended to suck up the# preceding blank; then "private" is matched, and extended to suck up the# following blank; then "Thread" is matched; and finally ndiff reports# that "volatile " was inserted before "Thread". The only quibble# remaining is that perhaps it was really the case that " volatile"# was inserted after "private". I can live with that <wink>.import redef IS_LINE_JUNK(line, pat=re.compile(r"\s*#?\s*$").match): r""" Return 1 for ignorable line: iff `line` is blank or contains a single '#'. Examples: >>> IS_LINE_JUNK('\n') 1 >>> IS_LINE_JUNK(' # \n') 1 >>> IS_LINE_JUNK('hello\n') 0 """ return pat(line) is not Nonedef IS_CHARACTER_JUNK(ch, ws=" \t"): r""" Return 1 for ignorable character: iff `ch` is a space or tab. Examples: >>> IS_CHARACTER_JUNK(' ') 1 >>> IS_CHARACTER_JUNK('\t') 1 >>> IS_CHARACTER_JUNK('\n') 0 >>> IS_CHARACTER_JUNK('x') 0 """ return ch in wsdel redef ndiff(a, b, linejunk=IS_LINE_JUNK, charjunk=IS_CHARACTER_JUNK): r""" Compare `a` and `b` (lists of strings); return a `Differ`-style delta. Optional keyword parameters `linejunk` and `charjunk` are for filter functions (or None): - linejunk: A function that should accept a single string argument, and return true iff the string is junk. The default is module-level function IS_LINE_JUNK, which filters out lines without visible characters, except for at most one splat ('#'). - charjunk: A function that should accept a string of length 1. The default is module-level function IS_CHARACTER_JUNK, which filters out whitespace characters (a blank or tab; note: bad idea to include newline in this!). Tools/scripts/ndiff.py is a command-line front-end to this function. Example: >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), ... 'ore\ntree\nemu\n'.splitlines(1)) >>> print ''.join(diff), - one ? ^ + ore ? ^ - two - three ? - + tree + emu """ return Differ(linejunk, charjunk).compare(a, b)def restore(delta, which): r""" Generate one of the two sequences that generated a delta. Given a `delta` produced by `Differ.compare()` or `ndiff()`, extract lines originating from file 1 or 2 (parameter `which`), stripping off line prefixes. Examples: >>> diff = ndiff('one\ntwo\nthree\n'.splitlines(1), ... 'ore\ntree\nemu\n'.splitlines(1)) >>> diff = list(diff) >>> print ''.join(restore(diff, 1)), one two three >>> print ''.join(restore(diff, 2)), ore tree emu """ try: tag = {1: "- ", 2: "+ "}[int(which)] except KeyError: raise ValueError, ('unknown delta choice (must be 1 or 2): %r' % which) prefixes = (" ", tag) for line in delta: if line[:2] in prefixes: yield line[2:]def _test(): import doctest, difflib return doctest.testmod(difflib)if __name__ == "__main__": _test()
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -