📄 difflib.py
字号:
# stripped, it's "a" (tied with "b"). UNIX(tm) diff does so # strip, so ends up claiming that ab is changed to acab by # inserting "ca" in the middle. That's minimal but unintuitive: # "it's obvious" that someone inserted "ac" at the front. # Windiff ends up at the same place as diff, but by pairing up # the unique 'b's and then matching the first two 'a's. a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk besti, bestj, bestsize = alo, blo, 0 # find longest junk-free match # during an iteration of the loop, j2len[j] = length of longest # junk-free match ending with a[i-1] and b[j] j2len = {} nothing = [] for i in xrange(alo, ahi): # look at all instances of a[i] in b; note that because # b2j has no junk keys, the loop is skipped if a[i] is junk j2lenget = j2len.get newj2len = {} for j in b2j.get(a[i], nothing): # a[i] matches b[j] if j < blo: continue if j >= bhi: break k = newj2len[j] = j2lenget(j-1, 0) + 1 if k > bestsize: besti, bestj, bestsize = i-k+1, j-k+1, k j2len = newj2len # Now that we have a wholly interesting match (albeit possibly # empty!), we may as well suck up the matching junk on each # side of it too. Can't think of a good reason not to, and it # saves post-processing the (possibly considerable) expense of # figuring out what to do with it. In the case of an empty # interesting match, this is clearly the right thing to do, # because no other kind of match is possible in the regions. while besti > alo and bestj > blo and \ isbjunk(b[bestj-1]) and \ a[besti-1] == b[bestj-1]: besti, bestj, bestsize = besti-1, bestj-1, bestsize+1 while besti+bestsize < ahi and bestj+bestsize < bhi and \ isbjunk(b[bestj+bestsize]) and \ a[besti+bestsize] == b[bestj+bestsize]: bestsize = bestsize + 1 return besti, bestj, bestsize def get_matching_blocks(self): """Return list of triples describing matching subsequences. Each triple is of the form (i, j, n), and means that a[i:i+n] == b[j:j+n]. The triples are monotonically increasing in i and in j. The last triple is a dummy, (len(a), len(b), 0), and is the only triple with n==0. >>> s = SequenceMatcher(None, "abxcd", "abcd") >>> s.get_matching_blocks() [(0, 0, 2), (3, 2, 2), (5, 4, 0)] """ if self.matching_blocks is not None: return self.matching_blocks self.matching_blocks = [] la, lb = len(self.a), len(self.b) self.__helper(0, la, 0, lb, self.matching_blocks) self.matching_blocks.append( (la, lb, 0) ) return self.matching_blocks # builds list of matching blocks covering a[alo:ahi] and # b[blo:bhi], appending them in increasing order to answer def __helper(self, alo, ahi, blo, bhi, answer): i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi) # a[alo:i] vs b[blo:j] unknown # a[i:i+k] same as b[j:j+k] # a[i+k:ahi] vs b[j+k:bhi] unknown if k: if alo < i and blo < j: self.__helper(alo, i, blo, j, answer) answer.append(x) if i+k < ahi and j+k < bhi: self.__helper(i+k, ahi, j+k, bhi, answer) def get_opcodes(self): """Return list of 5-tuples describing how to turn a into b. Each tuple is of the form (tag, i1, i2, j1, j2). The first tuple has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the tuple preceding it, and likewise for j1 == the previous j2. The tags are strings, with these meanings: 'replace': a[i1:i2] should be replaced by b[j1:j2] 'delete': a[i1:i2] should be deleted. Note that j1==j2 in this case. 'insert': b[j1:j2] should be inserted at a[i1:i1]. Note that i1==i2 in this case. 'equal': a[i1:i2] == b[j1:j2] >>> a = "qabxcd" >>> b = "abycdf" >>> s = SequenceMatcher(None, a, b) >>> for tag, i1, i2, j1, j2 in s.get_opcodes(): ... print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" % ... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])) delete a[0:1] (q) b[0:0] () equal a[1:3] (ab) b[0:2] (ab) replace a[3:4] (x) b[2:3] (y) equal a[4:6] (cd) b[3:5] (cd) insert a[6:6] () b[5:6] (f) """ if self.opcodes is not None: return self.opcodes i = j = 0 self.opcodes = answer = [] for ai, bj, size in self.get_matching_blocks(): # invariant: we've pumped out correct diffs to change # a[:i] into b[:j], and the next matching block is # a[ai:ai+size] == b[bj:bj+size]. So we need to pump # out a diff to change a[i:ai] into b[j:bj], pump out # the matching block, and move (i,j) beyond the match tag = '' if i < ai and j < bj: tag = 'replace' elif i < ai: tag = 'delete' elif j < bj: tag = 'insert' if tag: answer.append( (tag, i, ai, j, bj) ) i, j = ai+size, bj+size # the list of matching blocks is terminated by a # sentinel with size 0 if size: answer.append( ('equal', ai, i, bj, j) ) return answer def ratio(self): """Return a measure of the sequences' similarity (float in [0,1]). Where T is the total number of elements in both sequences, and M is the number of matches, this is 2,0*M / T. Note that this is 1 if the sequences are identical, and 0 if they have nothing in common. .ratio() is expensive to compute if you haven't already computed .get_matching_blocks() or .get_opcodes(), in which case you may want to try .quick_ratio() or .real_quick_ratio() first to get an upper bound. >>> s = SequenceMatcher(None, "abcd", "bcde") >>> s.ratio() 0.75 >>> s.quick_ratio() 0.75 >>> s.real_quick_ratio() 1.0 """ matches = reduce(lambda sum, triple: sum + triple[-1], self.get_matching_blocks(), 0) return 2.0 * matches / (len(self.a) + len(self.b)) def quick_ratio(self): """Return an upper bound on ratio() relatively quickly. This isn't defined beyond that it is an upper bound on .ratio(), and is faster to compute. """ # viewing a and b as multisets, set matches to the cardinality # of their intersection; this counts the number of matches # without regard to order, so is clearly an upper bound if self.fullbcount is None: self.fullbcount = fullbcount = {} for elt in self.b: fullbcount[elt] = fullbcount.get(elt, 0) + 1 fullbcount = self.fullbcount # avail[x] is the number of times x appears in 'b' less the # number of times we've seen it in 'a' so far ... kinda avail = {} availhas, matches = avail.has_key, 0 for elt in self.a: if availhas(elt): numb = avail[elt] else: numb = fullbcount.get(elt, 0) avail[elt] = numb - 1 if numb > 0: matches = matches + 1 return 2.0 * matches / (len(self.a) + len(self.b)) def real_quick_ratio(self): """Return an upper bound on ratio() very quickly. This isn't defined beyond that it is an upper bound on .ratio(), and is faster to compute than either .ratio() or .quick_ratio(). """ la, lb = len(self.a), len(self.b) # can't have more matches than the number of elements in the # shorter sequence return 2.0 * min(la, lb) / (la + lb)def get_close_matches(word, possibilities, n=3, cutoff=0.6): """Use SequenceMatcher to return list of the best "good enough" matches. word is a sequence for which close matches are desired (typically a string). possibilities is a list of sequences against which to match word (typically a list of strings). Optional arg n (default 3) is the maximum number of close matches to return. n must be > 0. Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities that don't score at least that similar to word are ignored. The best (no more than n) matches among the possibilities are returned in a list, sorted by similarity score, most similar first. >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"]) ['apple', 'ape'] >>> import keyword as _keyword >>> get_close_matches("wheel", _keyword.kwlist) ['while'] >>> get_close_matches("apple", _keyword.kwlist) [] >>> get_close_matches("accept", _keyword.kwlist) ['except'] """ if not n > 0: raise ValueError("n must be > 0: " + `n`) if not 0.0 <= cutoff <= 1.0: raise ValueError("cutoff must be in [0.0, 1.0]: " + `cutoff`) result = [] s = SequenceMatcher() s.set_seq2(word) for x in possibilities: s.set_seq1(x) if s.real_quick_ratio() >= cutoff and \ s.quick_ratio() >= cutoff and \ s.ratio() >= cutoff: result.append((s.ratio(), x)) # Sort by score. result.sort() # Retain only the best n. result = result[-n:] # Move best-scorer to head of list. result.reverse() # Strip scores. return [x for score, x in result]def _count_leading(line, ch): """ Return number of `ch` characters at the start of `line`. Example: >>> _count_leading(' abc', ' ') 3 """ i, n = 0, len(line) while i < n and line[i] == ch: i += 1 return iclass Differ: r""" Differ is a class for comparing sequences of lines of text, and producing human-readable differences or deltas. Differ uses SequenceMatcher both to compare sequences of lines, and to compare sequences of characters within similar (near-matching) lines. Each line of a Differ delta begins with a two-letter code: '- ' line unique to sequence 1 '+ ' line unique to sequence 2 ' ' line common to both sequences '? ' line not present in either input sequence Lines beginning with '? ' attempt to guide the eye to intraline differences, and were not present in either input sequence. These lines can be confusing if the sequences contain tab characters. Note that Differ makes no claim to produce a *minimal* diff. To the contrary, minimal diffs are often counter-intuitive, because they synch up anywhere possible, sometimes accidental matches 100 pages apart. Restricting synch points to contiguous matches preserves some notion of locality, at the occasional cost of producing a longer diff. Example: Comparing two texts. First we set up the texts, sequences of individual single-line strings ending with newlines (such sequences can also be obtained from the `readlines()` method of file-like objects): >>> text1 = ''' 1. Beautiful is better than ugly. ... 2. Explicit is better than implicit. ... 3. Simple is better than complex. ... 4. Complex is better than complicated. ... '''.splitlines(1) >>> len(text1) 4 >>> text1[0][-1] '\n' >>> text2 = ''' 1. Beautiful is better than ugly. ... 3. Simple is better than complex. ... 4. Complicated is better than complex. ... 5. Flat is better than nested. ... '''.splitlines(1) Next we instantiate a Differ object: >>> d = Differ() Note that when instantiating a Differ object we may pass functions to filter out line and character 'junk'. See Differ.__init__ for details. Finally, we compare the two: >>> result = list(d.compare(text1, text2)) 'result' is a list of strings, so let's pretty-print it: >>> from pprint import pprint as _pprint >>> _pprint(result) [' 1. Beautiful is better than ugly.\n', '- 2. Explicit is better than implicit.\n', '- 3. Simple is better than complex.\n', '+ 3. Simple is better than complex.\n', '? ++\n', '- 4. Complex is better than complicated.\n', '? ^ ---- ^\n', '+ 4. Complicated is better than complex.\n', '? ++++ ^ ^\n', '+ 5. Flat is better than nested.\n'] As a single multi-line string it looks like this: >>> print ''.join(result), 1. Beautiful is better than ugly. - 2. Explicit is better than implicit. - 3. Simple is better than complex. + 3. Simple is better than complex. ? ++ - 4. Complex is better than complicated. ? ^ ---- ^ + 4. Complicated is better than complex. ? ++++ ^ ^ + 5. Flat is better than nested. Methods:
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -