📄 difflib.py

📁 mallet是自然语言处理、机器学习领域的一个开源项目。
💻 PY
📖 第 1 页 / 共 3 页
字号:
        # stripped, it's "a" (tied with "b").  UNIX(tm) diff does so        # strip, so ends up claiming that ab is changed to acab by        # inserting "ca" in the middle.  That's minimal but unintuitive:        # "it's obvious" that someone inserted "ac" at the front.        # Windiff ends up at the same place as diff, but by pairing up        # the unique 'b's and then matching the first two 'a's.        a, b, b2j, isbjunk = self.a, self.b, self.b2j, self.isbjunk        besti, bestj, bestsize = alo, blo, 0        # find longest junk-free match        # during an iteration of the loop, j2len[j] = length of longest        # junk-free match ending with a[i-1] and b[j]        j2len = {}        nothing = []        for i in xrange(alo, ahi):            # look at all instances of a[i] in b; note that because            # b2j has no junk keys, the loop is skipped if a[i] is junk            j2lenget = j2len.get            newj2len = {}            for j in b2j.get(a[i], nothing):                # a[i] matches b[j]                if j < blo:                    continue                if j >= bhi:                    break                k = newj2len[j] = j2lenget(j-1, 0) + 1                if k > bestsize:                    besti, bestj, bestsize = i-k+1, j-k+1, k            j2len = newj2len        # Now that we have a wholly interesting match (albeit possibly        # empty!), we may as well suck up the matching junk on each        # side of it too.  Can't think of a good reason not to, and it        # saves post-processing the (possibly considerable) expense of        # figuring out what to do with it.  In the case of an empty        # interesting match, this is clearly the right thing to do,        # because no other kind of match is possible in the regions.        while besti > alo and bestj > blo and \              isbjunk(b[bestj-1]) and \              a[besti-1] == b[bestj-1]:            besti, bestj, bestsize = besti-1, bestj-1, bestsize+1        while besti+bestsize < ahi and bestj+bestsize < bhi and \              isbjunk(b[bestj+bestsize]) and \              a[besti+bestsize] == b[bestj+bestsize]:            bestsize = bestsize + 1        return besti, bestj, bestsize    def get_matching_blocks(self):        """Return list of triples describing matching subsequences.        Each triple is of the form (i, j, n), and means that        a[i:i+n] == b[j:j+n].  The triples are monotonically increasing in        i and in j.        The last triple is a dummy, (len(a), len(b), 0), and is the only        triple with n==0.        >>> s = SequenceMatcher(None, "abxcd", "abcd")        >>> s.get_matching_blocks()        [(0, 0, 2), (3, 2, 2), (5, 4, 0)]        """        if self.matching_blocks is not None:            return self.matching_blocks        self.matching_blocks = []        la, lb = len(self.a), len(self.b)        self.__helper(0, la, 0, lb, self.matching_blocks)        self.matching_blocks.append( (la, lb, 0) )        return self.matching_blocks    # builds list of matching blocks covering a[alo:ahi] and    # b[blo:bhi], appending them in increasing order to answer    def __helper(self, alo, ahi, blo, bhi, answer):        i, j, k = x = self.find_longest_match(alo, ahi, blo, bhi)        # a[alo:i] vs b[blo:j] unknown        # a[i:i+k] same as b[j:j+k]        # a[i+k:ahi] vs b[j+k:bhi] unknown        if k:            if alo < i and blo < j:                self.__helper(alo, i, blo, j, answer)            answer.append(x)            if i+k < ahi and j+k < bhi:                self.__helper(i+k, ahi, j+k, bhi, answer)    def get_opcodes(self):        """Return list of 5-tuples describing how to turn a into b.        Each tuple is of the form (tag, i1, i2, j1, j2).  The first tuple        has i1 == j1 == 0, and remaining tuples have i1 == the i2 from the        tuple preceding it, and likewise for j1 == the previous j2.        The tags are strings, with these meanings:        'replace':  a[i1:i2] should be replaced by b[j1:j2]        'delete':   a[i1:i2] should be deleted.                    Note that j1==j2 in this case.        'insert':   b[j1:j2] should be inserted at a[i1:i1].                    Note that i1==i2 in this case.        'equal':    a[i1:i2] == b[j1:j2]        >>> a = "qabxcd"        >>> b = "abycdf"        >>> s = SequenceMatcher(None, a, b)        >>> for tag, i1, i2, j1, j2 in s.get_opcodes():        ...    print ("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %        ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2]))         delete a[0:1] (q) b[0:0] ()          equal a[1:3] (ab) b[0:2] (ab)        replace a[3:4] (x) b[2:3] (y)          equal a[4:6] (cd) b[3:5] (cd)         insert a[6:6] () b[5:6] (f)        """        if self.opcodes is not None:            return self.opcodes        i = j = 0        self.opcodes = answer = []        for ai, bj, size in self.get_matching_blocks():            # invariant:  we've pumped out correct diffs to change            # a[:i] into b[:j], and the next matching block is            # a[ai:ai+size] == b[bj:bj+size].  So we need to pump            # out a diff to change a[i:ai] into b[j:bj], pump out            # the matching block, and move (i,j) beyond the match            tag = ''            if i < ai and j < bj:                tag = 'replace'            elif i < ai:                tag = 'delete'            elif j < bj:                tag = 'insert'            if tag:                answer.append( (tag, i, ai, j, bj) )            i, j = ai+size, bj+size            # the list of matching blocks is terminated by a            # sentinel with size 0            if size:                answer.append( ('equal', ai, i, bj, j) )        return answer    def ratio(self):        """Return a measure of the sequences' similarity (float in [0,1]).        Where T is the total number of elements in both sequences, and        M is the number of matches, this is 2,0*M / T.        Note that this is 1 if the sequences are identical, and 0 if        they have nothing in common.        .ratio() is expensive to compute if you haven't already computed        .get_matching_blocks() or .get_opcodes(), in which case you may        want to try .quick_ratio() or .real_quick_ratio() first to get an        upper bound.        >>> s = SequenceMatcher(None, "abcd", "bcde")        >>> s.ratio()        0.75        >>> s.quick_ratio()        0.75        >>> s.real_quick_ratio()        1.0        """        matches = reduce(lambda sum, triple: sum + triple[-1],                         self.get_matching_blocks(), 0)        return 2.0 * matches / (len(self.a) + len(self.b))    def quick_ratio(self):        """Return an upper bound on ratio() relatively quickly.        This isn't defined beyond that it is an upper bound on .ratio(), and        is faster to compute.        """        # viewing a and b as multisets, set matches to the cardinality        # of their intersection; this counts the number of matches        # without regard to order, so is clearly an upper bound        if self.fullbcount is None:            self.fullbcount = fullbcount = {}            for elt in self.b:                fullbcount[elt] = fullbcount.get(elt, 0) + 1        fullbcount = self.fullbcount        # avail[x] is the number of times x appears in 'b' less the        # number of times we've seen it in 'a' so far ... kinda        avail = {}        availhas, matches = avail.has_key, 0        for elt in self.a:            if availhas(elt):                numb = avail[elt]            else:                numb = fullbcount.get(elt, 0)            avail[elt] = numb - 1            if numb > 0:                matches = matches + 1        return 2.0 * matches / (len(self.a) + len(self.b))    def real_quick_ratio(self):        """Return an upper bound on ratio() very quickly.        This isn't defined beyond that it is an upper bound on .ratio(), and        is faster to compute than either .ratio() or .quick_ratio().        """        la, lb = len(self.a), len(self.b)        # can't have more matches than the number of elements in the        # shorter sequence        return 2.0 * min(la, lb) / (la + lb)def get_close_matches(word, possibilities, n=3, cutoff=0.6):    """Use SequenceMatcher to return list of the best "good enough" matches.    word is a sequence for which close matches are desired (typically a    string).    possibilities is a list of sequences against which to match word    (typically a list of strings).    Optional arg n (default 3) is the maximum number of close matches to    return.  n must be > 0.    Optional arg cutoff (default 0.6) is a float in [0, 1].  Possibilities    that don't score at least that similar to word are ignored.    The best (no more than n) matches among the possibilities are returned    in a list, sorted by similarity score, most similar first.    >>> get_close_matches("appel", ["ape", "apple", "peach", "puppy"])    ['apple', 'ape']    >>> import keyword as _keyword    >>> get_close_matches("wheel", _keyword.kwlist)    ['while']    >>> get_close_matches("apple", _keyword.kwlist)    []    >>> get_close_matches("accept", _keyword.kwlist)    ['except']    """    if not n >  0:        raise ValueError("n must be > 0: " + `n`)    if not 0.0 <= cutoff <= 1.0:        raise ValueError("cutoff must be in [0.0, 1.0]: " + `cutoff`)    result = []    s = SequenceMatcher()    s.set_seq2(word)    for x in possibilities:        s.set_seq1(x)        if s.real_quick_ratio() >= cutoff and \           s.quick_ratio() >= cutoff and \           s.ratio() >= cutoff:            result.append((s.ratio(), x))    # Sort by score.    result.sort()    # Retain only the best n.    result = result[-n:]    # Move best-scorer to head of list.    result.reverse()    # Strip scores.    return [x for score, x in result]def _count_leading(line, ch):    """    Return number of `ch` characters at the start of `line`.    Example:    >>> _count_leading('   abc', ' ')    3    """    i, n = 0, len(line)    while i < n and line[i] == ch:        i += 1    return iclass Differ:    r"""    Differ is a class for comparing sequences of lines of text, and    producing human-readable differences or deltas.  Differ uses    SequenceMatcher both to compare sequences of lines, and to compare    sequences of characters within similar (near-matching) lines.    Each line of a Differ delta begins with a two-letter code:        '- '    line unique to sequence 1        '+ '    line unique to sequence 2        '  '    line common to both sequences        '? '    line not present in either input sequence    Lines beginning with '? ' attempt to guide the eye to intraline    differences, and were not present in either input sequence.  These lines    can be confusing if the sequences contain tab characters.    Note that Differ makes no claim to produce a *minimal* diff.  To the    contrary, minimal diffs are often counter-intuitive, because they synch    up anywhere possible, sometimes accidental matches 100 pages apart.    Restricting synch points to contiguous matches preserves some notion of    locality, at the occasional cost of producing a longer diff.    Example: Comparing two texts.    First we set up the texts, sequences of individual single-line strings    ending with newlines (such sequences can also be obtained from the    `readlines()` method of file-like objects):    >>> text1 = '''  1. Beautiful is better than ugly.    ...   2. Explicit is better than implicit.    ...   3. Simple is better than complex.    ...   4. Complex is better than complicated.    ... '''.splitlines(1)    >>> len(text1)    4    >>> text1[0][-1]    '\n'    >>> text2 = '''  1. Beautiful is better than ugly.    ...   3.   Simple is better than complex.    ...   4. Complicated is better than complex.    ...   5. Flat is better than nested.    ... '''.splitlines(1)    Next we instantiate a Differ object:    >>> d = Differ()    Note that when instantiating a Differ object we may pass functions to    filter out line and character 'junk'.  See Differ.__init__ for details.    Finally, we compare the two:    >>> result = list(d.compare(text1, text2))    'result' is a list of strings, so let's pretty-print it:    >>> from pprint import pprint as _pprint    >>> _pprint(result)    ['    1. Beautiful is better than ugly.\n',     '-   2. Explicit is better than implicit.\n',     '-   3. Simple is better than complex.\n',     '+   3.   Simple is better than complex.\n',     '?     ++\n',     '-   4. Complex is better than complicated.\n',     '?            ^                     ---- ^\n',     '+   4. Complicated is better than complex.\n',     '?           ++++ ^                      ^\n',     '+   5. Flat is better than nested.\n']    As a single multi-line string it looks like this:    >>> print ''.join(result),        1. Beautiful is better than ugly.    -   2. Explicit is better than implicit.    -   3. Simple is better than complex.    +   3.   Simple is better than complex.    ?     ++    -   4. Complex is better than complicated.    ?            ^                     ---- ^    +   4. Complicated is better than complex.    ?           ++++ ^                      ^    +   5. Flat is better than nested.    Methods:
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -