📄 index.vb

📁 an lcs algorithm for finding common string
💻 VB
字号:
Namespace We.See.Diff

    ''' <summary>
    ''' We See Diff comparison can be performed on IComparable or an indexed list
    ''' The indexed list will convert 2 lists or arbitrary data to a combination of
    ''' list of integers and one list of the data
    ''' 
    ''' Diff can then be perfomed on the list of integer, speeding up the comparison
    ''' process. Mapping the resulting integers back to the Lookup data transforms the
    ''' result back to LCS information for the original datatype
    ''' 
    ''' This class creates the Index
    ''' 
    ''' Because the start and the end sequence of both arrays might be equal, and the
    ''' comparison time is exponential with the length of both arrays, the index creation
    ''' determines overlap in the start and the end of the sequence
    ''' 
    ''' e.g.
    ''' abcdefgh
    ''' abczdeydegh
    ''' 
    ''' the start and end of the strings should be compared first, the (dynamic) LCS will only
    ''' be determined for cdef and zdey (the "blocks") as the start part and end part will naturally be included
    ''' 
    ''' it still indexes values in starting and ending as there might be duplicates of those values in the (internal) blocks
    ''' only the inner difference has to be compared for the determination of LCS
    ''' 
    ''' the issue of duplicates
    ''' 
    ''' when there are duplicates in one of the 2 lists
    ''' abbc
    ''' and the second list is
    ''' abc
    ''' 
    ''' there is no way of determining whether the 1st of the second b is deleted
    ''' 
    ''' how this effects the rest of the application is currently undetermined
    ''' this might need to be solved in the future, possibly by making two ways to create the indexes;
    ''' treating duplicates as identical or treating duplicates as nonidentical
    ''' abbc
    ''' would creates 0112 in the first scenario and 0123 in the second
    ''' 
    ''' currently, for indexing purposes, duplicates are considered identical
    ''' 
    ''' This file Copyright 2008 by Wouter Steenbergen
    ''' September 25 2008
    ''' </summary>
    ''' <typeparam name="T"></typeparam>
    ''' <remarks></remarks>

    Public Structure Index(Of T)

        Public List1 As T()
        Public List2 As T()

        Public UniqueValues As T()

        Public Index1 As Integer()
        Public Index2 As Integer()

        Public StartSize As Integer
        Public EndSize As Integer

        Public Index1High As Integer ' Any id's appearing in Data2 > than this value does not exist in Data1

        ' possible performance inprovement:
        ' currenlty, the IndexOf function is used on the uniquevalues array to determine if objects are present
        ' internally, this calls the Equals function of the tested object with all the elements in the 
        ' array to determine equality. alternatively, a hashtable could be used to speed up this search
        ' (hashtable creates a hash for the objects and then traverses a tree to find the bucket where it should be,
        ' greatly decreasing the equals calls)

        Public ReadOnly Property Block1Length() As Integer
            Get
                Return List1.Length - StartSize - EndSize
            End Get
        End Property

        Public ReadOnly Property Block2Length() As Integer
            Get
                Return List2.Length - StartSize - EndSize
            End Get
        End Property

        Public ReadOnly Property ListsEqaul() As Boolean
            Get
                Return List1.Length = StartSize AndAlso List1.Length = List2.Length
            End Get
        End Property

        Public Shared Function Build(ByVal list1 As T(), ByVal list2 As T()) As Index(Of T)

            Dim result As New Index(Of T)

            Dim length1 As Integer = list1.Length
            Dim length2 As Integer = list2.Length

            result.List1 = list1
            result.List2 = list2
            Dim index1 As Integer() = New Integer(length1 - 1) {}
            Dim index2 As Integer() = New Integer(length2 - 1) {}

            Dim uniqueValues As T() = New T(length1 + length2 - 1) {} ' the maximum number of unique values
            Dim uniqueValueCount As Integer
            Dim id As Integer

            Dim startSize As Integer
            Dim endSize As Integer

            ' first determine the startSize(the number of overlapping values of both data1 and data2 at the start)
            ' index the found values while we're at it
            For i As Integer = 0 To Math.Min(length1, length2) - 1
                If Not list1(i).Equals(list2(i)) Then
                    ' this is the first element thats different
                    startSize = i
                    Exit For
                Else
                    ' there might be duplicates in the overlapping region, we still need to index the elements
                    id = Array.IndexOf(uniqueValues, list1(i), 0, uniqueValueCount)
                    If id = -1 Then
                        ' value not found in the index, add it
                        id = uniqueValueCount
                        uniqueValues(id) = list1(i)
                        uniqueValueCount += 1
                    End If
                    index1(i) = id
                    index2(i) = id
                End If
            Next

            result.StartSize = startSize

            ' if all elements in both arrays are the same then we're done
            ' the following evaluation is the inverse of the one perfomed on the
            ' ListsEqual property
            If length1 <> length2 Or length1 <> startSize Then

                ' next determine endSize
                ' evaluate in reverse from end to startSize
                Dim delta As Integer = length1 - length2 ' note that delta might be < 0
                ' the +1 is because we've already tested the first different element above,
                ' we can default to this being the endsize ( the loop will then continue completely, exit for is never reached)
                endSize = length1 - Math.Max(startSize, startSize + delta)
                For i As Integer = length1 - 1 To length1 - endSize Step -1
                    If Not list1(i).Equals(list2(i - delta)) Then
                        ' this is the first element thats different
                        endSize = length1 - i - 1
                        Exit For
                    Else
                        ' there might be duplicates in the overlapping region, we still need to index the elements
                        id = Array.IndexOf(uniqueValues, list1(i), 0, uniqueValueCount)
                        If id = -1 Then
                            ' value not found in the index, add it
                            id = uniqueValueCount
                            uniqueValues(id) = list1(i)
                            uniqueValueCount += 1
                        End If
                        index1(i) = id
                        index2(i - delta) = id
                    End If
                Next

                result.EndSize = endSize

                ' now we have the following information:
                ' data1=abcdefgh
                ' data2=abczdeydegh
                ' length1=8
                ' length2=11
                ' delta=3
                ' startsize=3
                ' endsize=2
                ' index1=01200043
                ' index2=01200000043
                ' uniquevalues=abchg

                If startSize + endSize > Math.Min(length1, length2) Then Throw New Exception("the total length of overlap can never be greater than the length of the smallest array")

                ' Index the block(btw start and end) values in Data1
                For i As Integer = startSize To length1 - endSize - 1
                    id = Array.IndexOf(uniqueValues, list1(i), 0, uniqueValueCount)
                    If id = -1 Then
                        ' value not found in the index, add it
                        id = uniqueValueCount
                        uniqueValues(id) = list1(i)
                        uniqueValueCount += 1
                    End If
                    index1(i) = id
                Next

                ' Index1High can be used to determine which values occur in Data2 but not in Data1
                ' (when a index2 value > this value, it has been added after the data1 indexation is performed)
                result.Index1High = uniqueValueCount - 1

                ' Index the block(btw start and end) values in Data2
                For i As Integer = startSize To length2 - endSize - 1
                    id = Array.IndexOf(uniqueValues, list2(i), 0, uniqueValueCount)
                    If id = -1 Then
                        ' value not found in the index, add it
                        id = uniqueValueCount
                        uniqueValues(id) = list2(i)
                        uniqueValueCount += 1
                    End If
                    index2(i) = id
                Next

                ' values is the list of unique references to data found in Data1 and Data2
                ' index1 contains the indexes in the value list to those values
                ' index2 contains the indexes in the value list to the data2 values
                ' indexes in data2 > Index1High naturally do not occus in the Data1 list

            End If

            ' the result does not have to be loaded when lists are equal. the reason behind this is
            ' a performance optimization; there is no need to test further when the lists are equal
            ' this is kind of counterintuitive should this class be re-used in different scenario's
            ' for this reason, the result is loaded regardless, valueing coding clarity over performance
            ' (it's only going to make a minimal difference anyway)

            result.Index1 = index1
            result.Index2 = index2
            Array.Resize(uniqueValues, uniqueValueCount)
            result.UniqueValues = uniqueValues

            Return result
        End Function

    End Structure

End Namespace
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -