📄 frmwebrobot.frm

📁 vb写的网络蜘蛛程序
💻 FRM
📖 第 1 页 / 共 3 页
字号:
上一页 1 23
            txtOutput.Text = ""
            
            Set Analyst = New clsDocumentAnalyst
            Analyst.Init txtSearchString.Text
            If RootWebNode.AlreadyContains(txtURL.Text, ExistingNode) Then
                Set Diplomat = DispatchNewDiplomat(ExistingNode.URL)
                If Not (Diplomat Is Nothing) Then
                    SearchWeb ExistingNode, SearchDepth, Diplomat
                  End If
              Else
                Set NewWebNode = New clsHTMLPageResourceNode
                NewWebNode.URL = txtURL.Text
                NewWebNode.Path = txtURL.Text
                Set NewWebNode.Parent = RootWebNode
                Set Diplomat = DispatchNewDiplomat(NewWebNode.URL)
                If Not (Diplomat Is Nothing) Then
                    RootWebNode.PageLinks.Add NewWebNode, NewWebNode.URL
                    SearchWeb NewWebNode, SearchDepth, Diplomat
                  End If
              End If
            
            txtURL.Enabled = True
            mnuPerformSearch.Enabled = True
            mnuStopSearch.Enabled = False
            tbToolbar.Buttons("btnSearch").Enabled = True
            tbToolbar.Buttons("btnStop").Enabled = False
            tmrBusy.Enabled = False
            Me.MousePointer = vbDefault
            bPerformingSearch = False
            bPerformingRequest = False
          End Select
      End If

    End Sub

Private Function ValidSearchDepth(SearchDepth As Integer)
    
    ValidSearchDepth = True
    If IsNumeric(txtSearchDepth) Then
        SearchDepth = Val(txtSearchDepth.Text)
        If Not (SearchDepth = Abs(Int(SearchDepth)) And _
            (SearchDepth > 0)) Then
            MsgBox "Please enter an integer value for Search Depth.", _
                   vbOKOnly + vbCritical, _
                   "Invalid Search Depth"
            ValidSearchDepth = False
          End If
      Else
        MsgBox "Please enter an integer value for Search Depth.", _
               vbOKOnly + vbCritical, _
               "Invalid Search Depth"
        ValidSearchDepth = False
      End If

    End Function

Private Sub udSearchDepth_DownClick()
    Dim SearchDepth As Integer
    
    If ValidSearchDepth(SearchDepth) Then
        SearchDepth = MaxLong(1, SearchDepth - 1)
        txtSearchDepth.Text = Str(SearchDepth)
      End If
    End Sub


Private Sub udSearchDepth_UpClick()
    Dim SearchDepth As Integer
    
    If ValidSearchDepth(SearchDepth) Then
        SearchDepth = MinLong(SearchDepth + 1, 999)
        txtSearchDepth.Text = Str(SearchDepth)
      End If

    End Sub



Private Sub SearchWeb(CurrentWebNode As clsHTMLPageResourceNode, _
                      ByVal LevelsToSearch As Integer, _
                      ByVal Diplomat As clsDiplomat)

    Dim WebNode As clsHTMLPageResourceNode
    Dim NodeIcon As Integer
    Dim TempText As String
    
    ' Evaluate Access Restrictions
    If Diplomat.NewServerURL(CurrentWebNode.URL) Then
        Set Diplomat = DispatchNewDiplomat(CurrentWebNode.URL)
      End If
    If Diplomat.Rejects(CurrentWebNode.URL) Then
        AddNodeToTreeView CurrentWebNode, icoERROR
        Exit Sub
      End If
      
    If AbleToGetWebDocument(CurrentWebNode.URL) Then
        TimeOfLastTransfer = Now
        CurrentWebNode.MatchesSearch = Analyst.DocumentMatches(txtOutput.Text)
        If CurrentWebNode.MatchesSearch Then
            NodeIcon = icoHIT
          ElseIf InStr(CurrentWebNode.Path, "://") Then
            NodeIcon = icoSERVER
          Else
            NodeIcon = icoDOCUMENT
          End If
        AddNodeToTreeView CurrentWebNode, NodeIcon
        If (LevelsToSearch > 1) Then
            TempText = txtOutput.Text
            ExtractLinksFromDocument TempText, "href", CurrentWebNode
            ExtractLinksFromDocument TempText, "src", CurrentWebNode
            For Each WebNode In CurrentWebNode.PageElements
                DoEvents
                If bSearchCancelled Then Exit For
                AddNodeToTreeView WebNode, icoIMAGE
                Next WebNode
            For Each WebNode In CurrentWebNode.PageLinks
                DoEvents
                If bSearchCancelled Then Exit For
                Do Until DateDiff("s", TimeOfLastTransfer, Now) > 5
                    DoEvents
                    Loop
                SearchWeb WebNode, LevelsToSearch - 1, Diplomat
                Debug.Print "Call SearchWeb "; WebNode.URL
                Next WebNode
            If bSearchCancelled Then MsgBox "Search Cancelled"
          End If
      Else
        AddNodeToTreeView CurrentWebNode, icoERROR
      End If
    End Sub



Public Sub AddNodeToTreeView(CurrentWebNode As clsHTMLPageResourceNode, _
                             TreeNodeIcon As Integer)

                             
    Dim CurrentTreeViewNode As Node
    Dim WebNode As clsHTMLPageResourceNode
    
    On Error Resume Next
    Set CurrentTreeViewNode = tvURLTreeView.Nodes(CurrentWebNode.URL)
    If CurrentTreeViewNode Is Nothing Then
        If (CurrentWebNode.Parent Is Nothing) Then
            Set CurrentTreeViewNode = tvURLTreeView.Nodes.Add _
                (, , _
                 CurrentWebNode.URL, CurrentWebNode.Path, _
                 TreeNodeIcon, TreeNodeIcon)
          Else
            Set CurrentTreeViewNode = tvURLTreeView.Nodes.Add _
                (CurrentWebNode.Parent.URL, tvwChild, _
                 CurrentWebNode.URL, CurrentWebNode.Path, _
                 TreeNodeIcon, TreeNodeIcon)
          End If
      Else
        tvURLTreeView.Nodes(CurrentWebNode.URL).Image = TreeNodeIcon
        tvURLTreeView.Nodes(CurrentWebNode.URL).SelectedImage = TreeNodeIcon
      End If
    If Not (CurrentTreeViewNode Is Nothing) Then
        Set ActiveTreeNode = CurrentTreeViewNode
        CurrentTreeViewNode.EnsureVisible
      End If
    
    End Sub

Public Function DispatchNewDiplomat(ByVal URL As String) As clsDiplomat
    Dim AgentFileReceived As Boolean
    Dim Diplomat As clsDiplomat
    
    URL = GetServerURLFrom(URL)
    AgentFileReceived = AbleToGetWebDocument(URL & "/robots.txt")
    If HTTPSemaphore.Cancelled Then
        Set DispatchNewDiplomat = Nothing
      Else
        Set Diplomat = New clsDiplomat
        Diplomat.Init URL, txtOutput.Text
        Set DispatchNewDiplomat = Diplomat
      End If
    End Function

Public Function AbleToGetWebDocument(URL As String) As Boolean
    
    ' Initialize Semaphore Flags.
    HTTPSemaphore.Error = False
    HTTPSemaphore.Timeout = False
    HTTPSemaphore.Connected = False
    HTTPSemaphore.ConnectionFailed = False
    HTTPSemaphore.Complete = False
    HTTPSemaphore.Cancelled = False
    
    ' Request HTML document retrieval.
    txtOutput.Text = ""
    bPerformingRequest = True
    
    Inet1.Execute URL
    
    ' Wait until asynchronous transfer terminates:
    Do
        DoEvents
        Debug.Print "Waiting for transfer..."
        Loop Until (HTTPSemaphore.Complete Or _
                    HTTPSemaphore.Cancelled Or _
                    bSearchCancelled)
                    
    ' I also monitored for a ConnectionFailed event:
    
    '                HTTPSemaphore.ConnectionFailed Or _
    '
    ' but this seemed to fire even when the document transfer
    ' was able to continue and complete successfully,
    ' which caused the program to make the next request
    ' before the previous request had finished, resulting
    ' in an error.
    
    ' The correct way to detect activity on the Inet control
    ' would be to watch the StillExecuting property. But, at
    ' this writing, this property never indicates True, which
    ' prevents me from using it:
    'Do While Inet1.StillExecuting
    '    DoEvents
    '    Loop
    
    AbleToGetWebDocument = HTTPSemaphore.Complete
    End Function

Public Function AbleToGetWebDocumentHeaders(URL As String) As Boolean
    ' Initialize Semaphore Flags.
    HTTPSemaphore.Error = False
    HTTPSemaphore.Timeout = False
    HTTPSemaphore.Connected = False
    HTTPSemaphore.ConnectionFailed = False
    HTTPSemaphore.Complete = False
    HTTPSemaphore.Cancelled = False
    
    ' Request HTML document header retrieval.
    txtOutput.Text = ""
    bPerformingRequest = True
    bRequestingHeaders = True
    
    Inet1.Execute URL, "HEAD"
    
    ' Wait until asynchronous transfer terminates:
    Do
        DoEvents
        Loop Until (HTTPSemaphore.Complete Or _
                    HTTPSemaphore.Cancelled Or _
                    bSearchCancelled)
                    
    bRequestingHeaders = False
    AbleToGetWebDocumentHeaders = HTTPSemaphore.Complete

    End Function

Private Sub ExtractLinksFromDocument(Document As String, _
                                     TagParamName As String, _
                                     CurrentWebNode As clsHTMLPageResourceNode)
    Dim HostCollection As Collection
    Dim ParentURL As String
    Dim ServerURL As String
    Dim TargetURL As String
    Dim URLType As String
    Dim StringPos As Long
    Dim CurrentStartPos As Long
    Dim WebNode As clsHTMLPageResourceNode
    Dim ExistingNode As clsHTMLPageResourceNode
    
    Document = LCase(Document)
    CurrentWebNode.URL = LCase(CurrentWebNode.URL)
    ' Extract ParentURL from Document URL.
    ParentURL = GetParentURLFrom(CurrentWebNode.URL)
    ' Extract ServerURL from Document URL.
    ServerURL = GetServerURLFrom(CurrentWebNode.URL)
    
    CurrentStartPos = 1
    Do
        TargetURL = GetNextTargetURLFromDocument(Document, CurrentStartPos, TagParamName)
        If TargetURL <> "" Then
            On Error Resume Next
            StringPos = InStr(TargetURL, ":")
            If StringPos > 0 Then
                URLType = Left(TargetURL, StringPos - 1)
                If URLType = "http" Then
                    Set WebNode = New clsHTMLPageResourceNode
                    Set WebNode.Parent = CurrentWebNode
                    WebNode.URL = TargetURL
                    WebNode.Path = TargetURL
                    Set HostCollection = SelectHostCollection(WebNode.URL, CurrentWebNode)
                    If Not (HostCollection Is Nothing) Then
                        If Not WebNode.Root.AlreadyContains(WebNode.URL, ExistingNode) Then
                            HostCollection.Add WebNode, WebNode.URL
                          End If
                      End If
                  End If
              ElseIf Left(TargetURL, 1) = "/" Then
                Set WebNode = New clsHTMLPageResourceNode
                Set WebNode.Parent = CurrentWebNode
                WebNode.URL = ServerURL & TargetURL
                WebNode.Path = TargetURL
                Set HostCollection = SelectHostCollection(WebNode.URL, CurrentWebNode)
                If Not (HostCollection Is Nothing) Then
                    If Not WebNode.Root.AlreadyContains(WebNode.URL, ExistingNode) Then
                        HostCollection.Add WebNode, WebNode.URL
                      End If
                  End If
              Else
                Set WebNode = New clsHTMLPageResourceNode
                Set WebNode.Parent = CurrentWebNode
                WebNode.URL = ParentURL & TargetURL
                WebNode.Path = TargetURL
                Set HostCollection = SelectHostCollection(WebNode.URL, CurrentWebNode)
                If Not (HostCollection Is Nothing) Then
                    If Not WebNode.Root.AlreadyContains(WebNode.URL, ExistingNode) Then
                        HostCollection.Add WebNode, WebNode.URL
                      End If
                  End If
              End If
            Debug.Print "Collection Count: ", HostCollection.Count
          Else
            ' No URL found.
          End If 'TargetURL <> ""
        Loop While (CurrentStartPos > 0)
    End Sub

Private Function GetNextTargetURLFromDocument(Document As String, _
                                             CurrentStartPos As Long, _
                                             TagParamName As String) As String
    Dim TempURL As String
    Dim EndOfCurrentTagPos As Long
    Dim CurrentEndPos As Long
    
    TempURL = ""
    CurrentStartPos = InStr(CurrentStartPos, Document, TagParamName)
    If CurrentStartPos > 0 Then
        EndOfCurrentTagPos = InStr(CurrentStartPos, Document, ">")
        CurrentStartPos = InStr(CurrentStartPos, Document, """")
        If CurrentStartPos > 0 Then
            CurrentStartPos = CurrentStartPos + 1
            CurrentEndPos = InStr(CurrentStartPos, Document, """")
            If (CurrentEndPos > CurrentStartPos) And _
               (CurrentEndPos < EndOfCurrentTagPos) Then
                TempURL = Mid(Document, CurrentStartPos, _
                              CurrentEndPos - CurrentStartPos)
              Else
                CurrentStartPos = EndOfCurrentTagPos
              End If
          End If
      End If
    GetNextTargetURLFromDocument = TempURL
            
    End Function

Public Function SelectHostCollection(URL As String, _
                                     WebNode As clsHTMLPageResourceNode) As Collection
    Dim sHeaderValue As String
    
    If AbleToGetWebDocumentHeaders(URL) Then
        sHeaderValue = Inet1.GetHeader("Content-type")
        If (InStr(1, sHeaderValue, "text/", 1) > 0) Then
            Set SelectHostCollection = WebNode.PageLinks
          Else
            Set SelectHostCollection = WebNode.PageElements
          End If
      Else
        Set SelectHostCollection = Nothing
      End If
    Debug.Print URL
    Debug.Print sHeaderValue
    End Function
上一页 1 23
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -