////////////////////////////////////////////////////////////////////
//
// CRobotCrawl.cpp - CRobotCrawl class implementation
//
// Source: "Programming Robots, Spiders and Intelligent Agents
// in Microsoft Visual C++"
//
// Copyright (c) 1999 David Pallmann. All rights reserved.
#include "stdafx.h"
#include <afxinet.h>
#include "CRobot.h"
#include "CRobotCrawl.h"
//------------------------------------------------------------------
// Constructor
CRobotCrawl::CRobotCrawl()
{
    m_pInternet = NULL;
    m_bAllocatedInternet = false;
    m_bDomainScope = false;
    m_bRobotExclusion = true;
    m_sUserAgent = "";
}
//------------------------------------------------------------------
// Destructor
CRobotCrawl::~CRobotCrawl()
{
}
//------------------------------------------------------------------
// *************** public
// * *
// * CrawlSite *
// * *
// ***************
//
// Function: Crawls a site
//
// Inputs: sURL - The URL to crawl
// nDepth - Crawl depth (minimum value: 1)
//
// Outputs: <function_result> - True if at least one URL was
//                              successfully accessed; false if
//                              no pages were accessed
BOOL CRobotCrawl::CrawlSite(const CString& sURL, const int& nDepth)
{
    int nResult;
    CString sErrMsg;
    CString sHTML;
    DWORD dwService;
    INTERNET_PORT nPort;
    CString sObject;
    int nPos;
    BOOL bReturnValue = false;

    m_nURLs = 0;
    m_nCurrLevel = 0;
    m_sURL = sURL;

    // If no CRobotInternet instance exists, allocate one now
    if (!m_pInternet)
    {
        m_pInternet = new CRobotInternet;
        m_bAllocatedInternet = true;
    } // End if

    if (m_sUserAgent != "")
        m_pInternet->m_sUserAgent = m_sUserAgent;

    // Set m_sURL, m_sServer, and m_sDomain data members
    if (m_sURL.Left(5) != "http:")
    {
        if (m_sURL.Left(2) != "//")
            m_sURL = "//" + m_sURL;
        m_sURL = "http:" + m_sURL;
    } // End if

    AfxParseURL(m_sURL, dwService, m_sServer, sObject, nPort);

    m_sDomain = m_sServer;
    nPos = m_sDomain.Find(".");
    if (nPos != -1)
        m_sDomain = m_sDomain.Mid(nPos + 1);

    // Retrieve the site's robots.txt policy, if exclusion is enabled
    if (m_bRobotExclusion)
    {
        if (m_pInternet->httpGet(m_sServer + "/robots.txt",
                                 m_sRobotPolicy,
                                 nResult,
                                 sErrMsg))
            m_bRobotPolicyExists = true;
        else
            m_bRobotPolicyExists = false;
    } // End if

    // If the policy excludes robots from the starting URL, give up
    if (m_bRobotExclusion && m_bRobotPolicyExists)
    {
        if (m_pInternet->RobotExcluded(m_sRobotPolicy, m_sURL))
        {
            bReturnValue = false;
            goto Finished;
        } // End if
    } // End if

    // Retrieve the starting page and crawl it for links
    if (m_pInternet->httpGet(m_sURL, sHTML, nResult, sErrMsg))
    {
        CrawlPage(m_sURL, sHTML, nDepth);
        bReturnValue = true;
    } // End if
    else
    {
        bReturnValue = false;
    } // End else

Finished:
    // Free the CRobotInternet instance if this function allocated it
    if (m_bAllocatedInternet)
    {
        delete m_pInternet;
        m_pInternet = NULL;
        m_bAllocatedInternet = false;
    } // End if

    return bReturnValue;
}
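//------------------------------------------------------------------
// Usage sketch (editor's addition, not part of the original source):
// one way a caller might drive CrawlSite. The member names below all
// appear in this file, but their exact declarations and visibility
// live in CRobotCrawl.h, so treat the details as assumptions.
#if 0
void ExampleCrawl()
{
    CRobotCrawl crawl;
    crawl.m_bDomainScope    = false;          // stay on the start server
    crawl.m_bRobotExclusion = true;           // honor robots.txt
    crawl.m_sUserAgent      = "MyRobot/1.0";  // hypothetical agent name
    if (crawl.CrawlSite("www.example.com", 2))
    {
        // Dump every link recorded during the crawl
        for (int u = 0; u < crawl.m_nURLs; u++)
            TRACE("%s\n", (LPCTSTR) crawl.m_sAbsURL[u]);
    }
}
#endif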
//------------------------------------------------------------------
// *************** private
// * *
// * CrawlPage *
// * *
// ***************
// Function: Recursively scans a page's HTML for links and crawls
//           each new in-scope link it finds
//
// Inputs:   sPageURL  - URL of the page being scanned
//           sPageHTML - HTML to scan
//           nLevel    - Number of levels of links left to follow
//
// Outputs:  <function_result> - Always true
BOOL CRobotCrawl::CrawlPage(CString sPageURL,
                            CString sPageHTML,
                            int nLevel)
{
    CString sLinkURL, sAbsURL;
    CString sNewHTML;
    CString sHtml = sPageHTML;                    // original case
    CString sHTML = sPageHTML; sHTML.MakeUpper(); // upper-cased shadow
    CString sChar;
    int nResult;
    CString sErrMsg;
    BOOL bProceed = false;

    m_nCurrLevel++;

    // Scan the upper-cased copy so HREF matches in any case, while
    // substrings are cut from the original copy so extracted link
    // URLs keep their case
    int nPos = sHTML.Find("HREF");
    while (nPos != -1)
    {
        sHtml = sHtml.Mid(nPos + 4); sHtml.TrimLeft();
        sHTML = sHTML.Mid(nPos + 4); sHTML.TrimLeft();
        if (sHTML.Left(1) == "=")
        {
            sHtml = sHtml.Mid(1); sHtml.TrimLeft();
            sHTML = sHTML.Mid(1); sHTML.TrimLeft();
            if (sHTML.Left(1) == "\"")
            {
                sHtml = sHtml.Mid(1); sHtml.TrimLeft();
                sHTML = sHTML.Mid(1); sHTML.TrimLeft();
                nPos = sHTML.Find("\"");
                if (nPos != -1)
                {
                    // Extract the link and convert it to absolute form
                    sLinkURL = sHtml.Left(nPos);
                    sAbsURL = MakeAbsoluteURL(sLinkURL, sPageURL);
                    sHtml = sHtml.Mid(nPos + 1); sHtml.TrimLeft();
                    sHTML = sHTML.Mid(nPos + 1); sHTML.TrimLeft();
                    // Record the link if it looks like an in-scope
                    // HTML page we haven't already seen
                    if (IsHtmlURL(sAbsURL) &&
                        IsUrlInScope(sAbsURL) &&
                        !LinkExists(sAbsURL))
                    {
                        m_sBaseURL[m_nURLs] = sPageURL;
                        m_sLinkURL[m_nURLs] = sLinkURL;
                        m_sAbsURL[m_nURLs] = sAbsURL;
                        m_nLevel[m_nURLs] = m_nCurrLevel;
                        m_nURLs++;
                        if (nLevel > 1)
                        {
                            // Honor the robots.txt policy, if any
                            if (m_bRobotExclusion
                                && m_bRobotPolicyExists)
                            {
                                bProceed = true;
                                if (m_pInternet->RobotExcluded(
                                        m_sRobotPolicy,
                                        sAbsURL))
                                    bProceed = false;
                            } // End if
                            else
                                bProceed = true;
                            // Retrieve the linked page and recurse
                            if (bProceed
                                && m_pInternet->httpGet(sAbsURL,
                                                        sNewHTML,
                                                        nResult,
                                                        sErrMsg))
                            {
                                CrawlPage(sAbsURL,
                                          sNewHTML,
                                          nLevel - 1);
                            } // End if
                        } // End if
                    } // End if
                } // End if
            } // End if
        } // End if
        nPos = sHTML.Find("HREF");
    } // End while

    m_nCurrLevel--;
    return true;
}
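//------------------------------------------------------------------
// Illustration (editor's addition): the parallel-string scan used in
// CrawlPage, shown in isolation. sHTML is an upper-cased shadow of
// sHtml; searches run against the shadow so href, HREF, and Href all
// match, while substrings are cut from the original so URLs keep
// their case. A minimal sketch that assumes links of the simple form
// HREF="..." with no intervening whitespace:
#if 0
void ExtractHrefs(CString sHtml, CStringArray& asLinks)
{
    CString sHTML = sHtml;
    sHTML.MakeUpper();                       // shadow copy for searching
    int nPos;
    while ((nPos = sHTML.Find("HREF=\"")) != -1)
    {
        sHtml = sHtml.Mid(nPos + 6);         // skip past HREF="
        sHTML = sHTML.Mid(nPos + 6);
        nPos = sHTML.Find("\"");             // find the closing quote
        if (nPos == -1)
            break;
        asLinks.Add(sHtml.Left(nPos));       // original-case URL
        sHtml = sHtml.Mid(nPos + 1);
        sHTML = sHTML.Mid(nPos + 1);
    }
}
#endif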
//------------------------------------------------------------------
// **************** private
// * *
// * LinkExists *
// * *
// ****************
// Function: Returns true if specified URL is already stored
//
// Inputs: sURL - Link URL to check
//
// Outputs: <function_result> - True if URL already on file
BOOL CRobotCrawl::LinkExists(const CString& sURL)
{
    for (int u = 0; u < m_nURLs; u++)
    {
        if (m_sAbsURL[u] == sURL)
            return true;
    } // End for
    return false;
}
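//------------------------------------------------------------------
// Design note (editor's addition): the linear scan above costs O(n)
// per lookup, so recording n URLs costs O(n^2) comparisons over a
// whole crawl. For larger crawls a set gives O(log n) lookups. A
// sketch, assuming a hypothetical std::set<CString> member named
// m_setURLs were added to the class and populated in CrawlPage
// alongside the m_sAbsURL array:
#if 0
#include <set>
BOOL CRobotCrawl::LinkExists(const CString& sURL)
{
    // Membership test against the set of URLs seen so far
    return m_setURLs.find(sURL) != m_setURLs.end();
}
// ...and in CrawlPage, where each new link is recorded:
//     m_setURLs.insert(sAbsURL);
#endif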
//------------------------------------------------------------------
// ********************* private
// * *
// * MakeAbsoluteURL *
// * *
// *********************
// Function: Returns the absolute form of a URL
//
// Inputs:   sURL     - Link URL to convert
//           sBaseURL - URL of the page the link came from
//
// Outputs:  <function_result> - The absolute form of the URL
CString CRobotCrawl::MakeAbsoluteURL(const CString& sURL,
                                     const CString& sBaseURL)
{
    int nPos;
    CString sAbsURL = sURL;
    CString sTempBaseURL = sBaseURL;

    // If URL begins with http:, consider it absolute
    if (sAbsURL.Left(5) == "http:")
        return sAbsURL;

    // If URL has a colon within the first eight characters (for
    // example mailto: or ftp:), consider it absolute
    nPos = sAbsURL.Find(":");
    if (nPos != -1 && nPos < 8)
        return sAbsURL;

    // If URL contains //, consider it absolute
    if (sAbsURL.Find("//") != -1)
        return sAbsURL;

    // If URL begins with a slash, prepend just the server part
    // of the base URL
    if (sAbsURL.Left(1) == "/")
    {
        nPos = sTempBaseURL.Find("//");
        if (nPos != -1)
            sTempBaseURL = sTempBaseURL.Mid(nPos + 2);
        nPos = sTempBaseURL.Find("/");
        if (nPos != -1)
            sAbsURL = sTempBaseURL.Left(nPos) + sAbsURL;
        else
            sAbsURL = sTempBaseURL + sAbsURL;
    } // End if
    // If URL does not begin with a slash, prepend the base URL up
    // to its last slash (the page's directory), so a relative link
    // resolves beside the page rather than below it
    else
    {
        nPos = sTempBaseURL.ReverseFind('/');
        if (nPos > 7) // a slash beyond the two in "http://"
            sTempBaseURL = sTempBaseURL.Left(nPos);
        sAbsURL = sTempBaseURL + "/" + sAbsURL;
    } // End else

    if (sAbsURL.Find(":") == -1)
        sAbsURL = "http://" + sAbsURL;

    return sAbsURL;
}
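//------------------------------------------------------------------
// Worked examples (editor's addition) of the rules above, using a
// base URL of http://www.example.com/docs/index.html:
//
//   "http://other.com/a"  -> returned as-is (absolute)
//   "mailto:bob@x.com"    -> colon within first 8 chars, as-is
//   "//cdn.example.com/a" -> contains //, returned as-is
//   "/images/logo.gif"    -> http://www.example.com/images/logo.gif
//                            (leading slash: server part prepended)
//   "page2.html"          -> http://www.example.com/docs/page2.html
//                            (resolved against the page's directory)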
//------------------------------------------------------------------
// *************** private
// * *
// * IsHtmlURL *
// * *
// ***************
// Function: Returns true if a URL leads to an HTML page
//
// Inputs: sAbsURL - Absolute URL to check
//
// Outputs: <function_result> - True if URL links to an HTML page
BOOL CRobotCrawl::IsHtmlURL(const CString& sAbsURL)
{
    CString sURL = sAbsURL;
    sURL.MakeUpper();
    int nPos;

    // If protocol is not http:, return false (compare against the
    // upper-cased copy so HTTP: matches too)
    if (sURL.Left(5) != "HTTP:")
        return false;

    // If there is no slash in the URL, return true
    sURL = sURL.Mid(7); // Bypass http://
    nPos = sURL.Find("/");
    if (nPos == -1)
        return true;
    sURL = sURL.Mid(nPos);

    // If there is no file.ext in the URL, return true
    if (sURL.Find(".") == -1)
        return true;

    // If extension is .htm, .html, or .asp, return true
    if (sURL.Find(".HTM") != -1)
        return true;
    if (sURL.Find(".ASP") != -1)
        return true;

    // If URL contains .exe? or .dll?, return true
    if (sURL.Find(".EXE?") != -1)
        return true;
    if (sURL.Find(".DLL?") != -1)
        return true;

    return false;
}
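//------------------------------------------------------------------
// Examples (editor's addition) of the classification above:
//
//   http://www.example.com            -> true  (no path)
//   http://www.example.com/docs       -> true  (no file.ext)
//   http://www.example.com/a.html     -> true  (.HTM also matches .HTML)
//   http://www.example.com/q.asp?id=1 -> true
//   http://www.example.com/logo.gif   -> false (extension not listed)
//   ftp://ftp.example.com/readme.htm  -> false (not http:)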
//------------------------------------------------------------------
// ****************** private
// * *
// * IsUrlInScope *
// * *
// ******************
// Function: Returns true if a URL is in scope
//
// Inputs:   sAbsURL        - Absolute URL to check
//           m_sServer      - Server of the starting URL
//           m_sDomain      - Domain of the starting URL
//           m_bDomainScope - True = domain scope,
//                            False = server scope
//
// Outputs: <function_result> - True if URL is in scope
BOOL CRobotCrawl::IsUrlInScope(const CString& sAbsURL)
{
    CString sSubstring;
    CString sURL = sAbsURL;
    sURL.MakeUpper();

    if (m_bDomainScope)
        sSubstring = m_sDomain;
    else
        sSubstring = m_sServer;
    sSubstring.MakeUpper();

    if (sURL.Find(sSubstring) != -1)
        return true;
    else
        return false;
}
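//------------------------------------------------------------------
// Design note (editor's addition): the substring test above also
// matches URLs that merely mention the server or domain somewhere in
// their path or query, e.g. http://other.com/?ref=www.example.com.
// A stricter sketch that compares only the host part, assuming the
// same data members:
#if 0
BOOL CRobotCrawl::IsUrlInScope(const CString& sAbsURL)
{
    DWORD dwService;
    CString sServer, sObject;
    INTERNET_PORT nPort;
    // Parse out the host; reject anything AfxParseURL cannot handle
    if (!AfxParseURL(sAbsURL, dwService, sServer, sObject, nPort))
        return false;
    sServer.MakeUpper();
    CString sScope = m_bDomainScope ? m_sDomain : m_sServer;
    sScope.MakeUpper();
    if (m_bDomainScope)
        // Domain scope: the host must end with the domain
        return sServer.Right(sScope.GetLength()) == sScope;
    // Server scope: the host must match exactly
    return sServer == sScope;
}
#endif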