////////////////////////////////////////////////////////////////////
//
// CRobotCrawl.h - CRobotCrawl class declarations
//
// Source: "Programming Robots, Spiders, and Intelligent Agents
// in Microsoft Visual C++"
//
// Copyright (c) 1999 David Pallmann. All rights reserved.
#include "CRobotInternet.h"
#define MAX_URLS 5000
class CRobotCrawl
{
public:
    CRobotInternet *m_pInternet;      // CRobotInternet pointer
    BOOL     m_bRobotExclusion;       // True = check/honor robot
                                      //   exclusion standard
    BOOL     m_bDomainScope;          // True = extend scope to domain;
                                      //   False = server only
    CString  m_sUserAgent;            // User agent name (override)
    int      m_nURLs;                 // Number of URLs discovered
                                      //   while crawling
    CString  m_sBaseURL[MAX_URLS];    // Base URLs of link URLs
    CString  m_sLinkURL[MAX_URLS];    // Link URLs encountered
    CString  m_sAbsURL[MAX_URLS];     // Absolute form of URLs
                                      //   in LinkURL[]
    int      m_nLevel[MAX_URLS];      // Crawl level

private:
    BOOL     m_bAllocatedInternet;    // True = this class created a
                                      //   CRobotInternet object
    CString  m_sURL;                  // Root URL of crawl
    CString  m_sServer;               // Root server of crawl
    CString  m_sDomain;               // Root domain of crawl
    CString  m_sRobotPolicy;          // Contents of site's
                                      //   robots.txt file
    BOOL     m_bRobotPolicyExists;    // True = robots.txt was found
    int      m_nCurrLevel;            // Current crawl level

    //---- Public functions ----
public:
    CRobotCrawl();
    ~CRobotCrawl();
    BOOL CrawlSite(const CString& sURL, const int& nDepth);

    //---- Private functions ----
private:
    BOOL CrawlPage(CString sPageURL, CString sPageHTML, int nLevel);
    BOOL LinkExists(const CString& sURL);
    CString MakeAbsoluteURL(const CString& sURL,
                            const CString& sBaseURL);
    BOOL IsHtmlURL(const CString& sAbsURL);
    BOOL IsUrlInScope(const CString& sAbsURL);
};
////////////////////////////////////////////////////////////////////
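
The class above exposes everything a caller needs: set the public option members, call CrawlSite(), and then read the results back out of m_nURLs and the parallel m_sAbsURL[]/m_nLevel[] arrays. Below is a minimal usage sketch, not from the book, assuming an MFC build where afx.h provides CString and the CRobotInternet class is on the include path; the start URL, depth, and user-agent string are placeholders. The m_bAllocatedInternet flag suggests CRobotCrawl can create its own CRobotInternet object when m_pInternet is not supplied, so the sketch leaves m_pInternet alone; if a given build does not do that, m_pInternet would need to be assigned first.

#include <afx.h>          // MFC: CString, BOOL, TRUE/FALSE
#include <tchar.h>        // _tprintf, _T()
#include <stdio.h>
#include "CRobotCrawl.h"

void CrawlExample()
{
    CRobotCrawl crawl;

    crawl.m_bRobotExclusion = TRUE;             // honor robots.txt
    crawl.m_bDomainScope    = FALSE;            // stay on the start server
    crawl.m_sUserAgent      = _T("ExampleBot"); // placeholder agent name

    // Crawl the (placeholder) site to a depth of 2 levels.
    if (crawl.CrawlSite(CString(_T("http://www.example.com/")), 2))
    {
        // CrawlSite fills m_nURLs and the parallel result arrays.
        for (int i = 0; i < crawl.m_nURLs; i++)
        {
            _tprintf(_T("Level %d: %s\n"),
                     crawl.m_nLevel[i],
                     (LPCTSTR)crawl.m_sAbsURL[i]);
        }
    }
}

The nDepth argument is taken here to mean the maximum crawl level, matching the m_nLevel[] member; the header itself does not say whether level counting starts at 0 or 1.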