⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cowebspider.cpp

📁 基于com的网络爬虫程序
💻 CPP
📖 第 1 页 / 共 2 页
字号:
//
// This file is part of UniWebSpider Project.
//
// UniWebSpider is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// UniWebSpider is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with UniWebSpider; if not, write to the Free Software Foundation, Inc.,
// 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//

// WRITTEN BY	: ValGarn

// CoWebSpider.cpp : Implementation of CCoWebSpider
#include "stdafx.h"
#include "WebSpider.h"
#include "CoWebSpider.h"
#include "DownloadInfo.h"
#include "AFXPRIV.H"
#include "MainThreadInfo.h"
#include "SearchInfo.h"
#include "FillURLsInfo.h"

#include "math.h"

/////////////////////////////////////////////////////////////////////////////
// CCoWebSpider

// Worker-thread entry point: applies every search regular expression to every
// page source that has not been searched yet and stores the accepted matches.
//
// pParam - pointer to a CSearchInfo; the function reads m_sConnectionString and
//          m_nPause from it and clears m_bSearchStarted before it finishes.
// Returns 0 (the thread is ended through AfxEndThread(0) before the return).
//
// Outline:
//   * mp_GetSearchRegExps and mp_GetNotSearchedSources supply the patterns
//     and the page sources;
//   * for each source/pattern pair the first match is taken repeatedly, the
//     text is cut behind the extracted value and the pattern is run again;
//   * each extracted value is checked against the rows of mp_GetSearchExcepts
//     and, when it passes, written with mp_InsertSearchResult;
//   * mp_MarkAsSearched is issued after each pattern has been processed.
//
// Any COM, MFC or other exception ends the run quietly; cleanup still happens.
UINT Search( LPVOID pParam )
{
	CoInitialize(NULL);
	CSearchInfo* pSearchInfo = (CSearchInfo*)pParam;
	IRegExp2Ptr pRegExp;
	pRegExp.CreateInstance(__uuidof(RegExp));
	CADO* pDatabase = 0;
	try
	{
		pDatabase = new CADO();
		pDatabase->OpenConnection(_bstr_t(pSearchInfo->m_sConnectionString),_bstr_t(""),_bstr_t(""));

		_RecordsetPtr RegExps;
		if(	pDatabase->Execute(_bstr_t("mp_GetSearchRegExps"), &RegExps))
		{
			Sleep(pSearchInfo->m_nPause);
			_RecordsetPtr HTMLs;
			if(	pDatabase->Execute(_bstr_t("mp_GetNotSearchedSources"), &HTMLs))
			for(; !HTMLs->adoEOF; HTMLs->MoveNext() )
			{
				RegExps->MoveFirst();
				for(; !RegExps->adoEOF; RegExps->MoveNext() )
				{
					_variant_t vPattern = RegExps->GetCollect(_variant_t("sRegExp"));
					if(vPattern.vt == VT_NULL)
						continue;

					char t[20];
					const CString sRegExpId = itoa(RegExps->GetCollect(_variant_t("nId")).intVal, t, 10);
					const CString sSourceId = itoa(HTMLs->GetCollect(_variant_t("nId")).intVal, t, 10);
					const VARIANT_BOOL bIgnoreCase = RegExps->GetCollect(_variant_t("bIgnoreCase")).boolVal;

					// The variant keeps ownership of its BSTR; put_Pattern only
					// reads it, so no extra SysAllocString copy is made.
					pRegExp->put_IgnoreCase(bIgnoreCase);
					pRegExp->put_Multiline(VARIANT_TRUE);
					pRegExp->put_Pattern(vPattern.bstrVal);

					// _bstr_t frees the text automatically, also when an
					// exception leaves the loop.
					_variant_t vSource = HTMLs->GetCollect(_variant_t("sResult"));
					_bstr_t sHTML;
					if( vSource.vt == VT_BSTR )
						sHTML = vSource.bstrVal;

					IMatchCollection2Ptr matches;
					for( matches = pRegExp->Execute(sHTML); matches->Count>0; matches = pRegExp->Execute(sHTML) )
					{
						IMatch2Ptr match = matches->Item[0];
						ISubMatchesPtr submatches = match->GetSubMatches();

						CString sFound = "";	// extracted text exactly as it occurs in the page
						CString sValue = "";	// the same text with quotes doubled for SQL

						_variant_t vIndex = RegExps->GetCollect(_variant_t("nSubMutchIndex"));
						// Valid sub-match indexes are 0..Count-1.
						if( vIndex.intVal >= 0 && submatches->Count > vIndex.intVal )
						{
							_variant_t vSub = submatches->Item[vIndex.intVal];
							if( vSub.vt == VT_BSTR )
								sFound = vSub.bstrVal;
							sValue = sFound;
							sValue.Replace("'", "''");

							_RecordsetPtr Excepts;
							bool b = true;	// true while the value is still accepted
							Sleep(pSearchInfo->m_nPause);
							if(	pDatabase->Execute(_bstr_t("mp_GetSearchExcepts "+sSourceId), &Excepts))
							{
								for(; b && !Excepts->adoEOF; Excepts->MoveNext() )
								{
									_variant_t vExcept = Excepts->GetCollect(_variant_t("sRegExp"));
									if(vExcept.vt != VT_NULL)
									{
										pRegExp->put_IgnoreCase(VARIANT_TRUE);
										pRegExp->put_Multiline(VARIANT_TRUE);
										pRegExp->put_Pattern(vExcept.bstrVal);

										// The _bstr_t temporary releases its copy of the text.
										const bool b1 = (pRegExp->Test(_bstr_t(sValue))==VARIANT_TRUE);
										const bool b2 = (Excepts->GetCollect(_variant_t("bExcept")).boolVal==VARIANT_TRUE);

										// Accepted only while "matches" and "bExcept" disagree.
										b = (b1 != b2);
									}
								}
							}
							// Left unguarded on purpose: if no recordset was returned this
							// raises _com_error and ends the run, as it always did.
							Excepts->Close();

							// The run is aborted when the "<connection string>StartStop"
							// event cannot be opened. The handle is closed again so the
							// worker does not keep the event object alive.
							// NOTE(review): the previous condition was
							// "!hEvent && WaitForSingleObject(hEvent, 1)", which can only
							// be true for a NULL handle; confirm whether a check of the
							// signalled state was intended as well.
							HANDLE hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, pSearchInfo->m_sConnectionString+"StartStop" );
							if( !hEvent ) throw(new CUserException());
							CloseHandle(hEvent);

							Sleep(pSearchInfo->m_nPause);
							if(b)
								pDatabase->ExecuteCommand(_bstr_t("mp_InsertSearchResult "+
																sRegExpId+
																","+sSourceId+
																",'"+sValue+"';"));
						}

						// The except checks replaced the pattern; restore the search pattern.
						pRegExp->put_IgnoreCase(bIgnoreCase);
						pRegExp->put_Multiline(VARIANT_TRUE);
						pRegExp->put_Pattern(vPattern.bstrVal);

						// Cut the text behind the extracted value so the next Execute
						// finds the following match. The unescaped value is used for
						// the lookup; when no value was extracted the whole match is
						// used instead. If neither can be located the loop stops,
						// because re-running on unchanged text would never terminate.
						CString sAnchor = sFound;
						if( sAnchor.IsEmpty() )
							sAnchor = (LPCTSTR)_bstr_t(match->Value);
						CString s = (LPCTSTR)sHTML;
						const int nPos = sAnchor.IsEmpty() ? -1 : s.Find(sAnchor);
						if( nPos < 0 )
							break;
						sHTML = (LPCTSTR)s.Mid(nPos+sAnchor.GetLength());
					}

					pDatabase->ExecuteCommand(_bstr_t("mp_MarkAsSearched "+sSourceId+";"));
					Sleep(pSearchInfo->m_nPause);
				}
			}
			HTMLs->Close();
		}
		RegExps->Close();
	}
	catch( CException* e )	{ e->Delete(); }	// MFC exceptions are thrown by pointer and must be deleted
	catch( const _com_error& )	{ }
	catch(...)	{ }
	// _com_ptr_t::Release() raises an error on a null pointer, so only release
	// an object that was actually created. It has to go before CoUninitialize.
	if( pRegExp.GetInterfacePtr() )
		pRegExp.Release();
	pSearchInfo->m_bSearchStarted = false;
	if( pDatabase )
	{
		pDatabase->CloseConnection();
		delete pDatabase;
	}
	CoUninitialize();
	AfxEndThread(0);
	return 0;
}

// Worker-thread entry point: downloads one URL and stores its text through the
// mp_InsertSource stored procedure.
//
// pParam - pointer to a CDownloadInfo; the function reads m_sConnectionString,
//          m_pInetSession, m_sURL, m_nId and m_pThread from it and writes
//          m_nStartTime and m_pThread.
// Returns 0 (the thread is ended through AfxEndThread(0) before the return).
//
// A page whose lower-cased text contains "<title>404 not found</title>" is not
// stored. Any COM, MFC or other exception ends the download quietly; cleanup
// still happens.
UINT DownloadThread( LPVOID pParam )
{
	CoInitialize(NULL);
	CDownloadInfo* pDownloadInfo = (CDownloadInfo*)pParam;
	// Remembered so that the slot is only cleared at the end if it still
	// refers to this thread.
	CWinThread* pThread = pDownloadInfo->m_pThread;
	CADO* pDatabase = 0;
	CStdioFile* pFile = 0;
	try
	{
		pDatabase = new CADO();
		pDatabase->OpenConnection(_bstr_t(pDownloadInfo->m_sConnectionString),_bstr_t(""),_bstr_t(""));

		CTime CurrentTime = CTime::GetCurrentTime();
		pDownloadInfo->m_nStartTime = (long)ceil(CurrentTime.GetTime()/1000);

		char buff[4097];	// 4096 bytes of data plus the terminating zero
		CString sResult = "";
		pFile = pDownloadInfo->m_pInetSession->OpenURL(pDownloadInfo->m_sURL,1,INTERNET_FLAG_TRANSFER_BINARY|INTERNET_FLAG_RELOAD);
		// OpenURL can return NULL; treat that like any other failed download.
		if( !pFile ) throw(new CUserException());

		UINT n = 0;
		while( (n = pFile->Read(buff, 4096)) != 0 )
		{
			buff[n]=0;
			sResult+=buff;
		}
		pFile->Close();
		// The file object returned by OpenURL belongs to the caller.
		delete pFile;
		pFile = 0;

		// The download is dropped when the "<connection string>StartStop" event
		// cannot be opened. The handle is closed again so the worker does not
		// keep the event object alive.
		// NOTE(review): the previous condition was
		// "!hEvent && WaitForSingleObject(hEvent, 1)", which can only be true
		// for a NULL handle; confirm whether a check of the signalled state was
		// intended as well.
		HANDLE hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, pDownloadInfo->m_sConnectionString+"StartStop" );
		if( !hEvent ) throw(new CUserException());
		CloseHandle(hEvent);

		CString sTest = sResult;
		sTest.MakeLower();
		if(sTest.Find("<title>404 not found</title>")==-1)
		{
			// Quotes are doubled because the text is embedded in a SQL string literal.
			sResult.Replace("'", "''");
			char t[20];
			pDatabase->ExecuteCommand(_bstr_t("mp_InsertSource '"+CString(itoa(pDownloadInfo->m_nId,t,10))+"', '"+sResult+"';"));
		}
	}
	catch( CException* e ) { e->Delete(); }	// MFC exceptions are thrown by pointer and must be deleted
	catch( const _com_error& ) { }
	catch(...)	{ }
	// Only reached with a non-null pointer when reading or closing failed.
	if( pFile )
		delete pFile;
	if(pThread == pDownloadInfo->m_pThread)
	{
		pDownloadInfo->m_pThread = 0;
		pDownloadInfo->m_nStartTime = 0;
	}
	if( pDatabase )
	{
		pDatabase->CloseConnection();
		delete pDatabase;
	}
	CoUninitialize();
	AfxEndThread(0);
	return 0;
}

UINT FillURLs( LPVOID pParam )
{
	CoInitialize(NULL);
	CFillURLsInfo* pFillURLsInfo = (CFillURLsInfo*)pParam;
	IRegExp2Ptr pRegExp;
	pRegExp.CreateInstance(__uuidof(RegExp));
	CADO* pDatabase = 0;
	try
	{	
		pDatabase = new CADO();
		pDatabase->OpenConnection(_bstr_t(pFillURLsInfo->m_sConnectionString),_bstr_t(""),_bstr_t(""));

		_RecordsetPtr RegExps;
		Sleep(pFillURLsInfo->m_nPause);
		if(	pDatabase->Execute(_bstr_t("mp_GetLinksRegExps"), &RegExps))
		{
			_RecordsetPtr HTMLs;	
			Sleep(pFillURLsInfo->m_nPause);
			if(	pDatabase->Execute(_bstr_t("mp_GetNotParsedSources"), &HTMLs))
			{
					for(; !HTMLs->adoEOF; HTMLs->MoveNext() )
					{							
						RegExps->MoveFirst();
						for(; !RegExps->adoEOF; RegExps->MoveNext() )
						{							
							_variant_t vField = RegExps->GetCollect(_variant_t("sRegExp"));
							if(vField.vt != VT_NULL)
							{
								pRegExp->put_IgnoreCase(VARIANT_TRUE);
								pRegExp->put_Multiline(VARIANT_TRUE);
								pRegExp->put_Pattern(SysAllocString(vField.bstrVal));

								int nType = 0;
								vField = RegExps->GetCollect(_variant_t("nType"));
								if(vField.vt != VT_NULL) nType = vField.intVal;
	
								vField = HTMLs->GetCollect(_variant_t("sResult"));
								BSTR sHTML = SysAllocString(vField.bstrVal);
								IMatchCollection2Ptr matches;
								char t[20];
								for( matches = pRegExp->Execute(sHTML); matches->Count>0; matches = pRegExp->Execute(sHTML) )
									if( matches->Count>0 )
									{
										CString sValue = "";
										CString sURL = "";
										IMatch2Ptr match = matches->Item[0];
										ISubMatchesPtr submatches = match->GetSubMatches();
										vField = RegExps->GetCollect(_variant_t("nSubMutchIndex"));
										if( submatches->Count >= vField.intVal)
										{
											int n = submatches->Count;
											int m = vField.intVal;
											vField = submatches->Item[vField.intVal];
											sValue = vField.bstrVal;
											if(sValue=="") continue;
											unsigned short nPort;
											DWORD nServiceType;
											CString sServer;
											CString sObject;											
											char c;
											switch(nType)
											{
											case 1:
												sURL = HTMLs->GetCollect(_variant_t("sURL")).bstrVal;
												AfxParseURL(sURL, nServiceType, sServer, sObject, nPort);
												sURL = sURL.Left(sURL.Find(sServer)+sServer.GetLength())+':'+itoa(nPort, t, 10)+sValue;												
												break;
											case 2:
												sURL = HTMLs->GetCollect(_variant_t("sURL")).bstrVal;										
												AfxParseURL(sURL, nServiceType, sServer, sObject, nPort);
												c = sObject[sObject.GetLength()-1];
												if( c!='\\' && c!='/' )
												{
													int n1=sObject.ReverseFind('/');
													int n2=sObject.ReverseFind('\\');

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -