⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cowebspider.cpp

📁 基于com的网络爬虫程序
💻 CPP
📖 第 1 页 / 共 2 页
字号:
													n1 = max(n1, n2);
													if(n1>-1) sObject = sObject.Left(n1);													
												}
												if(sObject == "") sObject = "/";
												if(sObject[0] != '/' && sObject[0] != '\\') sObject = '/'+sObject;
												if(sObject[sObject.GetLength()-1] != '/' && sObject[sObject.GetLength()-1] != '\\') sObject += '/';
												sURL = sURL.Left(sURL.Find(sServer)+sServer.GetLength())+':'+itoa(nPort, t, 10)+sObject+sValue;
												break;
											default: sURL=sValue;
											}
										}
										else continue;
		
										_RecordsetPtr Excepts;
										bool b = true;
										CString sId = itoa(HTMLs->GetCollect(_variant_t("nId")).intVal, t, 10);
										Sleep(pFillURLsInfo->m_nPause);
										if(	pDatabase->Execute(_bstr_t("mp_GetLinksExcepts "+sId), &Excepts))
										{
											for(; b && !Excepts->adoEOF; Excepts->MoveNext() )
											{
												_variant_t vField = Excepts->GetCollect(_variant_t("sRegExp"));
												if(vField.vt != VT_NULL)
												{
													pRegExp->put_IgnoreCase(VARIANT_TRUE);
													pRegExp->put_Multiline(VARIANT_TRUE);
													pRegExp->put_Pattern(SysAllocString(vField.bstrVal));

													bool b1 = (pRegExp->Test(sURL.AllocSysString())==VARIANT_TRUE)?true:false;
													bool b2 = (Excepts->GetCollect(_variant_t("bExcept")).boolVal==VARIANT_TRUE)?true:false;

													b = (b1 != b2);
												}
											}
										}
										Excepts->Close();
										Sleep(pFillURLsInfo->m_nPause);

										HANDLE hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, pFillURLsInfo->m_sConnectionString+"StartStop" );
										if( !hEvent&&WaitForSingleObject(hEvent, 1) ) throw(new CUserException());

										if(b) pDatabase->ExecuteCommand(_bstr_t("mp_InsertURL '"+
																	sURL+"', "+itoa(nType, t, 10)+", "+sId+";"));

										Sleep(pFillURLsInfo->m_nPause);

										CString s = sHTML;
										SysFreeString(sHTML);
										sHTML = s.Mid(s.Find(sValue)+1).AllocSysString();
										
										pRegExp->put_IgnoreCase(VARIANT_TRUE);
										pRegExp->put_Multiline(VARIANT_TRUE);
										pRegExp->put_Pattern(SysAllocString(RegExps->GetCollect(_variant_t("sRegExp")).bstrVal));
									}
								SysFreeString(sHTML);				
								Sleep(pFillURLsInfo->m_nPause);
								pDatabase->ExecuteCommand(_bstr_t("mp_MarkAsParsed "+CString(itoa(HTMLs->GetCollect(_variant_t("nId")).intVal, t, 10))+";"));
							}
						}
					
				}
			}
			HTMLs->Close();
		}
		RegExps->Close();
	}
	catch( _com_error e ) { }
	catch(...) { }
	pRegExp.Release();
	pFillURLsInfo->m_bFillingStarted = false;
	if( pDatabase )
	{
		pDatabase->CloseConnection();
		delete pDatabase;
	}
	CoUninitialize();
	AfxEndThread(0);
	return 0;
}

/*
 * Thread procedure for one crawl session; CCoWebSpider::Begin starts it
 * through AfxBeginThread.
 *
 * pParam is cast to CMainThreadInfo*, which supplies the connection string
 * and the owning CCoWebSpider.  The thread opens its own database connection,
 * reads the "Proxy", "Threads", "Timeout" and "Pause" options and then loops
 * for as long as the event named after the connection string can be opened
 * and WaitForSingleObject on it returns non-zero.
 *
 * On every pass:
 *   - if the "Status" option is not "Stoped": "RealStatus" is set to "Worked",
 *     the "<connection string>StartStop" event is reset, FillURLs is started
 *     when fewer untreated URLs than worker slots are available, untreated
 *     URLs are handed to DownloadThread workers, and Search is started if it
 *     is not already running;
 *   - otherwise the StartStop event is set and, once no worker slot is in use
 *     and neither helper thread is flagged as running, "RealStatus" is set to
 *     "Stoped".
 *
 * Every exception is swallowed so that the thread always reaches its cleanup.
 * Always returns 0.
 */
UINT MainThread( LPVOID pParam )
{
	CoInitialize(NULL);
	HANDLE hEvent = 0;
	HANDLE hStartStopEvent = 0;
	CADO* pDatabase = 0;
	try
	{
		CMainThreadInfo* pMainThreadInfo = (CMainThreadInfo*)pParam;
		CCoWebSpider* pWebSpider = (CCoWebSpider*)(pMainThreadInfo->m_pWebSpider);

		// Assign to the function-level pointer (do not redeclare it here):
		// the cleanup code after the try block can only see that variable.
		pDatabase = new CADO();
		pDatabase->OpenConnection(pMainThreadInfo->m_bstrConnectionString,_bstr_t(""),_bstr_t(""));

		// Manual-reset event, initially signaled; it is reset while crawling
		// and set again when the "Status" option asks for a stop.
		CString sStartStopEvent = CString(pMainThreadInfo->m_bstrConnectionString)+"StartStop";
		hStartStopEvent = CreateEvent( NULL, TRUE, TRUE, sStartStopEvent);

		CInternetSession* pInetSession;
		CString sProxy = pWebSpider->GetOption("Proxy", pDatabase);
		if(sProxy != "")
			pInetSession = new CInternetSession("WebSpiderSession",	1, INTERNET_OPEN_TYPE_PROXY, sProxy);
		else pInetSession = new CInternetSession("WebSpiderSession",1, INTERNET_OPEN_TYPE_PRECONFIG);

		// One slot per download worker; m_pThread == 0 marks a free slot.
		int nThreadsCount =  atoi(pWebSpider->GetOption("Threads", pDatabase));
		CDownloadInfo* arThreads = new CDownloadInfo[nThreadsCount];
		for(int i=0; i<nThreadsCount; i++) arThreads[i].m_pThread=0;
		// NOTE(review): arThreads and pInetSession are never freed.  Pointers
		// to both are handed to DownloadThread workers, so releasing them here
		// would first require knowing that every worker has finished.

		int nTimeout =  atoi(pWebSpider->GetOption("Timeout", pDatabase));
		int nPause =  atoi(pWebSpider->GetOption("Pause", pDatabase));

		_RecordsetPtr URLs;
		CString s = CString(pMainThreadInfo->m_bstrConnectionString);
		// Event named after the connection string; the loop below ends when
		// waiting on it returns 0.
		hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, s);

		bool bFillURLs = true;
		CFillURLsInfo* pFillURLsInfo = new CFillURLsInfo();
		pFillURLsInfo->m_pWebSpider = pWebSpider;
		pFillURLsInfo->m_nPause = nPause;
		pFillURLsInfo->m_sConnectionString = s;

		CSearchInfo* pSearchInfo = new CSearchInfo();
		pSearchInfo->m_pWebSpider = pWebSpider;
		pSearchInfo->m_nPause = nPause;
		pSearchInfo->m_sConnectionString = s;

		while(hEvent && WaitForSingleObject(hEvent, 1))
		{
		try
		{
			Sleep(pFillURLsInfo->m_nPause);
			if(pWebSpider->GetOption("Status", pDatabase)!="Stoped")
			{
				pWebSpider->SetOption("RealStatus", "Worked", pDatabase);
				ResetEvent(hStartStopEvent);

				// Start the URL filler only when the previous pass found a free
				// slot for every URL and the filler is not already running.
				if( bFillURLs && !pFillURLsInfo->m_bFillingStarted )
				{
					pDatabase->Execute(_bstr_t("mp_GetNotTreatedURLsCount"), &URLs);
					if( nThreadsCount > URLs->GetCollect(_variant_t("nCount")).intVal )
					{
						pFillURLsInfo->m_bFillingStarted = true;
						AfxBeginThread( FillURLs, pFillURLsInfo, THREAD_PRIORITY_NORMAL );
					}
				}

				Sleep(pFillURLsInfo->m_nPause);
				bFillURLs = true;
				if(	pDatabase->Execute(_bstr_t("mp_GetNotTreatedURLs"), &URLs))
				{
					for(; !URLs->adoEOF; URLs->MoveNext() )
					{
						// Look for a slot that is free or considered timed out.
						int nCurrentEmptyThread = -1;
						for(int i=0; i<nThreadsCount; i++ )
							if(arThreads[i].m_pThread==0)
							{
								nCurrentEmptyThread = i;
								break;
							}
							else
							{
								// NOTE(review): m_nStartTime is overwritten with the
								// current time right before it is compared with the
								// current time, so the elapsed value below evaluates
								// to zero.  Left unchanged on purpose: a firing
								// timeout would hand the slot to a new worker while
								// the previous one may still be using it - confirm
								// the intended behaviour before changing this.
								CTime CurrentTime = CTime::GetCurrentTime();
								arThreads[i].m_nStartTime = (long)ceil(CurrentTime.GetTime()/1000);
								if(arThreads[i].m_nStartTime &&
									(CurrentTime.GetTime()/1000-arThreads[i].m_nStartTime) > nTimeout)
								{
									nCurrentEmptyThread = i;
									break;
								}
							}
						if(nCurrentEmptyThread != -1)
						{
							_variant_t vField = URLs->GetCollect(_variant_t("sURL"));
							if(vField.vt != VT_NULL)
							{
								arThreads[nCurrentEmptyThread].m_nId = URLs->GetCollect(_variant_t("nId")).intVal;
								arThreads[nCurrentEmptyThread].m_sURL = vField.bstrVal;
								arThreads[nCurrentEmptyThread].m_pWebSpider = pMainThreadInfo->m_pWebSpider;
								arThreads[nCurrentEmptyThread].m_sConnectionString = pSearchInfo->m_sConnectionString;

								arThreads[nCurrentEmptyThread].m_pInetSession = pInetSession;

								arThreads[nCurrentEmptyThread].m_pThread = AfxBeginThread( DownloadThread, &arThreads[nCurrentEmptyThread], THREAD_PRIORITY_LOWEST );
								char t[20];
								pDatabase->ExecuteCommand(_bstr_t("mp_MarkAsTreated '"+CString(itoa(URLs->GetCollect(_variant_t("nId")).intVal, t, 10))+"';"));
							}
						}
						else bFillURLs = false;	// no slot available: skip refilling on the next pass
					}
					URLs->Close();
					// The connection is closed and reopened with its own
					// connection string after every batch.
					pDatabase->CloseConnection();
					Sleep(pFillURLsInfo->m_nPause);
					pDatabase->OpenConnection(pDatabase->m_Connection->ConnectionString,_bstr_t(""),_bstr_t(""));
					if( !pSearchInfo->m_bSearchStarted )
					{
						pSearchInfo->m_bSearchStarted = true;
						AfxBeginThread( Search, pSearchInfo, THREAD_PRIORITY_NORMAL);
					}
				}
			}
			else
			{
				SetEvent(hStartStopEvent);
				// The index is declared outside the loop because it is read
				// after the loop ends (standard for-scope rules).
				int nBusy = 0;
				for(; nBusy<nThreadsCount; nBusy++ )
					if(arThreads[nBusy].m_pThread!=0) break;
				if(  nBusy==nThreadsCount && !pSearchInfo->m_bSearchStarted && !pFillURLsInfo->m_bFillingStarted )
				pWebSpider->SetOption("RealStatus", "Stoped", pDatabase);
			}
			Sleep(nPause);
		}
		catch( const _com_error& ) {  }	// a failed pass is retried on the next iteration
		catch(...) {  }
		}
		// NOTE(review): the FillURLs / Search threads receive these pointers;
		// nothing visible here waits for them before the objects are deleted.
		delete pSearchInfo;
		delete pFillURLsInfo;
	}
	catch( const _com_error& ) {  }
	catch(...) {  }
	if(pDatabase)
	{
		// Closing may fail when the connection never opened; this runs outside
		// the try blocks above, so keep any exception inside the thread.
		try { pDatabase->CloseConnection(); }
		catch(...) {  }
		delete pDatabase;
	}
	if(hStartStopEvent) CloseHandle(hStartStopEvent);
	if(hEvent) CloseHandle(hEvent);
	CoUninitialize();
	AfxEndThread(0);
	return 0;
}

/*
 * Reads one option value through the mp_GetOption stored procedure.
 *
 * sName     - option name; embedded single quotes are doubled before the name
 *             is placed inside the quoted SQL argument.
 * pDatabase - connection wrapper used to run the query.
 *
 * Returns the "sValue" field of the first row, or an empty string when the
 * query fails, yields no rows, or the field is NULL/empty.
 */
CString CCoWebSpider::GetOption(CString sName, CADO* pDatabase)
{
	CString sResult;
	_RecordsetPtr Options;
	sName.Replace("'", "''");
	if(	pDatabase->Execute(_bstr_t("mp_GetOption '"+sName+"'"), &Options))
	{
		// Reading a field from an empty result set raises a COM error, so
		// an unknown option is reported as "" instead.
		if( !Options->adoEOF )
		{
			_variant_t vField = Options->GetCollect(_variant_t("sValue"));
			if(vField.vt != VT_NULL && vField.vt != VT_EMPTY) sResult = CString(vField.bstrVal);
		}
		// Close only a recordset that Execute reported as opened, and do it
		// on every path (the value has already been copied into sResult).
		Options->Close();
	}
	return sResult;
}

/*
 * Stores one option value through the mp_SetOption stored procedure.
 *
 * sName, sValue - option name and new value; single quotes in both are
 *                 doubled because each is embedded in a quoted SQL argument.
 * pDatabase     - connection wrapper used to run the command.
 */
void CCoWebSpider::SetOption(CString sName, CString sValue, CADO* pDatabase)
{
	sName.Replace("'", "''");
	sValue.Replace("'", "''");

	// Assemble: mp_SetOption '<name>','<value>';
	CString sCommand = "mp_SetOption '";
	sCommand += sName;
	sCommand += "','";
	sCommand += sValue;
	sCommand += "';";

	pDatabase->ExecuteCommand(_bstr_t(sCommand));
}

/*
 * Starts a crawl session for the given connection string unless IsExists
 * reports that one is already running.
 *
 * bstrConnectionString - pointer to the connection string; it also serves as
 *                        the name of the event that marks a running session.
 *
 * Returns E_POINTER for a null argument, a Win32-derived failure code when
 * the session event cannot be created, otherwise S_OK (also when a session
 * already exists and nothing is started).
 */
STDMETHODIMP CCoWebSpider::Begin(BSTR *bstrConnectionString)
{
	AFX_MANAGE_STATE(AfxGetStaticModuleState())
	if( !bstrConnectionString ) return E_POINTER;

	VARIANT_BOOL bVal = VARIANT_FALSE;
	IsExists(bstrConnectionString, &bVal);
	if(bVal==VARIANT_FALSE)
	{
		CString s = CString(*bstrConnectionString);
		// Manual-reset event named after the connection string.  It is reset
		// explicitly because CreateEvent may return an already existing,
		// still signaled event left over from a previous session.
		HANDLE hEvent = CreateEvent( NULL, TRUE, TRUE, s);
		if( !hEvent ) return HRESULT_FROM_WIN32(GetLastError());
		ResetEvent(hEvent);
		// hEvent is not closed here: MainThread looks the event up by name
		// with OpenEvent, which presumably requires this handle to keep the
		// named object alive - confirm before adding a CloseHandle.
		AfxBeginThread( MainThread, new CMainThreadInfo( SysAllocString(*bstrConnectionString), this ), THREAD_PRIORITY_NORMAL);
	}
	return S_OK;
}

/*
 * Requests the end of the crawl session identified by the connection string.
 *
 * If IsExists reports a running session, the event named after the
 * connection string is signaled and the call then polls IsExists until it
 * no longer reports the session.
 *
 * Returns E_POINTER for a null argument, otherwise S_OK.
 */
STDMETHODIMP CCoWebSpider::Finish(BSTR *bstrConnectionString)
{
	AFX_MANAGE_STATE(AfxGetStaticModuleState())
	if( !bstrConnectionString ) return E_POINTER;

	VARIANT_BOOL bVal = VARIANT_FALSE;
	IsExists(bstrConnectionString, &bVal);
	if(bVal==VARIANT_TRUE)
	{
		CString s = CString(*bstrConnectionString);
		HANDLE hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, s);
		// Only signal and wait when the event could be opened; otherwise
		// nothing would ever end the polling loop.
		if( hEvent )
		{
			SetEvent(hEvent);
			// The handle was needed only to signal the event.
			CloseHandle(hEvent);
			do 	IsExists(bstrConnectionString, &bVal); while (bVal==VARIANT_TRUE);
		}
	}
	return S_OK;
}

/*
 * Reports whether a crawl session for the given connection string is running.
 *
 * bstrConnectionString - pointer to the connection string, used as the name
 *                        of the session event.
 * bVal                 - receives VARIANT_TRUE when the named event can be
 *                        opened and a 1 ms wait on it returns non-zero (i.e.
 *                        the event is not signaled), VARIANT_FALSE otherwise.
 *
 * Returns E_POINTER when either argument is null, otherwise S_OK.
 */
STDMETHODIMP CCoWebSpider::IsExists(BSTR *bstrConnectionString, VARIANT_BOOL *bVal)
{
	AFX_MANAGE_STATE(AfxGetStaticModuleState())
	if( !bVal ) return E_POINTER;
	*bVal = VARIANT_FALSE;
	if( !bstrConnectionString ) return E_POINTER;

	CString s = CString(*bstrConnectionString);
	HANDLE hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, s);
	if( hEvent )
	{
		if( WaitForSingleObject(hEvent, 1) ) *bVal = VARIANT_TRUE;
		// Release the handle opened for this query; callers poll this
		// method, so every call would otherwise leave one handle behind.
		CloseHandle(hEvent);
	}
	return S_OK;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -