📄 cowebspider.cpp
字号:
//
// This file is part of UniWebSpider Project.
//
// UniWebSpider is free software; you can redistribute it and/or modify
// it under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// UniWebSpider is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with UniWebSpider; if not, write to the Free Software Foundation, Inc.,
// 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// WRITTEN BY : ValGarn
// CoWebSpider.cpp : Implementation of CCoWebSpider
#include "stdafx.h"
#include "WebSpider.h"
#include "CoWebSpider.h"
#include "DownloadInfo.h"
#include "AFXPRIV.H"
#include "MainThreadInfo.h"
#include "SearchInfo.h"
#include "FillURLsInfo.h"
#include "math.h"
#include <wchar.h>
/////////////////////////////////////////////////////////////////////////////
// CCoWebSpider
UINT Search( LPVOID pParam )
{
CoInitialize(NULL);
CSearchInfo* pSearchInfo = (CSearchInfo*)pParam;
IRegExp2Ptr pRegExp;
pRegExp.CreateInstance(__uuidof(RegExp));
CADO* pDatabase = 0;
try
{
pDatabase = new CADO();
pDatabase->OpenConnection(_bstr_t(pSearchInfo->m_sConnectionString),_bstr_t(""),_bstr_t(""));
_RecordsetPtr RegExps;
if( pDatabase->Execute(_bstr_t("mp_GetSearchRegExps"), &RegExps))
{
Sleep(pSearchInfo->m_nPause);
_RecordsetPtr HTMLs;
if( pDatabase->Execute(_bstr_t("mp_GetNotSearchedSources"), &HTMLs))
for(; !HTMLs->adoEOF; HTMLs->MoveNext() )
{
RegExps->MoveFirst();
for(; !RegExps->adoEOF; RegExps->MoveNext() )
{
_variant_t vField = RegExps->GetCollect(_variant_t("sRegExp"));
if(vField.vt != VT_NULL)
{
pRegExp->put_IgnoreCase(RegExps->GetCollect(_variant_t("bIgnoreCase")).boolVal);
pRegExp->put_Multiline(VARIANT_TRUE);
pRegExp->put_Pattern(SysAllocString(vField.bstrVal));
vField = HTMLs->GetCollect(_variant_t("sResult"));
BSTR sHTML = SysAllocString(vField.bstrVal);
IMatchCollection2Ptr matches;
for( matches = pRegExp->Execute(sHTML); matches->Count>0; matches = pRegExp->Execute(sHTML) )
if( matches->Count>0 )
{
CString sValue = "";
IMatch2Ptr match = matches->Item[0];
ISubMatchesPtr submatches = match->GetSubMatches();
vField = RegExps->GetCollect(_variant_t("nSubMutchIndex"));
if( submatches->Count >= vField.intVal)
{
int n = submatches->Count;
int m = vField.intVal;
vField = submatches->Item[vField.intVal];
sValue = vField.bstrVal;
sValue.Replace("'", "''");
char t[20];
_RecordsetPtr Excepts;
bool b = true;
CString sId = itoa(HTMLs->GetCollect(_variant_t("nId")).intVal, t, 10);
Sleep(pSearchInfo->m_nPause);
if( pDatabase->Execute(_bstr_t("mp_GetSearchExcepts "+sId), &Excepts))
{
for(; b && !Excepts->adoEOF; Excepts->MoveNext() )
{
_variant_t vField = Excepts->GetCollect(_variant_t("sRegExp"));
if(vField.vt != VT_NULL)
{
pRegExp->put_IgnoreCase(VARIANT_TRUE);
pRegExp->put_Multiline(VARIANT_TRUE);
pRegExp->put_Pattern(SysAllocString(vField.bstrVal));
bool b1 = (pRegExp->Test(sValue.AllocSysString())==VARIANT_TRUE)?true:false;
bool b2 = (Excepts->GetCollect(_variant_t("bExcept")).boolVal==VARIANT_TRUE)?true:false;
b = (b1 != b2);
}
}
}
Excepts->Close();
HANDLE hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, pSearchInfo->m_sConnectionString+"StartStop" );
if( !hEvent&&WaitForSingleObject(hEvent, 1) ) throw(new CUserException());
Sleep(pSearchInfo->m_nPause);
if(b)
pDatabase->ExecuteCommand(_bstr_t("mp_InsertSearchResult "+
CString(itoa(RegExps->GetCollect(_variant_t("nId")).intVal, t, 10))+
","+CString(itoa(HTMLs->GetCollect(_variant_t("nId")).intVal, t, 10))+
",'"+sValue+"';"));
}
CString s = sHTML;
SysFreeString(sHTML);
sHTML = s.Mid(s.Find(sValue)+sValue.GetLength()).AllocSysString();
pRegExp->put_IgnoreCase(RegExps->GetCollect(_variant_t("bIgnoreCase")).boolVal);
pRegExp->put_Multiline(VARIANT_TRUE);
pRegExp->put_Pattern(SysAllocString(RegExps->GetCollect(_variant_t("sRegExp")).bstrVal));
}
SysFreeString(sHTML);
char t[20];
pDatabase->ExecuteCommand(_bstr_t("mp_MarkAsSearched "+CString(itoa(HTMLs->GetCollect(_variant_t("nId")).intVal, t, 10))+";"));
Sleep(pSearchInfo->m_nPause);
}
}
}
HTMLs->Close();
}
RegExps->Close();
}
catch( _com_error e ) { }
catch(...) { }
pRegExp.Release();
pSearchInfo->m_bSearchStarted = false;
if( pDatabase )
{
pDatabase->CloseConnection();
delete pDatabase;
}
CoUninitialize();
AfxEndThread(0);
return 0;
}
// Worker-thread entry point that downloads one URL and stores the body.
//
// pParam must point to a CDownloadInfo. The URL in m_sURL is fetched through
// m_pInetSession; unless the lower-cased body contains
// "<title>404 not found</title>", it is stored through mp_InsertSource
// together with m_nId. On exit m_pThread and m_nStartTime are reset, but only
// if m_pThread still holds the value it had when this function started.
// Always returns 0.
UINT DownloadThread( LPVOID pParam )
{
	CoInitialize(NULL);
	CDownloadInfo* pDownloadInfo = static_cast<CDownloadInfo*>(pParam);
	CWinThread* pThread = pDownloadInfo->m_pThread;
	CADO* pDatabase = 0;
	CStdioFile* pFile = 0;
	try
	{
		pDatabase = new CADO();
		pDatabase->OpenConnection(_bstr_t(pDownloadInfo->m_sConnectionString),_bstr_t(""),_bstr_t(""));
		CTime CurrentTime = CTime::GetCurrentTime();
		// NOTE(review): GetTime()/1000 is an integer division, so ceil() has
		// no effect here; the expression is kept as it was. Confirm the unit
		// the reader of m_nStartTime expects.
		pDownloadInfo->m_nStartTime = (long)ceil(CurrentTime.GetTime()/1000);
		pFile = pDownloadInfo->m_pInetSession->OpenURL(pDownloadInfo->m_sURL,1,INTERNET_FLAG_TRANSFER_BINARY|INTERNET_FLAG_RELOAD);
		// OpenURL can hand back NULL; in that case there is nothing to read
		// or store.
		if( pFile != NULL )
		{
			char buff[4097];
			CString sResult = "";
			int nRead = 0;
			while( (nRead = pFile->Read(buff, 4096)) != 0 )
			{
				buff[nRead]=0;	// terminate the chunk so it can be appended as text
				sResult+=buff;
			}
			pFile->Close();
			// The result is stored only while the "<connection string>StartStop"
			// event can still be opened. The handle is closed at once so it
			// does not leak.
			// NOTE(review): the previous condition
			// `!hEvent && WaitForSingleObject(...)` was only ever true for a
			// missing event; confirm whether the signalled state should be
			// checked too.
			HANDLE hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, pDownloadInfo->m_sConnectionString+"StartStop" );
			if( hEvent != NULL )
			{
				CloseHandle(hEvent);
				CString sTest = sResult;
				sTest.MakeLower();
				if(sTest.Find("<title>404 not found</title>")==-1)
				{
					sResult.Replace("'", "''");	// double quotes for the SQL string literal
					char t[20];
					pDatabase->ExecuteCommand(_bstr_t("mp_InsertSource '"+CString(itoa(pDownloadInfo->m_nId,t,10))+"', '"+sResult+"';"));
				}
			}
		}
	}
	// MFC exceptions are thrown as heap pointers and must be deleted by the
	// handler that swallows them.
	catch( CException* pException ) { pException->Delete(); }
	catch( const _com_error& ) { }
	catch(...) { }
	// The file object returned by OpenURL belongs to the caller; free it on
	// every path, including after an exception while reading.
	delete pFile;
	if(pThread == pDownloadInfo->m_pThread)
	{
		pDownloadInfo->m_pThread = 0;
		pDownloadInfo->m_nStartTime = 0;
	}
	if( pDatabase )
	{
		pDatabase->CloseConnection();
		delete pDatabase;
	}
	CoUninitialize();
	AfxEndThread(0);
	return 0;
}
UINT FillURLs( LPVOID pParam )
{
CoInitialize(NULL);
CFillURLsInfo* pFillURLsInfo = (CFillURLsInfo*)pParam;
IRegExp2Ptr pRegExp;
pRegExp.CreateInstance(__uuidof(RegExp));
CADO* pDatabase = 0;
try
{
pDatabase = new CADO();
pDatabase->OpenConnection(_bstr_t(pFillURLsInfo->m_sConnectionString),_bstr_t(""),_bstr_t(""));
_RecordsetPtr RegExps;
Sleep(pFillURLsInfo->m_nPause);
if( pDatabase->Execute(_bstr_t("mp_GetLinksRegExps"), &RegExps))
{
_RecordsetPtr HTMLs;
Sleep(pFillURLsInfo->m_nPause);
if( pDatabase->Execute(_bstr_t("mp_GetNotParsedSources"), &HTMLs))
{
for(; !HTMLs->adoEOF; HTMLs->MoveNext() )
{
RegExps->MoveFirst();
for(; !RegExps->adoEOF; RegExps->MoveNext() )
{
_variant_t vField = RegExps->GetCollect(_variant_t("sRegExp"));
if(vField.vt != VT_NULL)
{
pRegExp->put_IgnoreCase(VARIANT_TRUE);
pRegExp->put_Multiline(VARIANT_TRUE);
pRegExp->put_Pattern(SysAllocString(vField.bstrVal));
int nType = 0;
vField = RegExps->GetCollect(_variant_t("nType"));
if(vField.vt != VT_NULL) nType = vField.intVal;
vField = HTMLs->GetCollect(_variant_t("sResult"));
BSTR sHTML = SysAllocString(vField.bstrVal);
IMatchCollection2Ptr matches;
char t[20];
for( matches = pRegExp->Execute(sHTML); matches->Count>0; matches = pRegExp->Execute(sHTML) )
if( matches->Count>0 )
{
CString sValue = "";
CString sURL = "";
IMatch2Ptr match = matches->Item[0];
ISubMatchesPtr submatches = match->GetSubMatches();
vField = RegExps->GetCollect(_variant_t("nSubMutchIndex"));
if( submatches->Count >= vField.intVal)
{
int n = submatches->Count;
int m = vField.intVal;
vField = submatches->Item[vField.intVal];
sValue = vField.bstrVal;
if(sValue=="") continue;
unsigned short nPort;
DWORD nServiceType;
CString sServer;
CString sObject;
char c;
switch(nType)
{
case 1:
sURL = HTMLs->GetCollect(_variant_t("sURL")).bstrVal;
AfxParseURL(sURL, nServiceType, sServer, sObject, nPort);
sURL = sURL.Left(sURL.Find(sServer)+sServer.GetLength())+':'+itoa(nPort, t, 10)+sValue;
break;
case 2:
sURL = HTMLs->GetCollect(_variant_t("sURL")).bstrVal;
AfxParseURL(sURL, nServiceType, sServer, sObject, nPort);
c = sObject[sObject.GetLength()-1];
if( c!='\\' && c!='/' )
{
int n1=sObject.ReverseFind('/');
int n2=sObject.ReverseFind('\\');
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -