📄 cowebspider.cpp
字号:
n1 = max(n1, n2);
if(n1>-1) sObject = sObject.Left(n1);
}
if(sObject == "") sObject = "/";
if(sObject[0] != '/' && sObject[0] != '\\') sObject = '/'+sObject;
if(sObject[sObject.GetLength()-1] != '/' && sObject[sObject.GetLength()-1] != '\\') sObject += '/';
sURL = sURL.Left(sURL.Find(sServer)+sServer.GetLength())+':'+itoa(nPort, t, 10)+sObject+sValue;
break;
default: sURL=sValue;
}
}
else continue;
_RecordsetPtr Excepts;
bool b = true;
CString sId = itoa(HTMLs->GetCollect(_variant_t("nId")).intVal, t, 10);
Sleep(pFillURLsInfo->m_nPause);
if( pDatabase->Execute(_bstr_t("mp_GetLinksExcepts "+sId), &Excepts))
{
for(; b && !Excepts->adoEOF; Excepts->MoveNext() )
{
_variant_t vField = Excepts->GetCollect(_variant_t("sRegExp"));
if(vField.vt != VT_NULL)
{
pRegExp->put_IgnoreCase(VARIANT_TRUE);
pRegExp->put_Multiline(VARIANT_TRUE);
pRegExp->put_Pattern(SysAllocString(vField.bstrVal));
bool b1 = (pRegExp->Test(sURL.AllocSysString())==VARIANT_TRUE)?true:false;
bool b2 = (Excepts->GetCollect(_variant_t("bExcept")).boolVal==VARIANT_TRUE)?true:false;
b = (b1 != b2);
}
}
}
Excepts->Close();
Sleep(pFillURLsInfo->m_nPause);
HANDLE hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, pFillURLsInfo->m_sConnectionString+"StartStop" );
if( !hEvent&&WaitForSingleObject(hEvent, 1) ) throw(new CUserException());
if(b) pDatabase->ExecuteCommand(_bstr_t("mp_InsertURL '"+
sURL+"', "+itoa(nType, t, 10)+", "+sId+";"));
Sleep(pFillURLsInfo->m_nPause);
CString s = sHTML;
SysFreeString(sHTML);
sHTML = s.Mid(s.Find(sValue)+1).AllocSysString();
pRegExp->put_IgnoreCase(VARIANT_TRUE);
pRegExp->put_Multiline(VARIANT_TRUE);
pRegExp->put_Pattern(SysAllocString(RegExps->GetCollect(_variant_t("sRegExp")).bstrVal));
}
SysFreeString(sHTML);
Sleep(pFillURLsInfo->m_nPause);
pDatabase->ExecuteCommand(_bstr_t("mp_MarkAsParsed "+CString(itoa(HTMLs->GetCollect(_variant_t("nId")).intVal, t, 10))+";"));
}
}
}
}
HTMLs->Close();
}
RegExps->Close();
}
catch( _com_error e ) { }
catch(...) { }
pRegExp.Release();
pFillURLsInfo->m_bFillingStarted = false;
if( pDatabase )
{
pDatabase->CloseConnection();
delete pDatabase;
}
CoUninitialize();
AfxEndThread(0);
return 0;
}
/// Controller thread for one spider instance.
///
/// @param pParam  Cast to CMainThreadInfo*; its m_pWebSpider and
///                m_bstrConnectionString members are read.
/// @return Always 0 (AfxEndThread(0) is called just before the return).
///
/// The thread opens its own CADO connection, creates a manual-reset event named
/// "<connection string>StartStop" and opens the event named by the connection
/// string itself.  It then loops for as long as that second event handle is valid
/// and WaitForSingleObject() on it returns non-zero.  On every pass:
///  - if option "Status" is not "Stoped": option "RealStatus" is set to "Worked",
///    the start/stop event is reset, a FillURLs thread is started when there are
///    fewer untreated URLs than download slots, untreated URLs are handed to free
///    DownloadThread slots, the database connection is closed and reopened, and a
///    Search thread is started if none is flagged as running;
///  - otherwise the start/stop event is set and, once no slot holds a thread and
///    neither searching nor filling is flagged as running, "RealStatus" is set to
///    "Stoped".
/// All exceptions are swallowed so that the cleanup at the end always runs.
UINT MainThread( LPVOID pParam )
{
    CoInitialize(NULL);
    HANDLE hEvent = 0;
    HANDLE hStartStopEvent = 0;
    CADO* pDatabase = 0;
    try
    {
        CMainThreadInfo* pMainThreadInfo = (CMainThreadInfo*)pParam;
        CCoWebSpider* pWebSpider = (CCoWebSpider*)(pMainThreadInfo->m_pWebSpider);
        // Assign the function-scope pointer.  Redeclaring it here would shadow it,
        // leave the outer pointer null and turn the cleanup after the try block
        // into a no-op (connection never closed, object never freed).
        pDatabase = new CADO();
        pDatabase->OpenConnection(pMainThreadInfo->m_bstrConnectionString,_bstr_t(""),_bstr_t(""));
        CString sStartStopEvent = CString(pMainThreadInfo->m_bstrConnectionString)+"StartStop";
        // Manual-reset, initially signaled.
        hStartStopEvent = CreateEvent( NULL, TRUE, TRUE, sStartStopEvent);
        CInternetSession* pInetSession;
        CString sProxy = pWebSpider->GetOption("Proxy", pDatabase);
        if(sProxy != "")
            pInetSession = new CInternetSession("WebSpiderSession", 1, INTERNET_OPEN_TYPE_PROXY, sProxy);
        else pInetSession = new CInternetSession("WebSpiderSession",1, INTERNET_OPEN_TYPE_PRECONFIG);
        int nThreadsCount = atoi(pWebSpider->GetOption("Threads", pDatabase));
        // One slot per download thread; m_pThread == 0 marks a free slot.
        // NOTE(review): arThreads and pInetSession are never freed.  Their addresses
        // are handed to DownloadThread, so confirm thread lifetime before deleting.
        CDownloadInfo* arThreads = new CDownloadInfo[nThreadsCount];
        for(int i=0; i<nThreadsCount; i++) arThreads[i].m_pThread=0;
        int nTimeout = atoi(pWebSpider->GetOption("Timeout", pDatabase));
        int nPause = atoi(pWebSpider->GetOption("Pause", pDatabase));
        _RecordsetPtr URLs;
        CString s = CString(pMainThreadInfo->m_bstrConnectionString);
        // Event named after the connection string; the main loop ends once it is signaled.
        hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, s);
        bool bFillURLs = true;
        CFillURLsInfo* pFillURLsInfo = new CFillURLsInfo();
        pFillURLsInfo->m_pWebSpider = pWebSpider;
        pFillURLsInfo->m_nPause = nPause;
        pFillURLsInfo->m_sConnectionString = s;
        CSearchInfo* pSearchInfo = new CSearchInfo();
        pSearchInfo->m_pWebSpider = pWebSpider;
        pSearchInfo->m_nPause = nPause;
        pSearchInfo->m_sConnectionString = s;
        // WaitForSingleObject() returns 0 (WAIT_OBJECT_0) once the event is signaled.
        while(hEvent && WaitForSingleObject(hEvent, 1))
        {
            try
            {
                Sleep(pFillURLsInfo->m_nPause);
                if(pWebSpider->GetOption("Status", pDatabase)!="Stoped")
                {
                    pWebSpider->SetOption("RealStatus", "Worked", pDatabase);
                    ResetEvent(hStartStopEvent);
                    if( bFillURLs && !pFillURLsInfo->m_bFillingStarted )
                    {
                        // Start filling only when fewer URLs are pending than there are slots.
                        pDatabase->Execute(_bstr_t("mp_GetNotTreatedURLsCount"), &URLs);
                        if( nThreadsCount > URLs->GetCollect(_variant_t("nCount")).intVal )
                        {
                            pFillURLsInfo->m_bFillingStarted = true;
                            AfxBeginThread( FillURLs, pFillURLsInfo, THREAD_PRIORITY_NORMAL );
                        }
                    }
                    Sleep(pFillURLsInfo->m_nPause);
                    bFillURLs = true;
                    if( pDatabase->Execute(_bstr_t("mp_GetNotTreatedURLs"), &URLs))
                    {
                        for(; !URLs->adoEOF; URLs->MoveNext() )
                        {
                            // Find a slot for this URL: a free one, or one considered timed out.
                            int nCurrentEmptyThread = -1;
                            for(int i=0; i<nThreadsCount; i++ )
                                if(arThreads[i].m_pThread==0)
                                {
                                    nCurrentEmptyThread = i;
                                    break;
                                }
                                else
                                {
                                    CTime CurrentTime = CTime::GetCurrentTime();
                                    // NOTE(review): the start time is overwritten with the
                                    // current time right before being compared against it,
                                    // so the difference below is always 0.  Kept as in the
                                    // original; confirm the intended timeout logic.
                                    arThreads[i].m_nStartTime = (long)ceil(CurrentTime.GetTime()/1000);
                                    if(arThreads[i].m_nStartTime &&
                                        (CurrentTime.GetTime()/1000-arThreads[i].m_nStartTime) > nTimeout)
                                    {
                                        nCurrentEmptyThread = i;
                                        break;
                                    }
                                }
                            if(nCurrentEmptyThread != -1)
                            {
                                _variant_t vField = URLs->GetCollect(_variant_t("sURL"));
                                if(vField.vt != VT_NULL)
                                {
                                    arThreads[nCurrentEmptyThread].m_nId = URLs->GetCollect(_variant_t("nId")).intVal;
                                    arThreads[nCurrentEmptyThread].m_sURL = vField.bstrVal;
                                    arThreads[nCurrentEmptyThread].m_pWebSpider = pMainThreadInfo->m_pWebSpider;
                                    arThreads[nCurrentEmptyThread].m_sConnectionString = pSearchInfo->m_sConnectionString;
                                    arThreads[nCurrentEmptyThread].m_pInetSession = pInetSession;
                                    arThreads[nCurrentEmptyThread].m_pThread = AfxBeginThread( DownloadThread, &arThreads[nCurrentEmptyThread], THREAD_PRIORITY_LOWEST );
                                    char t[20];
                                    pDatabase->ExecuteCommand(_bstr_t("mp_MarkAsTreated '"+CString(itoa(URLs->GetCollect(_variant_t("nId")).intVal, t, 10))+"';"));
                                }
                            }
                            // No slot available: do not start another fill on the next pass.
                            else bFillURLs = false;
                        }
                        URLs->Close();
                        // Close and reopen the connection with the same connection string.
                        pDatabase->CloseConnection();
                        Sleep(pFillURLsInfo->m_nPause);
                        pDatabase->OpenConnection(pDatabase->m_Connection->ConnectionString,_bstr_t(""),_bstr_t(""));
                        if( !pSearchInfo->m_bSearchStarted )
                        {
                            pSearchInfo->m_bSearchStarted = true;
                            AfxBeginThread( Search, pSearchInfo, THREAD_PRIORITY_NORMAL);
                        }
                    }
                }
                else
                {
                    SetEvent(hStartStopEvent);
                    // Index of the first occupied slot; equals nThreadsCount when every
                    // slot is free.  Declared outside the for statement so it is still
                    // in scope for the test below under standard C++ scoping rules.
                    int nFirstBusy = 0;
                    for(; nFirstBusy<nThreadsCount; nFirstBusy++ )
                        if(arThreads[nFirstBusy].m_pThread!=0) break;
                    if( nFirstBusy==nThreadsCount && !pSearchInfo->m_bSearchStarted && !pFillURLsInfo->m_bFillingStarted )
                        pWebSpider->SetOption("RealStatus", "Stoped", pDatabase);
                }
                Sleep(nPause);
            }
            // A failed pass is deliberately ignored; the loop retries on the next pass.
            catch( const _com_error& ) { }
            catch(...) { }
        };
        delete pSearchInfo;
        delete pFillURLsInfo;
    }
    catch( const _com_error& ) { }
    catch(...) { }
    if(pDatabase)
    {
        // The connection may already be closed (or never opened) if an exception
        // interrupted the code above, so a failure to close must not escape the thread.
        try { pDatabase->CloseConnection(); }
        catch(...) { }
        delete pDatabase;
    }
    // NOTE(review): hStartStopEvent is not closed here.  Other code in this file opens
    // the "...StartStop" event by name, so confirm its required lifetime before adding
    // CloseHandle(hStartStopEvent).
    if(hEvent) CloseHandle(hEvent);
    CoUninitialize();
    AfxEndThread(0);
    return 0;
}
/// Reads one option value through the mp_GetOption stored procedure.
///
/// @param sName      Option name; embedded single quotes are doubled before the
///                   name is placed inside the SQL string literal.
/// @param pDatabase  Open connection wrapper used to run the query.
/// @return The "sValue" field of the first returned row, or an empty string when
///         the query fails, returns no row, or the field is NULL.
CString CCoWebSpider::GetOption(CString sName, CADO* pDatabase)
{
    _RecordsetPtr Options;
    CString sResult;
    sName.Replace("'", "''");
    if( pDatabase->Execute(_bstr_t("mp_GetOption '"+sName+"'"), &Options))
    {
        // An empty recordset has no current row to read a field from.
        if( !Options->adoEOF )
        {
            _variant_t vField = Options->GetCollect(_variant_t("sValue"));
            if(vField.vt != VT_NULL) sResult = vField.bstrVal;
        }
        // Close on every successful path (the value path used to return early and
        // skip this), and only when Execute() actually produced a recordset.
        Options->Close();
    }
    return sResult;
}
/// Stores an option value through the mp_SetOption stored procedure.
///
/// @param sName      Option name.
/// @param sValue     New option value.
/// @param pDatabase  Connection wrapper used to execute the command.
///
/// Single quotes in both strings are doubled so that each can be embedded in a
/// single-quoted SQL literal.
void CCoWebSpider::SetOption(CString sName, CString sValue, CADO* pDatabase)
{
    sValue.Replace("'", "''");
    sName.Replace("'", "''");

    // Assemble: mp_SetOption '<name>','<value>';
    CString sCommand("mp_SetOption '");
    sCommand += sName;
    sCommand += "','";
    sCommand += sValue;
    sCommand += "';";

    pDatabase->ExecuteCommand(_bstr_t(sCommand));
}
/// Starts a spider for the given connection string unless IsExists() reports
/// that one is already running.
///
/// @param bstrConnectionString  Connection string; it is also used as the name of
///                              the event that marks the spider as running.
/// @return Always S_OK.
STDMETHODIMP CCoWebSpider::Begin(BSTR *bstrConnectionString)
{
    AFX_MANAGE_STATE(AfxGetStaticModuleState())

    VARIANT_BOOL bAlreadyRunning;
    IsExists(bstrConnectionString, &bAlreadyRunning);
    if(bAlreadyRunning != VARIANT_FALSE)
        return S_OK;

    // Create (or open) the manual-reset event named after the connection string
    // and leave it non-signaled.  The handle is not closed here.
    const CString sEventName(*bstrConnectionString);
    HANDLE hRunEvent = CreateEvent( NULL, TRUE, TRUE, sEventName );
    ResetEvent(hRunEvent);

    // The controller thread receives its own copy of the connection string.
    CMainThreadInfo* pThreadInfo = new CMainThreadInfo( SysAllocString(*bstrConnectionString), this );
    AfxBeginThread( MainThread, pThreadInfo, THREAD_PRIORITY_NORMAL );

    return S_OK;
}
/// Asks the spider bound to the given connection string to stop.
///
/// @param bstrConnectionString  Connection string; it names the event to signal.
/// @return Always S_OK.
///
/// If IsExists() reports a running spider, the event named after the connection
/// string is signaled and IsExists() is polled until it reports VARIANT_FALSE.
STDMETHODIMP CCoWebSpider::Finish(BSTR *bstrConnectionString)
{
    AFX_MANAGE_STATE(AfxGetStaticModuleState())
    VARIANT_BOOL bVal;
    IsExists(bstrConnectionString, &bVal);
    if(bVal==VARIANT_TRUE)
    {
        CString s = CString(*bstrConnectionString);
        HANDLE hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, s);
        if(hEvent)
        {
            SetEvent(hEvent);
            // This handle was opened only to signal the event; release it
            // instead of leaking one handle per call.
            CloseHandle(hEvent);
        }
        do IsExists(bstrConnectionString, &bVal); while (bVal==VARIANT_TRUE);
    }
    return S_OK;
}
/// Reports whether a spider is marked as running for the given connection string.
///
/// @param bstrConnectionString  Connection string; it names the event to probe.
/// @param bVal                  Receives VARIANT_TRUE when the named event can be
///                              opened and a 1 ms wait on it returns non-zero
///                              (i.e. it is not signaled); VARIANT_FALSE otherwise.
/// @return Always S_OK.
STDMETHODIMP CCoWebSpider::IsExists(BSTR *bstrConnectionString, VARIANT_BOOL *bVal)
{
    AFX_MANAGE_STATE(AfxGetStaticModuleState())
    CString s = CString(*bstrConnectionString);
    HANDLE hEvent = OpenEvent( EVENT_ALL_ACCESS, TRUE, s);
    *bVal = VARIANT_FALSE;
    if(hEvent)
    {
        if(WaitForSingleObject(hEvent, 1)) *bVal = VARIANT_TRUE;
        // The handle is needed only for this probe; closing it stops every call
        // from leaking a handle.
        CloseHandle(hEvent);
    }
    return S_OK;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -