📄 spidercore.cpp
字号:
int SpiderCore::StartupThread(string thread_name){ for(vector<geturlParam *>::iterator it=thread_info.begin(); it != thread_info.end(); it++) { if((*it)->thread_name == thread_name) { geturlParam *lp = (geturlParam *) new geturlParam; lp->thread_number = (*it)->thread_number; lp->thread_name = (*it)->thread_name; lp->mapname = (*it)->mapname; lp->table = (*it)->mapname; lp->sc = (*it)->sc; lp->url_flag = (*it)->url_flag; lp->num_per_time = FETCH_NUM_DATA_PER_TIME; lp->pevent_handle = (*it)->pevent_handle; lp->pFunc = (*it)->pFunc; this->DoCreateThread((void *)lp); return 0; } } cout << "Sorry,can not find such thread:" << thread_name << ", please choose verify it" << endl; return 1;}int SpiderCore::CancelThread(string thread_name){ for(vector<geturlParam *>::iterator it=thread_info.begin(); it != thread_info.end(); it++) { if((*it)->thread_name == thread_name) { int ret = 0; ret = pthread_cancel(*((*it)->pthread_handle)); if(ret==0) { cout << "cancel succeed!" << endl; return 0; } } } cout << "can not find such thread:" << thread_name << endl; return 1;}int SpiderCore::ListRunningThreads(){ cout << "thread_name, thread_number\n" << endl; for (map<string, unsigned int>::iterator it=thread_number.begin(); it != thread_number.end(); it++) { cout << it->first << "," << it->second << endl; } cout << endl; return 0;}int SpiderCore::ListStartUpThreads(){ cout << "thread_name, thread_number\n" << endl; for(vector<geturlParam *>::iterator it=thread_info.begin(); it != thread_info.end(); it++) { cout << (*it)->thread_name << "," << (*it)->thread_number << endl; } cout << endl; return 0;}int SpiderCore::ThreadControleCenter(string action){ if(action=="help") { cout << "------------------" << endl; cout << "PID:" << getpid() << endl; cout << "type the words below:" << endl; cout << "liststartup list start up threads, some maybe end up" << endl; cout << "listrunning list the running threads" << endl; cout << "stop end up the spider program" << endl; cout << "startthread start up a thread" << endl; cout << "cancelthread end up a thread" << endl; cout << "help show the actions you could do" << endl; cout << "------------------" << endl; }else if(action=="startthread"){ string name=""; cout << "please enter the thread name:" << endl; cin >> name; if(name.size()>0) { this->StartupThread(name); } }else if(action=="cancelthread"){ string name=""; cout << "please enter the thread name:" << endl; cin >> name; if(name.size()>0) { this->CancelThread(name); } }else if(action == "stop"){ SPIDER_STOP = true; }else if(action == "listrunning"){ this->ListRunningThreads(); }else if(action == "liststartup"){ this->ListStartUpThreads(); }else{ cout << "Unknown Option:" << action << endl; } return 0;}/* *Thread Controle above */void SpiderCore::StartSpiderThreads(){ vector<string> nodes; string content = ""; string subContent = ""; string mapname = "main"; int ret = 0; unsigned int depth = 1; typedef multimap<int, string>::iterator multimap_it; unsigned int handle_i = 0; geturlParam* plparamThread = 0; pthread_t *phandle = 0; unsigned int t_number; pthread_cond_t *pEhandle = 0; pthread_mutex_t *pEMutex = 0; pthread_mutex_init(&count_lock,NULL); fo.WriteToFileLn(spider_log_file, "", 0); fo.WriteToFileLn(spider_insert_error_log_file, "", 0); string logstr = "the program start..."; cout << logstr << endl; fo.WriteToFileLn(spider_log_file, logstr, 1); //开始遍历带正则的节点 while (depth <= this->depth) { typedef multimap<int, string>::iterator mapname_it; mapname_it sta = this->regex_mapname.lower_bound(depth),end = this->regex_mapname.upper_bound(depth); //sta->first:multimap iterator first , number type //sta->second:.. .. second, string type, the map regex node while (sta != end) { map<string, string>::iterator map_it; map_it = this->config_map.find(sta->second + "_childnum");//map_it childnum node mapname = sta->second; if (map_it == this->config_map.end()) { string logstr = "In function StartSpiderThreads:can not find:" + sta->second + "_childnum"; fo.WriteToFileLn(spider_log_file, logstr, 1); return;//can not find mapname } if (map_it->second != "0") //有子节点 { //配置抓链接线程参数 //创建事件对象 //sta++;continue; string mapname_url_tmp = mapname+"_url"; pEhandle = (pthread_cond_t *) new pthread_cond_t; pthread_cond_init(pEhandle, NULL); thread_event.insert(make_pair(mapname_url_tmp, pEhandle)); pEMutex = (pthread_mutex_t *) new pthread_mutex_t; pthread_mutex_init(pEMutex, NULL); thread_event_lock.insert(make_pair(mapname_url_tmp, pEMutex)); t_number = Functions::PowUInt(2, handle_i); plparamThread = ( geturlParam* )new geturlParam; //set up param for geturl plparamThread->thread_number = t_number; plparamThread->thread_name = mapname_url_tmp; plparamThread->mapname = mapname; plparamThread->table = mapname; plparamThread->sc = this; plparamThread->url_flag = "0"; plparamThread->num_per_time = FETCH_NUM_DATA_PER_TIME; plparamThread->pevent_handle = pEhandle; //启动抓首页链接线程 phandle = (pthread_t *)new pthread_t; if (depth == 1) { plparamThread->pFunc = FunGetMainURLProc; ret = pthread_create(phandle, NULL, FunGetMainURLProc, (void *)plparamThread); } else { plparamThread->pFunc = FunGetSubURLProc; ret = pthread_create(phandle, NULL, FunGetSubURLProc, (void *)plparamThread); } if (ret) { string logstr = "In function StartSpiderThreads:CreateThread " + mapname + "_url" + "failed!"; fo.WriteToFileLn(spider_log_file, logstr, 1); } plparamThread->pthread_handle = phandle; thread_number.insert(make_pair(mapname+"_url", t_number)); thread_info.push_back(plparamThread); handle_i++; //配置抓内容链接参数 t_number = Functions::PowUInt(2, handle_i); //创建事件对象 string mapname_content_tmp = mapname + "_content"; pEhandle = (pthread_cond_t *) new pthread_cond_t; pthread_cond_init(pEhandle, NULL); thread_event.insert(make_pair(mapname_content_tmp, pEhandle)); pEMutex = (pthread_mutex_t *) new pthread_mutex_t; pthread_mutex_init(pEMutex, NULL); thread_event_lock.insert(make_pair(mapname_content_tmp, pEMutex)); plparamThread = ( geturlParam* )new geturlParam; //set up param for geturl plparamThread->thread_number = t_number; plparamThread->thread_name = mapname_content_tmp; plparamThread->mapname = mapname; plparamThread->table = mapname; plparamThread->sc = this; plparamThread->url_flag="0"; plparamThread->num_per_time=FETCH_NUM_DATA_PER_TIME; plparamThread->pevent_handle = pEhandle; plparamThread->pFunc = FunGetContentURLProc; //启动抓内容链接线程 phandle = (pthread_t *)new pthread_t; ret=pthread_create(phandle, NULL, FunGetContentURLProc, (void *)plparamThread); if (ret) { string logstr="In function StartSpiderThreads:CreateThread " + mapname_content_tmp + "failed!"; fo.WriteToFileLn(spider_log_file, logstr, 1); } plparamThread->pthread_handle = phandle; thread_number.insert(make_pair(mapname+"_content", t_number)); thread_info.push_back(plparamThread); handle_i++; } else { //内容抓取线程从数据库中读取资料时对互斥操作 pEMutex = (pthread_mutex_t *) new pthread_mutex_t; pthread_mutex_init(pEMutex, NULL); thread_event_lock.insert(make_pair(mapname, pEMutex)); for (int content_thread_i=0; content_thread_i<FETCH_CONTENT_THREADS_NUM; content_thread_i++) { t_number = Functions::PowUInt(2, handle_i); plparamThread = ( geturlParam* )new geturlParam; //set up param for geturl plparamThread->thread_number = t_number; plparamThread->thread_name = mapname; plparamThread->mapname = mapname; plparamThread->table = mapname; plparamThread->sc = this; plparamThread->url_flag = "1"; plparamThread->num_per_time = FETCH_NUM_DATA_PER_TIME; plparamThread->pevent_handle = NULL; plparamThread->pFunc = FunGetContentProc; plparamThread->thread_type = 1; phandle = (pthread_t *)new pthread_t; ret = pthread_create(phandle, NULL, FunGetContentProc, (void *)plparamThread); if (ret) { string logstr = "In function StartSpiderThreads:CreateThread " + mapname + "failed!"; fo.WriteToFileLn(spider_log_file, logstr, 1); } plparamThread->pthread_handle = phandle; thread_number.insert(make_pair(mapname, t_number)); thread_info.push_back(plparamThread); } handle_i++; } sta++; } depth++; } string action=""; while(1) { cout << "you could enter words, type 'help' to get the detal" << endl; cin >> action; if(action == "quit" || action == "q") { break; } if(action.size()>0) { this->ThreadControleCenter(action); } action = ""; } cout << "you have typed '" << action << "',to quit navigation" << endl; while (1) { if (thread_number.empty()) { string logstr="the program finilished the target and exit!"; cout << logstr << endl; fo.WriteToFileLn(spider_log_file, logstr, 1); break; } sleep(5); } //释放线程句柄开辟的空间 for(vector<geturlParam *>::iterator it = thread_info.begin(); it != thread_info.end(); it++) { delete (*it); } //释放条件变量开辟的空间 for(map<string, pthread_cond_t *>::iterator it = thread_event.begin(); it != thread_event.end(); it++) { delete it->second; } //释放互斥锁开辟的空间 for(map<string, pthread_mutex_t *>::iterator it = thread_event_lock.begin(); it != thread_event_lock.end(); it++) { pthread_mutex_destroy(it->second); delete it->second; }}void SpiderCore::print_config_map(){ map<string, string>::iterator it; string context=""; fo.WriteToFileLn("spider_config_map.txt", "config_map", 0); for (it=this->config_map.begin();it !=this->config_map.end(); it++) { context = it->first + ":" + it->second; fo.WriteToFileLn("spider_config_map.txt", context, 1); cout << context << endl; }}void SpiderCore::print_spider_regex_node_tree(){ multimap<int, string>::iterator it; char depth[12]={0}; string context=""; fo.WriteToFileLn("spider_regex_node_tree.txt", "spider_regex_node_tree", 0); for (it=this->r
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -