📄 spidercore.cpp
字号:
#define FETCH_NUM_DATA_PER_TIME 10#define FETCH_CONTENT_THREADS_NUM 20#include "spider.h"FileOperator fo;Functions func;bool SPIDER_STOP = false;struct geturlParam{public: geturlParam() { thread_name=""; mapname=""; table=""; sc=NULL; num_per_time=0; thread_type = 0; url_flag=""; pevent_handle=NULL; pthread_handle=NULL; pFunc = NULL; } unsigned int thread_number; string thread_name; int thread_type; string mapname; string table; SpiderCore *sc; int num_per_time; string url_flag;//数据库中的url是否被读过 pthread_cond_t *pevent_handle; pthread_t *pthread_handle; void *(*pFunc)(void *param2);};//线程信息vector<geturlParam *> thread_info;//同步对象,对thread_number进行操作时加锁pthread_mutex_t count_lock;//线程mapname和线程numbermap<string, unsigned int> thread_number;//日志文件string spider_log_file="SpiderLog.txt";string spider_insert_error_log_file = "SpiderDatabaseLog.txt";//配置脚本文件名string spider_config_file = "spider.ini";map<string, pthread_cond_t *> thread_event;map<string, pthread_mutex_t *> thread_event_lock;SpiderCore::SpiderCore(){ this->stop = false; this->domain = ""; this->ipaddr = ""; this->path = ""; this->depth = 0; this->port = 0; this->encode = ""; this->db_host = ""; this->db_user = ""; this->db_pwd = ""; this->db_name = ""; this->stop = false;}SpiderCore::~SpiderCore(){}int SpiderCore::LoadConfigFile(){ ifstream config("spider.ini"); string line(""); string::size_type pos=0; string name(""); string value(""); multimap<int, string> regex_name; string logstr=""; unsigned int depth=1; unsigned int *pdepth=&depth; if (!config.good()) { logstr="In function LoadConfigFile:open file error!"; fo.WriteToFileLn(spider_log_file, logstr, 1); return 1; } //文件打开出错,可能文件不存在 while (getline(config, line)) { if(line.substr(0,1)=="#") { continue; } pos = line.find("="); name = line.substr(0, pos); value = line.substr(pos+1); if (name=="domain") { this->domain = value; } else if (name=="ipaddr") { this->ipaddr = value; } else if (name=="path") { this->path = value; } else if (name=="port") { this->port = atoi(value.c_str()); } else if (name=="encode") { this->encode=value; } else if (name=="db_host") { this->db_host=value; } else if (name=="db_user") { this->db_user=value; } else if (name=="db_pwd") { this->db_pwd=value; } else if (name=="db_name") { this->db_name=value; } else { this->config_map.insert(make_pair(name, value)); } } size_t map_rs = this->config_map.count("main"); if (!map_rs) { logstr="In function LoadConfigFile:can not find main node in config file(spider.ini)!"; fo.WriteToFileLn(spider_log_file, logstr, 1); return 6; } else if (this->domain.empty()) { logstr="In function LoadConfigFile:can not find website domain in config file(spider.ini)!"; fo.WriteToFileLn(spider_log_file, logstr, 1); return 2; } else if (this->ipaddr.empty()) { logstr="In function LoadConfigFile:can not find website ipaddr node in config file(spider.ini)!"; fo.WriteToFileLn(spider_log_file, logstr, 1); return 3; } else if (this->path.empty()) { logstr="In function LoadConfigFile:can not find website first path in config file(spider.ini)!"; fo.WriteToFileLn(spider_log_file, logstr, 1); return 4; } else if (this->port==0) { logstr="In function LoadConfigFile:can not find website port in config file(spider.ini)!"; fo.WriteToFileLn(spider_log_file, logstr, 1); return 5; } else if (this->db_name.empty()) { logstr="In function LoadConfigFile:can not find database name in config file(spider.ini)!"; fo.WriteToFileLn(spider_log_file, logstr, 1); return 6; } else if (this->db_host.empty()) { logstr="In function LoadConfigFile:can not find database host in config file(spider.ini)!"; fo.WriteToFileLn(spider_log_file, logstr, 1); return 7; } else if (this->db_user.empty()) { logstr="In function LoadConfigFile:can not find database user in config file(spider.ini)!"; fo.WriteToFileLn(spider_log_file, logstr, 1); return 8; } this->depth = *pdepth; int analytics_res=analytics("main", "", regex_name, pdepth); if (analytics_res==100) { return 9; } return 0;}//对配置文件中带有正则表达式的节点,同一深度组成一个数组int SpiderCore::analytics(string map_name, string parent, multimap<int, string> &mmap, unsigned int *depth){ int child =0; char num[12]={0}; //search map_name map<string, string>::iterator it=this->config_map.find(map_name); string logstr=""; //检查节点是否存在 if (it==this->config_map.end()) { logstr="In function analytics:can not find map node " + map_name + " in config file(spider.ini)!"; cout << logstr << endl; fo.WriteToFileLn(spider_log_file, logstr, 1); return 100;//can not find map_name } //检查子节点数目是否存在 int sub_num=0; int i=0; map<string, string>::iterator it_subnum=this->config_map.find(map_name + "_subnum"); if (it_subnum==this->config_map.end()) { logstr="In function analytics:can not find map node " + it_subnum->first + " in config file(spider.ini)!"; cout << logstr << endl; fo.WriteToFileLn(spider_log_file, logstr, 1); return 100;//can not find map_name } sub_num = atoi(it_subnum->second.c_str()); //开始搜索子表达式的配置 while (++i<=sub_num) { sprintf( num, "%d", i ); string sub_name = map_name + "_" + num; //检查子表达式配置是否存在 map<string, string>::iterator subit=this->config_map.find( sub_name ); if (subit==this->config_map.end()) { logstr="In function analytics:can not find sub node " + sub_name + " in config file(spider.ini)!"; cout << logstr << endl; fo.WriteToFileLn(spider_log_file, logstr, 1); return 100;//can not find map_name } //检查子表达式是否需要需要抓取 if (func.getString(subit->second, 1)=="1") { (*depth)++; child++; //需要抓取,递归调用analytics函数 int res = analytics(func.getString(subit->second, 2), it->first, mmap, depth); if (res==100) { return 100; } (*depth)--; } //检查子表达式是否需要请求新的页面 else if(func.getString(subit->second, 5)=="1") { //检查请求页面的节点是否存在 string socket_map = func.getString(subit->second,6); if(this->config_map.find(socket_map)==this->config_map.end()) { logstr="In function analytics:can not find map node " + socket_map + " in config file(spider.ini)!"; cout << logstr << endl; fo.WriteToFileLn(spider_log_file, logstr, 1); return 100;//can not find map_name } //检查子结点数目节点是否存在 map<string, string>::iterator socket_it_subnum=this->config_map.find(socket_map + "_subnum"); if (socket_it_subnum==this->config_map.end()) { logstr="In function analytics_map_extra:can not find map node " + socket_it_subnum->first + " in config file(spider.ini)!"; cout << logstr << endl; fo.WriteToFileLn(spider_log_file, logstr, 1); return 100;//can not find map_name } int socket_it_subnum_i=0; //依次检查子节点是否存在 while(++socket_it_subnum_i <= atoi(socket_it_subnum->second.c_str())) { sprintf(num,"%d",socket_it_subnum_i); string socket_map_sub = socket_map; socket_map_sub.append("_"); socket_map_sub.append(num); //检查子节点是否存在 if(this->config_map.find(socket_map_sub) == this->config_map.end()) { string logstr="In function analytics:can not find map node " + socket_map_sub + " in config file(spider.ini)!"; cout << logstr << endl; fo.WriteToFileLn(spider_log_file, logstr, 1); return 100;//can not find map_name } } //检查请求另外页面节点的额外节点是否存在 if(this->analytics_map_extra(map_name)==100) { return 100; } } mmap.insert(make_pair((*depth), sub_name)); } //检查节点的额外节点是否存在 if(this->analytics_map_extra(map_name)==100) { return 100; } //itoa(sub_num, num, 10); sprintf(num, "%d", sub_num); string snum=num; //add sub regex total number this->config_map.insert(make_pair(map_name + "_subnum", snum)); //add parent node this->config_map.insert(make_pair(map_name + "_parent", parent)); sprintf(num, "%d", child); //itoa(child, num, 10); snum=num; this->config_map.insert(make_pair(map_name + "_childnum", snum));//add child total number sprintf(num, "%d", *depth); //itoa(*depth, num, 10); snum = num; this->config_map.insert(make_pair(map_name + "_level", snum));//增加节点的级数 this->regex_mapname.insert(make_pair((*depth), map_name)); //deeper the depth of this config if (this->depth < (*depth)) { this->depth = (*depth); } return 0;}//分析节点的额外节点//mapname 节点名称int SpiderCore::analytics_map_extra(string mapname){ int i=1; char num[12]={0}; string extra_mapname = mapname + "_extra_"; //依次检查额外节点是否存在,如果不存在则退出,此循环结束,函数退出 while(true) { sprintf(num, "%d", i); extra_mapname.append(num); //检查额外节点是否存在 map<string, string>::iterator it=this->config_map.find(extra_mapname); if (it==this->config_map.end()) { break;//end extra map } //检查额外节点的子表达数目节点是否存在 map<string, string>::iterator it_subnum=this->config_map.find(extra_mapname + "_subnum"); if (it_subnum==this->config_map.end()) { string logstr="In function analytics_map_extra:can not find map node " + it_subnum->first + " in config file(spider.ini)!"; cout << logstr << endl;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -