⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 spidercore.cpp

📁 功能强大的网络蜘蛛软件
💻 CPP
📖 第 1 页 / 共 5 页
字号:
#define FETCH_NUM_DATA_PER_TIME 10#define FETCH_CONTENT_THREADS_NUM 20#include "spider.h"FileOperator fo;Functions func;bool SPIDER_STOP = false;struct geturlParam{public:    geturlParam()    {        thread_name="";        mapname="";        table="";        sc=NULL;        num_per_time=0;        thread_type = 0;        url_flag="";        pevent_handle=NULL;        pthread_handle=NULL;        pFunc = NULL;    }    unsigned int thread_number;    string thread_name;    int thread_type;    string mapname;    string table;    SpiderCore *sc;    int num_per_time;    string url_flag;//数据库中的url是否被读过    pthread_cond_t *pevent_handle;    pthread_t *pthread_handle;    void *(*pFunc)(void *param2);};//线程信息vector<geturlParam *> thread_info;//同步对象,对thread_number进行操作时加锁pthread_mutex_t count_lock;//线程mapname和线程numbermap<string, unsigned int> thread_number;//日志文件string spider_log_file="SpiderLog.txt";string spider_insert_error_log_file = "SpiderDatabaseLog.txt";//配置脚本文件名string spider_config_file = "spider.ini";map<string, pthread_cond_t *> thread_event;map<string, pthread_mutex_t *> thread_event_lock;SpiderCore::SpiderCore(){    this->stop = false;    this->domain = "";    this->ipaddr = "";    this->path = "";    this->depth = 0;    this->port = 0;    this->encode = "";    this->db_host = "";    this->db_user = "";    this->db_pwd = "";    this->db_name = "";    this->stop = false;}SpiderCore::~SpiderCore(){}int SpiderCore::LoadConfigFile(){    ifstream config("spider.ini");    string line("");    string::size_type pos=0;    string name("");    string value("");    multimap<int, string> regex_name;    string logstr="";    unsigned int depth=1;    unsigned int *pdepth=&depth;    if (!config.good())    {        logstr="In function LoadConfigFile:open file error!";        fo.WriteToFileLn(spider_log_file, logstr, 1);        return 1;    }    //文件打开出错,可能文件不存在    while (getline(config, line))    {        if(line.substr(0,1)=="#")        {            continue;        }        pos = line.find("=");        name = line.substr(0, pos);        value = line.substr(pos+1);        if (name=="domain")        {            this->domain = value;        }        else if (name=="ipaddr")        {            this->ipaddr = value;        }        else if (name=="path")        {            this->path = value;        }        else if (name=="port")        {            this->port = atoi(value.c_str());        }        else if (name=="encode")        {            this->encode=value;        }        else if (name=="db_host")        {            this->db_host=value;        }        else if (name=="db_user")        {            this->db_user=value;        }        else if (name=="db_pwd")        {            this->db_pwd=value;        }        else if (name=="db_name")        {            this->db_name=value;        }        else        {            this->config_map.insert(make_pair(name, value));        }    }    size_t map_rs = this->config_map.count("main");    if (!map_rs)    {        logstr="In function LoadConfigFile:can not find main node in config file(spider.ini)!";        fo.WriteToFileLn(spider_log_file, logstr, 1);        return 6;    }    else if (this->domain.empty())    {        logstr="In function LoadConfigFile:can not find website domain in config file(spider.ini)!";        fo.WriteToFileLn(spider_log_file, logstr, 1);        return 2;    }    else if (this->ipaddr.empty())    {        logstr="In function LoadConfigFile:can not find website ipaddr node in config file(spider.ini)!";        fo.WriteToFileLn(spider_log_file, logstr, 1);        return 3;    }    else if (this->path.empty())    {        logstr="In function LoadConfigFile:can not find website first path in config file(spider.ini)!";        fo.WriteToFileLn(spider_log_file, logstr, 1);        return 4;    }    else if (this->port==0)    {        logstr="In function LoadConfigFile:can not find website port in config file(spider.ini)!";        fo.WriteToFileLn(spider_log_file, logstr, 1);        return 5;    }    else if (this->db_name.empty())    {        logstr="In function LoadConfigFile:can not find database name in config file(spider.ini)!";        fo.WriteToFileLn(spider_log_file, logstr, 1);        return 6;    }    else if (this->db_host.empty())    {        logstr="In function LoadConfigFile:can not find database host in config file(spider.ini)!";        fo.WriteToFileLn(spider_log_file, logstr, 1);        return 7;    }    else if (this->db_user.empty())    {        logstr="In function LoadConfigFile:can not find database user in config file(spider.ini)!";        fo.WriteToFileLn(spider_log_file, logstr, 1);        return 8;    }    this->depth = *pdepth;    int analytics_res=analytics("main", "", regex_name, pdepth);    if (analytics_res==100)    {        return 9;    }    return 0;}//对配置文件中带有正则表达式的节点,同一深度组成一个数组int SpiderCore::analytics(string map_name, string parent, multimap<int, string> &mmap, unsigned int *depth){    int child =0;    char num[12]={0};    //search map_name    map<string, string>::iterator it=this->config_map.find(map_name);    string logstr="";    //检查节点是否存在    if (it==this->config_map.end())    {        logstr="In function analytics:can not find map node " + map_name + " in config file(spider.ini)!";        cout << logstr << endl;        fo.WriteToFileLn(spider_log_file, logstr, 1);        return 100;//can not find map_name    }    //检查子节点数目是否存在    int sub_num=0;    int i=0;    map<string, string>::iterator it_subnum=this->config_map.find(map_name + "_subnum");    if (it_subnum==this->config_map.end())    {        logstr="In function analytics:can not find map node " + it_subnum->first + " in config file(spider.ini)!";        cout << logstr << endl;        fo.WriteToFileLn(spider_log_file, logstr, 1);        return 100;//can not find map_name    }    sub_num = atoi(it_subnum->second.c_str());    //开始搜索子表达式的配置    while (++i<=sub_num)    {        sprintf( num, "%d", i );        string sub_name = map_name + "_" + num;        //检查子表达式配置是否存在        map<string, string>::iterator subit=this->config_map.find( sub_name );        if (subit==this->config_map.end())        {            logstr="In function analytics:can not find sub node " + sub_name + " in config file(spider.ini)!";            cout << logstr << endl;            fo.WriteToFileLn(spider_log_file, logstr, 1);            return 100;//can not find map_name        }        //检查子表达式是否需要需要抓取        if (func.getString(subit->second, 1)=="1")        {            (*depth)++;            child++;            //需要抓取,递归调用analytics函数            int res = analytics(func.getString(subit->second, 2), it->first, mmap, depth);            if (res==100)            {                return 100;            }            (*depth)--;        }        //检查子表达式是否需要请求新的页面        else if(func.getString(subit->second, 5)=="1")        {            //检查请求页面的节点是否存在            string socket_map = func.getString(subit->second,6);            if(this->config_map.find(socket_map)==this->config_map.end())            {                logstr="In function analytics:can not find map node " + socket_map + " in config file(spider.ini)!";                cout << logstr << endl;                fo.WriteToFileLn(spider_log_file, logstr, 1);                return 100;//can not find map_name            }            //检查子结点数目节点是否存在            map<string, string>::iterator socket_it_subnum=this->config_map.find(socket_map + "_subnum");            if (socket_it_subnum==this->config_map.end())            {                logstr="In function analytics_map_extra:can not find map node " + socket_it_subnum->first + " in config file(spider.ini)!";                cout << logstr << endl;                fo.WriteToFileLn(spider_log_file, logstr, 1);                return 100;//can not find map_name            }            int socket_it_subnum_i=0;            //依次检查子节点是否存在            while(++socket_it_subnum_i <= atoi(socket_it_subnum->second.c_str()))            {                sprintf(num,"%d",socket_it_subnum_i);                string socket_map_sub = socket_map;                socket_map_sub.append("_");                socket_map_sub.append(num);                //检查子节点是否存在                if(this->config_map.find(socket_map_sub) == this->config_map.end())                {                    string logstr="In function analytics:can not find map node " + socket_map_sub + " in config file(spider.ini)!";                    cout << logstr << endl;                    fo.WriteToFileLn(spider_log_file, logstr, 1);                    return 100;//can not find map_name                }            }            //检查请求另外页面节点的额外节点是否存在            if(this->analytics_map_extra(map_name)==100)            {                return 100;            }        }        mmap.insert(make_pair((*depth), sub_name));    }    //检查节点的额外节点是否存在    if(this->analytics_map_extra(map_name)==100)    {        return 100;    }    //itoa(sub_num, num, 10);    sprintf(num, "%d", sub_num);    string snum=num;    //add sub regex total number    this->config_map.insert(make_pair(map_name + "_subnum", snum));    //add parent node    this->config_map.insert(make_pair(map_name + "_parent", parent));    sprintf(num, "%d", child);    //itoa(child, num, 10);    snum=num;    this->config_map.insert(make_pair(map_name + "_childnum", snum));//add child total number    sprintf(num, "%d", *depth);    //itoa(*depth, num, 10);    snum = num;    this->config_map.insert(make_pair(map_name + "_level", snum));//增加节点的级数    this->regex_mapname.insert(make_pair((*depth), map_name));    //deeper the depth of this config    if (this->depth < (*depth))    {        this->depth = (*depth);    }    return 0;}//分析节点的额外节点//mapname 节点名称int SpiderCore::analytics_map_extra(string mapname){    int i=1;    char num[12]={0};    string extra_mapname = mapname + "_extra_";    //依次检查额外节点是否存在,如果不存在则退出,此循环结束,函数退出    while(true)    {        sprintf(num, "%d", i);        extra_mapname.append(num);        //检查额外节点是否存在        map<string, string>::iterator it=this->config_map.find(extra_mapname);        if (it==this->config_map.end())        {            break;//end extra map        }        //检查额外节点的子表达数目节点是否存在        map<string, string>::iterator it_subnum=this->config_map.find(extra_mapname + "_subnum");        if (it_subnum==this->config_map.end())        {            string logstr="In function analytics_map_extra:can not find map node " + it_subnum->first + " in config file(spider.ini)!";            cout << logstr << endl;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -