⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gopherenum-breadth.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
    int y;    if ((tup = url_open(url)) == NULL)	return 0;;    if (url_in_db(tup->url)) {	/* Have we been here? */	Debug(43, 1, ("Already Visited URL: %s\n", tup->url));	url_close(tup);	return 0;    }    if ((y = filter_selection(tup))) {	Debug(43, 1, ("Removing Candidate: [%s] %s\n",		Filter_Type_Name[y], tup->url));	if (not_visited)	    fprintf(not_visited, "[FILTER] %s\n", tup->url);	url_close(tup);	return 0;    }    if (!visit_server(tup)) {	Debug(43, 1, ("Server count exceeded: %s\n",		tup->url));	if (not_visited)	    fprintf(not_visited, "[SERVER] %s\n", tup->url);	url_close(tup);	return 0;    }    if (!RobotsTxtCheck(tup)) {	Debug(43, 1, ("Disallowed by robots.txt: %s\n", tup->url));	if (not_visited)	    fprintf(not_visited, "[ROBOTS.TXT] %s\n", tup->url);	url_close(tup);	return 0;    }    return 1;}static int gopher_enum(up, depth)     URL *up;     int depth;{    FILE *fp = NULL;    char *s = NULL;    char *p = NULL;    char *q = NULL;    char *gopher_name = NULL;    char *gopher_path = NULL;    char *gopher_host = NULL;    char *gopher_port = NULL;    int nurls = 0;    static char buf[BUFSIZ];    static char urlbuf[BUFSIZ];    static char newurl[BUFSIZ];    if (url_in_db(up->url)) {	/* Have we been here? */	Debug(43, 1, ("Already Visited URL: %s\n", up->url));	return 0;    }    if (url_retrieve(up)) {	/* Grab the URL; success? */	Debug(43, 1, ("Cannot Retrieve URL: %s\n", up->url));#ifdef DONT_RETRY_FAILS        mark_failed(up);#endif	return 0;    }    if (up->md5 && md5_in_db(up->md5)) {	/* Have we been here? */	Debug(43, 1, ("Already Visited MD5: %s\n", up->url));	return 0;    }    /* Remember that we've been here before */    if (up->md5)	mark_retrieved(up);    if (up->gophertype == 0)	return 0;    /*     *  For each meny entry, convert it to a URL, and add it to     *  the global list of URLs to process.     
*/    if ((fp = fopen(up->filename, "r")) == NULL) {	log_errno2(__FILE__, __LINE__, up->filename);	return 0;    }    while (fgets(buf, BUFSIZ, fp)) {	if ((s = strchr(buf, '\r')))	    *s = (char) '\n';	strcpy(urlbuf, buf);	if ((s = strchr(buf, '\n')))	    *s = (char) '\0';	Debug(43, 5, ("Input: %s\n", buf));	if (!strcmp(buf, "."))	    break;	p = urlbuf;	if ((q = strchr(p, '\t')) == NULL) {	    errorlog("Illegal Gopher format: No Name: %s\n", buf);	    goto gopher_enum_cont;	}	*q = (char) '\0';	gopher_name = xstrdup(p);	Debug(43, 5, ("gopher_name = '%s'\n", gopher_name));	p = q + 1;	if ((q = strchr(p, '\t')) == NULL) {	    errorlog("Illegal Gopher format: No Path: %s\n", buf);	    goto gopher_enum_cont;	}	*q = (char) '\0';	gopher_path = xstrdup(rfc1738_escape(p));	Debug(43, 5, ("gopher_path = '%s'\n", gopher_path));	p = q + 1;	if ((q = strchr(p, '\t')) == NULL) {	    errorlog("Illegal Gopher format: No Host: %s\n", buf);	    goto gopher_enum_cont;	}	*q = (char) '\0';	gopher_host = xstrdup(p);	Debug(43, 5, ("gopher_host = '%s'\n", gopher_host));	p = q + 1;	q = strchr(p, '\t');	if (q == NULL)	    q = strchr(p, '\n');	if (q == NULL) {	    errorlog("Illegal Gopher format: No Port: %s\n", buf);	    goto gopher_enum_cont;	}	*q = (char) '\0';	gopher_port = xstrdup(p);	Debug(43, 5, ("gopher_port = '%s'\n", gopher_port));	/* Fix for wierd cross-site Gopher links - wessels */	if (!strncasecmp(gopher_path, "ftp%3a", 6))	    goto gopher_enum_cont;	if (!strncasecmp(gopher_path, "ftp:", 4))	    goto gopher_enum_cont;	if (!strncasecmp(gopher_path, "exec%3a", 7))	    goto gopher_enum_cont;	if (!strncasecmp(gopher_path, "exec:", 5))	    goto gopher_enum_cont;	sprintf(newurl, "gopher://%s:%d/%c%s", gopher_host,	    atoi(gopher_port), gopher_name[0], gopher_path);	if (url_is_allowed(newurl)) {	    add_to_list(newurl, depth);	    nurls++;	}      gopher_enum_cont:	xfree(gopher_name);	gopher_name = NULL;	xfree(gopher_path);	gopher_path = NULL;	xfree(gopher_host);	gopher_host = NULL;	
xfree(gopher_port);	gopher_port = NULL;    }    fclose(fp);    Debug(43, 1, ("Adding %d URLs from %s to workload\n", nurls, up->url));    return 1;}/* ---------------------------------------------------------------------- *//* *  initialize() - Basic init routines */static void initialize(){    char *s = NULL;    extern int liburl_conform_rfc1738;    FILE *logfp = NULL;#ifdef USE_HOST_CACHE    host_cache_init();#endif    cur_depth = max_depth = url_max = host_max = 0;    if ((s = getenv("HARVEST_URL_MAX")) != NULL)	url_max = atoi(s);    if ((s = getenv("HARVEST_HOST_MAX")) != NULL)	host_max = atoi(s);    if ((s = getenv("HARVEST_DEPTH_MAX")) != NULL)	max_depth = atoi(s);    if ((s = getenv("HARVEST_DEPTH_CUR")) != NULL)	cur_depth = atoi(s);    if (url_max < 1)	url_max = 250;		/* hard-coded maximum */    if (host_max < 1)	host_max = 1;		/* hard-coded maximum */    if (max_depth < 1)	max_depth = 0;		/* hard-coded maximum */    host_filterfile = getenv("HARVEST_HOST_FILTER");    url_filterfile = getenv("HARVEST_URL_FILTER");    access_types = getenv("HARVEST_ACCESS_TYPES");    if ((s = getenv("HARVEST_GATHERER_LOGFILE")) != (char *) NULL)	logfp = fopen(s, "a+");    if (logfp == (FILE *) NULL)	logfp = stderr;    init_log3("gopherenum-breadth", logfp, stderr);    init_url();    liburl_conform_rfc1738 = 1;    filter_initialize();    Debug(43, 5, ("access_mask: %#02X\n", access_mask));    /* Open GDBM databases to keep track of where we've been */    urldb_filename = xstrdup(tempnam(NULL, "Gurl"));    urldbf = gdbm_open(urldb_filename, 0, GDBM_NEWDB, 0644, NULL);    if (urldbf == NULL) {	log_errno(urldb_filename);	fatal("gdbm_open: %s: %s", urldb_filename,	    gdbm_strerror(gdbm_errno));    }    hostdb_filename = xstrdup(tempnam(NULL, "Ghost"));    hostdbf = gdbm_open(hostdb_filename, 0, GDBM_NEWDB, 0644, NULL);    if (hostdbf == NULL) {	log_errno(hostdb_filename);	fatal("gdbm_open: %s: %s", hostdb_filename,	    gdbm_strerror(gdbm_errno));    }    md5db_filename = 
xstrdup(tempnam(NULL, "Gmd5"));    md5dbf = gdbm_open(md5db_filename, 0, GDBM_NEWDB, 0644, NULL);    if (md5dbf == NULL) {	log_errno(md5db_filename);	fatal("gdbm_open: %s: %s", md5db_filename,	    gdbm_strerror(gdbm_errno));    }    /* open not-visited file */    if ((s = getenv("HARVEST_NOT_VISITED_LOG")) != NULL)	not_visited = fopen(s, "a+");    if (not_visited)	setbuf(not_visited, NULL);}/* Die gracefully */static void sigdie(x)     int x;{    int i;#ifdef USE_HOST_CACHE    dump_host_cache(43, 9);#endif    if (urldbf != NULL)	gdbm_close(urldbf);    if (hostdbf != NULL)	gdbm_close(hostdbf);    if (md5dbf != NULL)	gdbm_close(md5dbf);    if (not_visited)	fclose(not_visited);    /* (void) unlink(urldb_filename); */    crremove(urldb_filename);    xfree(urldb_filename);    /* (void) unlink(hostdb_filename); */    crremove(hostdb_filename);    xfree(hostdb_filename);    /* (void) unlink(md5db_filename); */    crremove(md5db_filename);    xfree(md5db_filename);    for (i = 0; i < 100; i++) {	if (i > max_depth && depth_hist[i] == 0)	    break;	Log("Found %8d objects at depth %d\n", depth_hist[i], i);    }    Debug(43, 1, ("gopherenum-breadth: exiting (signal %d)\n", x));    exit(0);}/* ---------------------------------------------------------------------- */static void usage(){    fprintf(stderr, "Usage: gopherenum-breadth gopher-URL\n");    exit(1);}int main(argc, argv)     int argc;     char **argv;{    URL *up = NULL;    list_t *l = NULL;    char *url = NULL;    int depth = 0;    debug_init();		/* from $HARVEST_DEBUG */    for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) {	if (!strncmp(*argv, "-D", 2)) {	    debug_flag(*argv);	}    }    if (argc != 1)	usage();    for (depth = 0; depth < 100; depth++)	depth_hist[depth] = 0;    signal(SIGTERM, sigdie);	/* Die gracefully */    signal(SIGINT, sigdie);    signal(SIGPIPE, sigdie);	/* Quickly clean up on broken pipe */    initialize();		/* Initialize */    Debug(43, 1, ("gopherenum-breadth: Starting...\n"));  
  /* Grab the RootNode URL from the command line */    if ((up = url_open(*argv)) == NULL || up->type != URL_GOPHER) {	usage();    }    /* Mark the RootNode */    tree_root = xstrdup(up->url);    Tail = &head;    /*     * helpdesk@ecs.soton.ac.uk -- Gatherer visits too many hosts     * 6/3/96. Make sure the first URL we start with is added     *         to the list of servers visited.     */    (void)visit_server(up);    printf("%s\n", up->url);	/* Print tree root */    add_to_list(up->url, cur_depth);	/* start at depth = 0 */    url_close(up);    for (l = head; l; l = free_from_list(l)) {	url = (char *) l->ptr;	depth = l->depth;	if (depth < 100)	    depth_hist[depth]++;	if (max_depth > 0 && depth > max_depth) {	    if (not_visited)		fprintf(not_visited, "[DEPTH] %s\n", url);	    Debug(43, 1, ("Maximum Depth of %d Reached: %s\n",		    max_depth, url));	    continue;	}	Debug(43, 1, ("Processing: [%2d] %s\n", depth, url));	if ((up = url_open(url)) == NULL)	    continue;	/* search for more links from this one */	gopher_enum(up, depth + 1);	url_close(up);    }    finish_url();    sigdie(0);    /* NOTREACHED */}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -