⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gopherenum-depth.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
    char *gopher_port = NULL;    int y;    if (url_in_db(up->url)) {	/* Have we been here? */	Debug(43, 1, ("Already Visited URL: %s\n", up->url));	return (NULL);    }    if ((y = filter_selection(up))) {	/* Match the URL based on REs */	Debug(43, 1, ("Removing Candidate: [%s] %s\n",		Filter_Type_Name[y], up->url));	return (NULL);    }    if (!visit_server(up)) {	/* Can we visit this server? */	Debug(43, 1, ("Disallowed to Visit Server: %s\n", up->url));	return (NULL);    }    if (!RobotsTxtCheck(up)) {        Debug(43, 1, ("Disallowed by robots.txt: %s\n", up->url));	return (NULL);    }    if (url_retrieve(up)) {	/* Grab the URL; success? */	Debug(43, 1, ("Cannot Retrieve URL: %s\n", up->url));#ifdef DONT_RETRY_FAILS	mark_failed(up);#endif	return (NULL);    }    if (up->md5 && md5_in_db(up->md5)) {	/* Have we been here? */	Debug(43, 1, ("Already Visited MD5: %s\n", up->url));	return (NULL);    }    /* Remember that we've been here before */    if (up->md5 != NULL)	mark_retrieved(up);    if (up->gophertype == 0)	return (NULL);    /*     *  For each pointer, convert it to a URL, and add it to     *  the list of URLs to return.     */    if ((fp = fopen(up->filename, "r")) == NULL) {	log_errno2(__FILE__, __LINE__, up->filename);	return (NULL);    }    Tail = &head;    while (fgets(buf, BUFSIZ, fp)) {	if (buf[0] == '.' 
|| buf[0] == '\n')	    break;	urlbuf = xstrdup(buf);	if ((q = strrchr(buf, '\n')))	    *q = (char) '\0';	p = urlbuf;	if ((q = strchr(p, '\t')) == NULL) {	    errorlog("Illegal Gopher format: No Name: %s\n", buf);	    goto gopher_enum_cont;	}	*q = (char) '\0';	gopher_name = xstrdup(p);	p = q + 1;	if ((q = strchr(p, '\t')) == NULL) {	    errorlog("Illegal Gopher format: No Path: %s\n", buf);	    goto gopher_enum_cont;	}	*q = (char) '\0';	gopher_path = xstrdup(rfc1738_escape(p));	p = q + 1;	if ((q = strchr(p, '\t')) == NULL) {	    errorlog("Illegal Gopher format: No Host: %s\n", buf);	    goto gopher_enum_cont;	}	*q = (char) '\0';	gopher_host = xstrdup(p);	p = q + 1;	if ((q = strchr(p, '\n')) == NULL) {	    errorlog("Illegal Gopher format: No Port: %s\n", buf);	    goto gopher_enum_cont;	}	*q = (char) '\0';	gopher_port = xstrdup(p);	/* Fix for wierd cross-site Gopher links - wessels */	if (!strncasecmp(gopher_path, "ftp%3a", 6))	    goto gopher_enum_cont;	if (!strncasecmp(gopher_path, "ftp:", 4))	    goto gopher_enum_cont;	if (!strncasecmp(gopher_path, "exec%3a", 7))	    goto gopher_enum_cont;	if (!strncasecmp(gopher_path, "exec:", 5))	    goto gopher_enum_cont;	sprintf(newurl, "gopher://%s:%d/%c%s\n", gopher_host,	    atoi(gopher_port), gopher_name[0], gopher_path);	l = (list_t *) xmalloc(sizeof(list_t));	l->ptr = (void *) xstrdup(newurl);	l->next = (list_t *) NULL;	*Tail = l;	Tail = &(l->next);      gopher_enum_cont:	if (gopher_name)	    xfree(gopher_name);	gopher_name = NULL;	if (gopher_path)	    xfree(gopher_path);	gopher_path = NULL;	if (gopher_host)	    xfree(gopher_host);	gopher_host = NULL;	if (gopher_port)	    xfree(gopher_port);	gopher_port = NULL;	if (urlbuf)	    xfree(urlbuf);	gopher_name = NULL;    }    fclose(fp);    return (head);}/* *  process_url() - Retrieves the given URL, computes an MD5, *  and extracts the list of menu pointers within the documents. 
*/static void process_url(up, depth)     URL *up;     int depth;{    list_t *head = 0;    list_t *l = 0;    list_t *next_l = 0;    char *url;    URL *tup;    if (max_depth > 0 && depth > max_depth) {	Debug(43, 1, ("Maximum Depth of %d Reached: %s\n",		max_depth, up->url));	url_close(up);	return;    }    Debug(43, 1, ("Processing: [%2d] %s\n", depth, up->url));    if ((head = gopher_enum(up)) == NULL) {	url_close(up);	return;    }    url_close(up);    /*     *  Now, for each URL in the list, call process_url() if     *  the URL is a Gopher url and it is on the same host     */    for (l = head; l; l = next_l) {	next_l = l->next;	url = (char *) l->ptr;	if (url == (char *) NULL)	    goto free_list_entry;	if ((tup = url_open(url)) == NULL)	    goto free_list_entry;	if ((tup->type != URL_GOPHER)) {	    url_close(tup);	    goto free_list_entry;	}	if (tup->gophertype >= 2) {	/* ignore everything 2 or higher */	    url_close(tup);	    goto free_list_entry;	}	process_url(tup, depth + 1);	/* should be a 1 - menu */      free_list_entry:	xfree(l->ptr);	xfree(l);    }}/* ---------------------------------------------------------------------- *//* *  initialize() - Basic init routines */static void initialize(){    char *s;    FILE *logfp = NULL;#ifdef USE_HOST_CACHE    host_cache_init();#endif    max_depth = url_max = host_max = 0;    if ((s = getenv("HARVEST_URL_MAX")) != NULL)	url_max = atoi(s);    if ((s = getenv("HARVEST_HOST_MAX")) != NULL)	host_max = atoi(s);    if ((s = getenv("HARVEST_DEPTH_MAX")) != NULL)	max_depth = atoi(s);    if ((s = getenv("HARVEST_DEPTH_CUR")) != NULL)	start_depth = atoi(s);    Debug(43, 9, ("HARVEST_DEPTH_CUR=%d\n", s ? 
s : "NULL"));    if (url_max < 1)	url_max = 250;		/* hard-coded maximum */    if (host_max < 1)	host_max = 1;		/* hard-coded maximum */    if (max_depth < 1)	max_depth = 0;		/* hard-coded maximum */    host_filterfile = getenv("HARVEST_HOST_FILTER");    url_filterfile = getenv("HARVEST_URL_FILTER");    if (getenv("HARVEST_GATHERER_LOGFILE") != (char *) NULL)	logfp = fopen(getenv("HARVEST_GATHERER_LOGFILE"), "a+");    if (logfp == (FILE *) NULL)	logfp = stderr;    init_log3("gopherenum-depth", logfp, stderr);    init_url();    filter_initialize();    /* Open GDBM databases to keep track of where we've been */    urldb_filename = xstrdup(tempnam(NULL, "Gurl"));    urldbf = gdbm_open(urldb_filename, 0, GDBM_NEWDB, 0644, NULL);    if (urldbf == NULL) {	log_errno(urldb_filename);	fatal("gdbm_open: %s: %s", urldb_filename,	    gdbm_strerror(gdbm_errno));    }    hostdb_filename = xstrdup(tempnam(NULL, "Ghost"));    hostdbf = gdbm_open(hostdb_filename, 0, GDBM_NEWDB, 0644, NULL);    if (hostdbf == NULL) {	log_errno(hostdb_filename);	fatal("gdbm_open: %s: %s", hostdb_filename,	    gdbm_strerror(gdbm_errno));    }    md5db_filename = xstrdup(tempnam(NULL, "Gmd5"));    md5dbf = gdbm_open(md5db_filename, 0, GDBM_NEWDB, 0644, NULL);    if (md5dbf == NULL) {	log_errno(md5db_filename);	fatal("gdbm_open: %s: %s", md5db_filename,	    gdbm_strerror(gdbm_errno));    }}/* Die gracefully */static void sigdie(){    if (urldbf != NULL)	gdbm_close(urldbf);    if (hostdbf != NULL)	gdbm_close(hostdbf);    if (md5dbf != NULL)	gdbm_close(md5dbf);    /*     * (void) unlink(urldb_filename);     * (void) unlink(hostdb_filename);     * (void) unlink(md5db_filename);     */    crremove(urldb_filename);    crremove(hostdb_filename);    crremove(md5db_filename);    exit(0);}/* ---------------------------------------------------------------------- */static void usage(){    fprintf(stderr, "Usage: gopherenum-depth gopher-URL\n");    exit(1);}int main(argc, argv)     int argc;     char **argv;{    URL 
*up;    debug_init();    for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) {	if (strncmp(*argv, "-D", 2) == 0) {	    debug_flag(*argv);	}    }    if (argc != 1)	usage();    signal(SIGTERM, sigdie);	/* Die gracefully */    signal(SIGINT, sigdie);    signal(SIGPIPE, sigdie);	/* Clean up on broken pipe */    initialize();		/* Initialize */    /* Grab the RootNode URL from the command line */    if ((up = url_open(*argv)) == NULL || up->type != URL_GOPHER)	usage();    /* Mark the RootNode */    tree_root = xstrdup(up->url);    printf("%s\n", up->url);	/* Print tree root */    process_url(up, start_depth);	/* Do the Enumeration recursively */    url_close(up);		/* Clean up */    finish_url();    sigdie();    /* NOTREACHED */}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -