
📄 httpenum-depth.c

📁 harvest is a robot that downloads HTML web pages
💻 C
📖 Page 1 of 2
/*
 *  httpenum-depth.c (excerpt) -- this listing begins inside http_enum(),
 *  which retrieves a URL and returns the list of URLs it references.
 */
#ifdef DONT_RETRY_FAILS
		mark_failed (up);
#endif
		return (NULL);
	}
	if (up->md5 && md5_in_db (up->md5)) {	/* Have we been here? */
		Debug (42, 1, ("Already Visited MD5: %s\n", up->url));
		return (NULL);
	}
	/* Remember that we've been here before */
	if (up->md5)
		mark_retrieved (up);
	/* Are we dealing with HTML or SOIF?  If not, we can't get href links */
	if (up->http_mime_hdr == NULL) {
		fprintf (stdout, "%s\t%s\n", up->url, up->md5);
		fflush (stdout);
		return (NULL);
	}
	/* If we were given an HTML document, then use HTMLurls to summarise it */
	if (strstr (up->http_mime_hdr, "text/html") != NULL) {
		/* Extract the HREFs */
		if (pipe (pipefds) < 0) {
			log_errno ("pipe");
			return (NULL);
		}
		if ((pid = fork ()) < 0) {
			log_errno ("fork");
			return (NULL);
		}
		if (pid == 0) {	/* child: HTMLurls */
			char *argv[64];
#if 0 /* kjl/7mar2002 */
			char *enum_url = NULL;
			enum_url = (char *) xmalloc (strlen (up->url) + 20);
			sprintf (enum_url, "ENUMERATOR_URL=%s", up->url);
			putenv (enum_url);
#endif
			close (pipefds[0]);	/* child won't read from pipe */
			dup2 (pipefds[1], 1);	/* stdout -> write:pipe */
			close (pipefds[1]);	/* close pipe, it's now stdout */
			argv[0] = "HTMLurls";
			argv[1] = "--base-url";
			argv[2] = up->url;
			argv[3] = up->filename;
			argv[4] = NULL;
			execvp (argv[0], argv);
			sprintf (buf, "execvp: %s", argv[0]);
			log_errno (buf);
			_exit (1);
		}
		close (pipefds[1]);	/* parent won't write */
		if ((fp = fdopen (pipefds[0], "r")) == NULL) {
			log_errno ("fdopen");
			return (NULL);
		}
		/*
		 *  For each HREF pointer, convert it to a URL, and add it to
		 *  the list of URLs to return.
		 */
		Tail = &head;
		while (fgets (buf, BUFSIZ, fp) != NULL) {
			if ((s = strrchr (buf, '\n')) != NULL)
				*s = '\0';	/* strip newline */
			Debug (42, 1, ("httpenum-depth: Input: %s\n", buf));
			/* Make a link in the list for this URL */
			l = (list_t *) xmalloc (sizeof (list_t));
			l->ptr = (void *) xstrdup (buf);
			l->next = (list_t *) NULL;
			*Tail = l;
			Tail = &(l->next);
			nurls++;
		}
		fclose (fp);
		close (pipefds[0]);
		/*
		 *  Wait for HTMLurls to finish and decide whether to index
		 *  the page according to its exit code.
		 */
		if ((err = waitpid (pid, &status, 0)) != pid) {
			Debug (42, 1,
			       ("WARNING: waiting for child %d got %d...\n",
				pid, err));
		}
		/* If we had an exit code of 0 then index it, otherwise don't */
		if (WEXITSTATUS (status) == 0) {
			fprintf (stdout, "%s\t%s\n", up->url, up->md5);
			fflush (stdout);
		}
	} else if (strstr (up->http_mime_hdr, "text/x-soif") != NULL) {
		/* If it's a SOIF document then get it from the database */
		char *refs = urldb_getrefs (up->url);
		char *ptr;
		char *abs_ref;

		Log ("%s\tNot Modified\n", up->url);
		/* Flush the URL */
		fprintf (stdout, "%s\t%s\n", up->url, up->md5);
		fflush (stdout);
		Tail = &head;
		for (ptr = strtok (refs, "\n"); ptr; ptr = strtok (NULL, "\n")) {
			abs_ref = url_parse_relative (ptr, up->url);
			if (url_is_allowed (abs_ref)) {
				/* Make an absolute link in the list for this URL */
				l = (list_t *) xmalloc (sizeof (list_t));
				l->ptr = (void *) abs_ref;
				l->next = (list_t *) NULL;
				*Tail = l;
				Tail = &(l->next);
				nurls++;
			} else {
				xfree (abs_ref);
			}
		}
		xfree (refs);
	} else {
		fprintf (stdout, "%s\t%s\n", up->url, up->md5);
		fflush (stdout);
		return (NULL);
	}
	Debug (42, 1, ("Adding %d URLs from %s to workload\n", nurls, up->url));
	return (head);
}

/*
 *  process_url() - Retrieves the given URL (HTTP only), computes an MD5,
 *  and extracts the list of HREF pointers within the HTML document.
 *  If any of the HREF pointers are HTTP and on the same host as the given
 *  URL up, then it calls process_url() recursively.
 *  up is closed on exit.
 */
static void
process_url (up, depth)
URL *up;
int depth;
{
	list_t *head = NULL;
	list_t *l = NULL;
	list_t *next_l = NULL;
	char *url = NULL;
	URL *tup = NULL;

	if (max_depth > 0 && depth > (max_depth + 1)) {
		if (not_visited)
			fprintf (not_visited, "[DEPTH] %s\n", up->url);
		Debug (42, 1, ("Maximum Depth of %d Reached: %s\n",
			       max_depth, up->url));
		url_close (up);
		return;
	}
	Debug (42, 1, ("Processing: [%2d] %s\n", depth - 1, up->url));
	if (depth - 1 < 100)
		depth_hist[depth - 1]++;
	if ((head = http_enum (up)) == NULL) {
		url_close (up);
		return;
	}
	url_close (up);
	/*
	 *  Now, for each URL in the list, call process_url() if
	 *  the URL is an HTTP url and it is on the same host.
	 */
	for (l = head; l; l = next_l) {
		next_l = l->next;
		url = (char *) l->ptr;
		if ((url != NULL) && ((tup = url_open (url)) != NULL)) {
			if (tup->type != URL_HTTP) {
				Debug (42, 1, ("Examining: %s\n", tup->url));
				/* filter_selection() checks access_mask */
				if (!filter_selection (tup)) {
					int re_depth;

					/*
					 *  Print "URL <tab> Depth=n" (a bogus
					 *  MD5 field) to hand the URL off for
					 *  re-enumeration; flush to keep the
					 *  pipe moving.
					 */
					re_depth = max_depth - depth + 2;
					if (re_depth < 1)
						re_depth = 1;
					fprintf (stdout, "%s\tDepth=%d\n",
						 tup->url, re_depth);
					fflush (stdout);
					Debug (42, 1,
					       ("HTTPENUM Re-enumeration: %s\tDepth=%d\n",
						tup->url, re_depth));
				}
				url_close (tup);
			} else {
				process_url (tup, depth + 1);
			}
		}
		/* Free List Entry */
		xfree (l->ptr);
		l->ptr = (void *) NULL;
		l->next = (list_t *) NULL;
		xfree (l);
		l = (list_t *) NULL;
	}
}

/* ---------------------------------------------------------------------- */

/*
 *  initialize() - Basic init routines
 */
static void
initialize ()
{
	char *s;
	extern int liburl_conform_rfc1738;
	FILE *logfp = NULL;

#ifdef USE_HOST_CACHE
	host_cache_init ();
#endif
	cur_depth = max_depth = url_max = host_max = 0;
	if ((s = getenv ("HARVEST_URL_MAX")) != NULL)
		url_max = atoi (s);
	if ((s = getenv ("HARVEST_HOST_MAX")) != NULL)
		host_max = atoi (s);
	if ((s = getenv ("HARVEST_DEPTH_MAX")) != NULL)
		max_depth = atoi (s);
	if ((s = getenv ("HARVEST_DEPTH_CUR")) != NULL)
		cur_depth = atoi (s);
	if (url_max < 1)
		url_max = 250;	/* hard-coded default */
	if (host_max < 1)
		host_max = 1;	/* hard-coded default */
	if (max_depth < 1)
		max_depth = 0;	/* 0 = no depth limit */
	host_filterfile = getenv ("HARVEST_HOST_FILTER");
	url_filterfile = getenv ("HARVEST_URL_FILTER");
	access_types = getenv ("HARVEST_ACCESS_TYPES");
	if ((s = getenv ("HARVEST_GATHERER_LOGFILE")) != (char *) NULL)
		logfp = fopen (s, "a+");
	if (logfp == (FILE *) NULL)
		logfp = stderr;
	init_log3 ("httpenum-depth", logfp, stderr);
	init_url ();
	liburl_conform_rfc1738 = 1;
	filter_initialize ();
	Debug (42, 5, ("access_mask: %#02X\n", access_mask));
	/* Open GDBM databases to keep track of where we've been */
	urldb_filename = xstrdup (tempnam (NULL, "Hurl"));
	urldbf = gdbm_open (urldb_filename, 0, GDBM_NEWDB, 0644, NULL);
	if (urldbf == NULL) {
		log_errno (urldb_filename);
		fatal ("gdbm_open: %s: %s", urldb_filename,
		       gdbm_strerror (gdbm_errno));
	}
	hostdb_filename = xstrdup (tempnam (NULL, "Hhost"));
	hostdbf = gdbm_open (hostdb_filename, 0, GDBM_NEWDB, 0644, NULL);
	if (hostdbf == NULL) {
		log_errno (hostdb_filename);
		fatal ("gdbm_open: %s: %s", hostdb_filename,
		       gdbm_strerror (gdbm_errno));
	}
	md5db_filename = xstrdup (tempnam (NULL, "Hmd5"));
	md5dbf = gdbm_open (md5db_filename, 0, GDBM_NEWDB, 0644, NULL);
	if (md5dbf == NULL) {
		log_errno (md5db_filename);
		fatal ("gdbm_open: %s: %s", md5db_filename,
		       gdbm_strerror (gdbm_errno));
	}
	/* Open the not-visited log file */
	if ((s = getenv ("HARVEST_NOT_VISITED_LOG")) != NULL)
		not_visited = fopen (s, "a+");
	if (not_visited)
		setbuf (not_visited, NULL);
}

/* Die gracefully */
static void
sigdie (x)
int x;
{
	int i;

#ifdef USE_HOST_CACHE
	dump_host_cache (42, 9);
#endif
	if (urldbf != NULL)
		gdbm_close (urldbf);
	if (hostdbf != NULL)
		gdbm_close (hostdbf);
	if (md5dbf != NULL)
		gdbm_close (md5dbf);
	if (not_visited)
		fclose (not_visited);
	/* (void) unlink (urldb_filename); */
	crremove (urldb_filename);
	xfree (urldb_filename);
	/* (void) unlink (hostdb_filename); */
	crremove (hostdb_filename);
	xfree (hostdb_filename);
	/* (void) unlink (md5db_filename); */
	crremove (md5db_filename);
	xfree (md5db_filename);
	for (i = 0; i < 100; i++) {
		if (i > max_depth && depth_hist[i] == 0)
			break;
		Log ("Found %8d objects at depth %d\n", depth_hist[i], i);
	}
	Debug (42, 1, ("httpenum-depth: exiting (signal %d)\n", x));
	exit (0);
}

/* ---------------------------------------------------------------------- */

static void
usage ()
{
	fprintf (stderr, "Usage: httpenum-depth http-URL\n");
	exit (1);
}

int
main (argc, argv)
int argc;
char **argv;
{
	URL *up = NULL;
	int depth = 0;

	debug_init ();		/* from $HARVEST_DEBUG */
	for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) {
		if (!strncmp (*argv, "-D", 2)) {
			debug_flag (*argv);
		}
	}
	if (argc != 1)
		usage ();
	for (depth = 0; depth < 100; depth++)
		depth_hist[depth] = 0;
	signal (SIGTERM, sigdie);	/* Die gracefully */
	signal (SIGINT, sigdie);
	signal (SIGPIPE, sigdie);	/* Quickly clean up on broken pipe */
	initialize ();		/* Initialize */
	Debug (42, 1, ("httpenum-depth: Starting...\n"));
	/* Grab the RootNode URL from the command line */
	if ((up = url_open (*argv)) == NULL || up->type != URL_HTTP) {
		usage ();
	}
	/* Mark the RootNode */
	tree_root = xstrdup (up->url);
	printf ("%s\n", up->url);	/* Print tree root */
	process_url (up, cur_depth + 1);	/* Do the Enumeration recursively */
	finish_url ();
	sigdie (0);		/* never returns: sigdie() calls exit() */
}
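A few implementation notes on the listing, with small standalone sketches. First, the text/html branch of http_enum() uses the classic pipe/fork/exec pattern: the parent opens a pipe, the child points its stdout at the write end with dup2() and execs HTMLurls, and the parent reads the extracted HREFs line by line via fdopen() before reaping the child with waitpid(). Here is a minimal sketch of the same pattern; it runs ls instead of HTMLurls (which ships with Harvest) so it is runnable anywhere.

#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int
main (void)
{
	int pipefds[2];
	pid_t pid;
	FILE *fp;
	char buf[BUFSIZ];
	int status;
	char *argv[] = { "ls", "-1", NULL };	/* stands in for HTMLurls */

	if (pipe (pipefds) < 0) {
		perror ("pipe");
		return 1;
	}
	if ((pid = fork ()) < 0) {
		perror ("fork");
		return 1;
	}
	if (pid == 0) {	/* child: redirect stdout into the pipe, then exec */
		close (pipefds[0]);	/* child won't read from the pipe */
		dup2 (pipefds[1], 1);	/* stdout -> write end */
		close (pipefds[1]);	/* the pipe is now stdout */
		execvp (argv[0], argv);
		perror ("execvp");	/* only reached if exec fails */
		_exit (1);
	}
	close (pipefds[1]);	/* parent won't write */
	if ((fp = fdopen (pipefds[0], "r")) == NULL) {
		perror ("fdopen");
		return 1;
	}
	while (fgets (buf, BUFSIZ, fp) != NULL)	/* one HREF per line in Harvest */
		fputs (buf, stdout);
	fclose (fp);	/* also closes the read end */
	if (waitpid (pid, &status, 0) == pid && WEXITSTATUS (status) == 0)
		fputs ("child exited cleanly; Harvest would index the page\n", stdout);
	return 0;
}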
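Second, both the HTML and SOIF branches append URLs through a pointer-to-pointer tail (Tail = &head; *Tail = l; Tail = &(l->next);), which gives O(1) appends with no special case for an empty list. A stripped-down sketch of the idiom, using plain malloc/strdup and a hypothetical node_t in place of Harvest's xmalloc/xstrdup and list_t:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-in for Harvest's list_t. */
typedef struct node {
	char *ptr;
	struct node *next;
} node_t;

int
main (void)
{
	const char *urls[] = { "http://a/", "http://b/", "http://c/" };
	node_t *head = NULL;
	node_t **Tail = &head;	/* always points at the next empty slot */
	node_t *l, *next_l;
	size_t i;

	for (i = 0; i < 3; i++) {
		l = malloc (sizeof (node_t));
		l->ptr = strdup (urls[i]);
		l->next = NULL;
		*Tail = l;	/* same code for empty and non-empty lists */
		Tail = &(l->next);
	}
	for (l = head; l; l = next_l) {	/* walk and free, as process_url() does */
		next_l = l->next;
		puts (l->ptr);
		free (l->ptr);
		free (l);
	}
	return 0;
}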
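Third, the duplicate detection in http_enum() rests on md5_in_db() and mark_retrieved(), whose bodies are not on this page; given the gdbm_open() calls in initialize(), they are presumably thin wrappers over GDBM key lookups. A plausible sketch under that assumption (the helper bodies here are guesses, and the real mark_retrieved() takes the whole URL record rather than a bare MD5 string). Build with cc -lgdbm.

#include <gdbm.h>
#include <stdio.h>
#include <string.h>

static GDBM_FILE md5dbf;

/* Guessed body: membership test on the MD5 key. */
static int
md5_in_db (const char *md5)
{
	datum key;

	key.dptr = (char *) md5;
	key.dsize = strlen (md5);
	return gdbm_exists (md5dbf, key);
}

/* Guessed body: presence of the key is the flag; the value is irrelevant. */
static void
mark_retrieved (const char *md5)
{
	datum key, val;

	key.dptr = (char *) md5;
	key.dsize = strlen (md5);
	val.dptr = "1";
	val.dsize = 1;
	gdbm_store (md5dbf, key, val, GDBM_REPLACE);
}

int
main (void)
{
	md5dbf = gdbm_open ("Hmd5.db", 0, GDBM_NEWDB, 0644, NULL);
	if (md5dbf == NULL) {
		fprintf (stderr, "gdbm_open: %s\n", gdbm_strerror (gdbm_errno));
		return 1;
	}
	mark_retrieved ("d41d8cd98f00b204e9800998ecf8427e");
	printf ("seen: %d\n", md5_in_db ("d41d8cd98f00b204e9800998ecf8427e"));
	gdbm_close (md5dbf);
	return 0;
}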
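Finally, sigdie() doubles as signal handler and normal exit path: main() installs it for SIGTERM, SIGINT, and SIGPIPE, and it closes the GDBM databases, removes the scratch files, and prints the depth histogram before exiting. A compact sketch of that shutdown pattern; "scratch.db" is a made-up stand-in for the temp databases, and unlink() stands in for Harvest's crremove().

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

static const char *scratch = "scratch.db";

static void
sigdie (int x)
{
	unlink (scratch);	/* remove scratch state on the way out */
	fprintf (stderr, "exiting (signal %d)\n", x);
	exit (0);
}

int
main (void)
{
	FILE *fp = fopen (scratch, "w");	/* create some scratch state */

	if (fp)
		fclose (fp);
	signal (SIGTERM, sigdie);
	signal (SIGINT, sigdie);
	signal (SIGPIPE, sigdie);
	pause ();	/* wait for a signal to arrive */
	return 0;
}

Strictly speaking, stdio and exit() are not async-signal-safe inside a handler; the original pre-POSIX code does the same, and a stricter version would only set a flag in the handler and do the cleanup in main().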
