
📄 httpenum-breadth.c

📁 harvest is a robot that downloads HTML web pages
💻 C
📖 Page 1 of 2
    int y;

    if ((tup = url_open (url)) == NULL)
        return 0;
    if (url_in_db (tup->url)) {         /* Have we been here? */
        Debug (42, 1, ("Already Visited URL: %s\n", tup->url));
        url_close (tup);
        return 0;
    }
    if ((y = filter_selection (tup))) {
        Debug (42, 1, ("Removing Candidate: [%s] %s\n",
               Filter_Type_Name[y], tup->url));
        if (not_visited)
            fprintf (not_visited, "[FILTER] %s\n", tup->url);
        url_close (tup);
        return 0;
    }
    if (!visit_server (tup)) {
        Debug (42, 1, ("Server count exceeded: %s\n", tup->url));
        if (not_visited)
            fprintf (not_visited, "[SERVER] %s\n", tup->url);
        url_close (tup);
        return 0;
    }
    if (!RobotsTxtCheck (tup)) {
        Debug (42, 1, ("Disallowed by robots.txt: %s\n", tup->url));
        if (not_visited)
            fprintf (not_visited, "[ROBOTS.TXT] %s\n", tup->url);
        url_close (tup);
        return 0;
    }
    url_close (tup);
    return 1;
}

/*
 *  http_enum() - Builds a linked list of all the URLs in this object,
 *  returns -1 on error.  Checks for "text/html" and "text/x-soif" in MIME
 *  headers and then runs "HTMLurls" on the file or urldb_getrefs() on the URL.
 */
static int
http_enum (up, depth)
URL *up;
int depth;
{
    FILE *fp = NULL;
#if 0 /* kjl/7mar2002 */
    char *enum_url = NULL;
#endif
    char *s = NULL;
    char *t0 = NULL;
    char *t1 = NULL;
    char *t2 = NULL;
    int err;
    int nurls = 0;
    int pid;
    int pipefds[2];
    int status;
    int count = 0;
    char *argv[5];
    static char buf[BUFSIZ];

    if (url_in_db (up->url)) {          /* Have we been here? */
        Debug (42, 1, ("Already Visited URL: %s\n", up->url));
        return 0;
    }
    /*
     *  Ack.  Check for symbolic link loops in server generated HTML listings.
     *  Do this by comparing the last two pathname components.  If they are
     *  the same then guess it's a loop.
     */
    s = xstrdup (up->pathname);
    t0 = t1 = t2 = NULL;
    for (t0 = strtok (s, "/"); t0; t0 = strtok (NULL, "/")) {
        t2 = t1;
        t1 = t0;
    }
    if (t1 != NULL && t2 != NULL) {
        if (strcmp (t1, t2) == 0) {
            Debug (42, 1, ("Possible symlink loop: %s\n", up->url));
            xfree (s);
            s = NULL;
            return 0;
        }
    }
    xfree (s);
    s = NULL;
    while (((status = url_retrieve (up)) == -1)
           && count < HTTP_MAX_REDIRECTS) {
        count++;
        if (!url_is_allowed (up->url))
            return 0;
    }
    if (status) {                       /* Grab the URL; success? */
        Debug (42, 1, ("Cannot Retrieve URL: %s\n", up->url));
#ifdef DONT_RETRY_FAILS
        mark_failed (up);
#endif
        return 0;
    }
    if (up->md5 && md5_in_db (up->md5)) {   /* Have we been here? */
        Debug (42, 1, ("Already Visited MD5: %s\n", up->url));
        return 0;
    }
    /* Remember that we've been here before */
    if (up->md5)
        mark_retrieved (up);
    argv[0] = argv[1] = argv[2] = argv[3] = argv[4] = NULL;
    /* Are we dealing with HTML or SOIF?  If not, we can't get href links */
    if (up->http_mime_hdr != NULL) {
        if (strstr (up->http_mime_hdr, "text/html") != NULL) {
            argv[0] = "HTMLurls";
            argv[1] = "--base-url";
            argv[2] = up->url;
            argv[3] = up->filename;
        } else if (strstr (up->http_mime_hdr, "text/x-soif") != NULL) {
            char *refs;
            char *ptr;
            char *abs_ref;

            Log ("%s\tNot Modified\n", up->url);
            refs = urldb_getrefs (up->url);
            if (!refs)
                return -1;
            for (ptr = strtok (refs, "\n"); ptr; ptr = strtok (NULL, "\n")) {
                abs_ref = url_parse_relative (ptr, up->url);
                if (url_is_allowed (abs_ref)) {
                    add_to_list (abs_ref, depth);
                }
                xfree (abs_ref);
            }
            xfree (refs);
            return 1;
        } else {
            fprintf (stdout, "%s\t%s\n", up->url, up->md5);  /* URL <tab> MD5 */
            fflush (stdout);
            return -1;
        }
    }
    /* Extract the HREFs */
    if (pipe (pipefds) < 0) {
        log_errno ("pipe");
        return -1;
    }
    if ((pid = fork ()) < 0) {
        log_errno ("fork");
        close (pipefds[0]);
        close (pipefds[1]);
        return -1;
    }
    if (pid == 0) {                     /* child: HTMLurls */
#if 0 /* kjl/7mar2002 */
        enum_url = (char *) xmalloc (strlen (up->url) + 20);
        sprintf (enum_url, "ENUMERATOR_URL=%s", up->url);
        putenv (enum_url);
#endif
        close (pipefds[0]);             /* child won't read from pipe */
        dup2 (pipefds[1], 1);           /* stdout -> write:pipe */
        close (pipefds[1]);             /* close pipe, it's now stdout */
        execvp (argv[0], argv);
        sprintf (buf, "execvp: %s", argv[0]);
        log_errno (buf);
        _exit (1);
    }
    close (pipefds[1]);                 /* parent won't write */
    if ((fp = fdopen (pipefds[0], "r")) == NULL) {
        log_errno ("fdopen");
        return -1;
    }
    /*
     *  For each HREF pointer, convert it to a URL, and add it to
     *  the global list of URLs to process.
     */
    while (fgets (buf, BUFSIZ, fp) != NULL) {
        if ((s = strrchr (buf, '\n')) != NULL)
            *s = '\0';                  /* strip newline */
        Debug (42, 1, ("Input: %s\n", buf));
        if (url_is_allowed (buf)) {
            add_to_list (buf, depth);
            nurls++;
        }
    }
    fclose (fp);
    close (pipefds[0]);
    /*
     *  Wait for HTMLurls to finish and decide whether to index the page
     *  according to its exit code
     */
    if ((err = waitpid (pid, &status, 0)) != pid) {
        Debug (42, 1, ("WARNING: waiting for child %d got %d...\n",
               pid, err));
    }
    /* If we had an exit code of 0 then index it, otherwise don't */
    if (WEXITSTATUS (status) == 0) {
        fprintf (stdout, "%s\t%s\n", up->url, up->md5);
        fflush (stdout);
    }
    Debug (42, 1, ("Adding %d URLs from %s to workload\n", nurls, up->url));
    return 1;
}

/* ---------------------------------------------------------------------- */

/*
 *  initialize() - Basic init routines
 */
static void
initialize ()
{
    char *s = NULL;
    extern int liburl_conform_rfc1738;
    FILE *logfp = NULL;

#ifdef USE_HOST_CACHE
    host_cache_init ();
#endif
    cur_depth = max_depth = url_max = host_max = 0;
    if ((s = getenv ("HARVEST_URL_MAX")) != NULL)
        url_max = atoi (s);
    if ((s = getenv ("HARVEST_HOST_MAX")) != NULL)
        host_max = atoi (s);
    if ((s = getenv ("HARVEST_DEPTH_MAX")) != NULL)
        max_depth = atoi (s);
    if ((s = getenv ("HARVEST_DEPTH_CUR")) != NULL)
        cur_depth = atoi (s);
    if (url_max < 1)
        url_max = 250;                  /* hard-coded default */
    if (host_max < 1)
        host_max = 1;                   /* hard-coded default */
    if (max_depth < 1)
        max_depth = 0;                  /* hard-coded default: unlimited */
    host_filterfile = getenv ("HARVEST_HOST_FILTER");
    url_filterfile = getenv ("HARVEST_URL_FILTER");
    access_types = getenv ("HARVEST_ACCESS_TYPES");
    filename_candidates = getenv ("HARVEST_CANDIDATES");
    if ((s = getenv ("HARVEST_GATHERER_LOGFILE")) != (char *) NULL)
        logfp = fopen (s, "a+");
    if (logfp == (FILE *) NULL)
        logfp = stderr;
    init_log3 ("httpenum-breadth", logfp, stderr);
    init_url ();
    liburl_conform_rfc1738 = 1;
    filter_initialize ();
    Debug (42, 5, ("access_mask: %#02X\n", access_mask));
    /* Open GDBM databases to keep track of where we've been */
    urldb_filename = xstrdup (tempnam (NULL, "Hurl"));
    urldbf = gdbm_open (urldb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (urldbf == NULL) {
        log_errno (urldb_filename);
        fatal ("gdbm_open: %s: %s", urldb_filename,
               gdbm_strerror (gdbm_errno));
    }
    hostdb_filename = xstrdup (tempnam (NULL, "Hhost"));
    hostdbf = gdbm_open (hostdb_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (hostdbf == NULL) {
        log_errno (hostdb_filename);
        fatal ("gdbm_open: %s: %s", hostdb_filename,
               gdbm_strerror (gdbm_errno));
    }
    md5db_filename = xstrdup (tempnam (NULL, "Hmd5"));
    md5dbf = gdbm_open (md5db_filename, 0, GDBM_NEWDB, 0644, NULL);
    if (md5dbf == NULL) {
        log_errno (md5db_filename);
        fatal ("gdbm_open: %s: %s", md5db_filename,
               gdbm_strerror (gdbm_errno));
    }
    /* open not-visited file */
    if ((s = getenv ("HARVEST_NOT_VISITED_LOG")) != NULL)
        not_visited = fopen (s, "a+");
    if (not_visited)
        setbuf (not_visited, NULL);
}

/* Die gracefully */
static void
sigdie (x)
int x;
{
    int i, sum;

#ifdef USE_HOST_CACHE
    dump_host_cache (42, 9);
#endif
    if (urldbf != NULL)
        gdbm_close (urldbf);
    if (hostdbf != NULL)
        gdbm_close (hostdbf);
    if (md5dbf != NULL)
        gdbm_close (md5dbf);
    if (not_visited)
        fclose (not_visited);
    /* (void) unlink (urldb_filename); */
    crremove (urldb_filename);
    xfree (urldb_filename);
    /* (void) unlink (hostdb_filename); */
    crremove (hostdb_filename);
    xfree (hostdb_filename);
    /* (void) unlink (md5db_filename); */
    crremove (md5db_filename);
    xfree (md5db_filename);
("Statistics for URL '%s':\n", tree_root);	while (depth_hist[max_depth] == 0)		max_depth--;	for (i = 0, sum = 0; i < 100; i++) {		if (i > max_depth && depth_hist[i] == 0)			break;		Log ("    Found %8d objects at depth %d\n", depth_hist[i], i);		sum += depth_hist[i];	}	Log ("Total for URL '%s': %8d objects\n", tree_root, sum);	Debug (42, 1, ("httpenum-breadth: exiting (signal %d)\n", x));	exit (0);}/* ---------------------------------------------------------------------- */static voidusage (){	fprintf (stderr, "Usage: httpenum-breadth http-URL\n");	exit (1);}intmain (argc, argv)int argc;char **argv;{	URL *up = NULL;	list_t *l = NULL;	char *url = NULL;	int depth = 0;	debug_init ();		/* from $HARVEST_DEBUG */	for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) {		if (!strncmp (*argv, "-D", 2)) {			debug_flag (*argv);		}	}	if (argc != 1)		usage ();	for (depth = 0; depth < 100; depth++)		depth_hist[depth] = 0;	signal (SIGTERM, sigdie);	/* Die gracefully */	signal (SIGINT, sigdie);	signal (SIGPIPE, sigdie);	/* Quickly clean up on broken pipe */	initialize ();		/* Initialize */	Debug (42, 1, ("httpenum-breadth: Starting...\n"));	/* Grab the RootNode URL from the command line */	if ((up = url_open (*argv)) == NULL || up->type != URL_HTTP) {		usage ();	}	/* Mark the RootNode */	tree_root = xstrdup (up->url);	Tail = &head;	Log ("Processing URL: '%s'\n", tree_root);	printf ("%s\n", up->url);	/* Print tree root */	/*	 *  helpdesk@ecs.soton.ac.uk -- Gatherer visits too many hosts	 *  6/3/96. Make sure the first URL we start with is added	 *  to the list of servers visited.	 */	(void) visit_server (up);	add_to_list (up->url, cur_depth);	/* start at depth = 0 */	url_close (up);	for (l = head; l; l = free_from_list (l)) {		url = (char *) l->ptr;		depth = l->depth;		if (max_depth > 0 && depth > max_depth) {			if (not_visited)				fprintf (not_visited, "[DEPTH] %s\n", url);			Debug (42, 1, ("Maximum Depth of %d Reached: %s\n",				       max_depth, url));			continue;		}		Debug (42, 1, ("Processing: [%2d] %s\n", depth, url));		if ((up = url_open (url)) == NULL)			continue;		if ((up->type != URL_HTTP)) {			Debug (42, 1,			       ("Examining: [%d:%d] %s\n", depth, max_depth,				up->url));			/* filter_selection() checks access_mask */			if (!filter_selection (up) && (depth <= max_depth)) {				/*				 *  Print URL with bogus MD5 to enumerate;				 *  flush to keep pipe moving				 *  URL <tab> MD5				 */				fprintf (stdout, "%s\tDepth=%d:%d\n", up->url,					 depth, max_depth);				fflush (stdout);				Debug (42, 1,				       ("HTTPENUM Re-enumeration: %s\tDepth=%d:%d\n",					up->url, depth, max_depth));			}			url_close (up);			continue;		}		/* search for more links from this one */		if (http_enum (up, depth + 1) != 0 && depth < 100)			/* Increase retrieved count only for new URLs - HrS */			depth_hist[depth]++;		url_close (up);	}	finish_url ();	sigdie (0);	/* NOTREACHED */}
