📄 httpenum-depth.c

📁 Harvest is a robot that downloads HTML web pages
💻 C
📖 Page 1 of 2
static char rcsid[] = "$Id: httpenum-depth.c,v 2.8 2000/02/03 12:45:56 sxw Exp $";

/*
 *  httpenum-depth.c - Depth First RootNode URL enumerator for HTTP URLs
 *
 *  Usage: httpenum-depth http-URL
 *
 *  Outputs the following format:
 *
 *      URL of tree root
 *      URL <tab> md5
 *      ...
 *      URL <tab> md5
 *
 *  DEBUG: section 42, level 1, 5, 9   Gatherer enumeration for HTTP
 *  AUTHOR: Harvest derived
 *
 *  Harvest Indexer http://harvest.sourceforge.net/
 *  -----------------------------------------------
 *
 *  The Harvest Indexer is a continued development of code developed by
 *  the Harvest Project. Development is carried out by numerous individuals
 *  in the Internet community, and is not officially connected with the
 *  original Harvest Project or its funding sources.
 *
 *  Please mail lee@arco.de if you are interested in participating
 *  in the development effort.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include <stdio.h>
#include <string.h>
#include <signal.h>
#include <stdlib.h>
#include <gdbm.h>

#include "util.h"
#include "url.h"
#define PUBLIC extern
#include "filter.h"

typedef struct _list_t {
    void *ptr;
    struct _list_t *next;
} list_t;

/*
 *  Define HOST_COUNT_IP to 'count' visited hosts based on IP, not the
 *  given hostname.  This way aliased machines will be properly
 *  enumerated.  If you define HOST_COUNT_IP, then support for HTTP/1.1
 *  virtual hosts and for hosts that have more than one IP address
 *  (for round-robin load balancing) may not work.
 */
#undef HOST_COUNT_IP

/* Global variables */
int max_depth = 0;
int cur_depth = 0;
int depth_hist[100];

/* Local variables */
static int url_max = 0;
static int nurls = 0;
static int host_max = 0;
static int nhosts = 0;
static char *tree_root = NULL;
static char *urldb_filename = NULL;
static char *hostdb_filename = NULL;
static char *md5db_filename = NULL;
static GDBM_FILE urldbf = NULL;
static GDBM_FILE hostdbf = NULL;
static GDBM_FILE md5dbf = NULL;
static FILE *not_visited = NULL;

/* Local functions */
static void process_url ();
static void usage ();
static void mark_retrieved ();
static void mark_failed ();
static void sigdie ();
static int url_in_db ();
static int md5_in_db ();
static list_t *http_enum ();

/* From robots-txt.c */
extern int RobotsTxtCheck _PARAMS ((URL *));

/* ---------------------------------------------------------------------- */

/*
 *  mark_failed() - Mark that a URL failed to be retrieved, so that the
 *  enumerator doesn't try it again.  This option may not be wanted by
 *  some users and so should be configurable.
 */
static void
mark_failed (URL * up)
{
    datum k, d;

    Debug (42, 9, ("mark_failed: url='%s'", up->url));
    k.dptr = xstrdup (up->url);
    k.dsize = strlen (k.dptr) + 1;
    d.dptr = xstrdup ("FailedAccess");
    d.dsize = strlen (d.dptr) + 1;
    if (!gdbm_exists (urldbf, k) && gdbm_store (urldbf, k, d, GDBM_INSERT))
        fatal ("GDBM URLDB: %s: %s", k.dptr, gdbm_strerror (gdbm_errno));
    xfree (k.dptr);
    xfree (d.dptr);
}

/*
 *  mark_retrieved() - Mark that the given URL was successfully retrieved,
 *  so that the URL is not retrieved again.  This prevents cycles in the
 *  enumeration.
 */
static void
mark_retrieved (up)
URL *up;
{
    datum k, d;

    Debug (42, 9,
           ("mark_retrieved: url='%s', md5='%s'\n", up->url, up->md5));
    k.dptr = xstrdup (up->url);
    k.dsize = strlen (k.dptr) + 1;
    d.dptr = xstrdup (up->md5);
    d.dsize = strlen (d.dptr) + 1;
    if (!gdbm_exists (urldbf, k) && gdbm_store (urldbf, k, d, GDBM_INSERT))
        fatal ("GDBM URLDB: %s: %s", k.dptr, gdbm_strerror (gdbm_errno));
    if (!gdbm_exists (md5dbf, d) && gdbm_store (md5dbf, d, k, GDBM_INSERT))
        fatal ("GDBM MD5DB: %s: %s", k.dptr, gdbm_strerror (gdbm_errno));
    xfree (k.dptr);
    xfree (d.dptr);
    if (up->redir_from_url != (char *) NULL) {
        Debug (42, 9, ("mark_retrieved: url='%s', md5='%s'\n",
                       up->redir_from_url, up->md5));
        k.dptr = xstrdup (up->redir_from_url);
        k.dsize = strlen (k.dptr) + 1;
        d.dptr = xstrdup (up->md5);
        d.dsize = strlen (d.dptr) + 1;
        if (!gdbm_exists (urldbf, k)
            && gdbm_store (urldbf, k, d, GDBM_INSERT))
            fatal ("GDBM URLDB: %s: %s", k.dptr,
                   gdbm_strerror (gdbm_errno));
        xfree (k.dptr);
        xfree (d.dptr);
    }
    if (nurls++ >= url_max) {
        Log ("Truncating RootNode %s at %d LeafNode URLs\n",
             tree_root, url_max);
        url_close (up);
        up = NULL;
        sigdie (0);
    }
}

/*
 *  url_in_db() - check to see if the URL is in the database
 */
static int
url_in_db (url)
char *url;
{
    datum k;
    int r;

    Debug (42, 9, ("url_in_db: checking for url='%s'\n", url));
    k.dptr = xstrdup (url);
    k.dsize = strlen (k.dptr) + 1;
    r = gdbm_exists (urldbf, k);
    xfree (k.dptr);
    return (r);
}

/*
 *  md5_in_db() - check to see if the MD5 is in the database
 */
static int
md5_in_db (md5)
char *md5;
{
    datum k;
    int r;

    k.dptr = xstrdup (md5);
    k.dsize = strlen (k.dptr) + 1;
    r = gdbm_exists (md5dbf, k);
    xfree (k.dptr);
    return (r);
}

/*
 *  host_in_db() - check to see if the host is in the database
 */
static int
host_in_db (host)
char *host;
{
    datum k;
    int r;
#ifdef HOST_COUNT_IP
    Host *h;

    h = get_host (host);
    if (!h)
        return 0;
    k.dptr = xstrdup (h->dotaddr);
#else
    k.dptr = xstrdup (host);
#endif
    k.dsize = strlen (k.dptr) + 1;
    r = gdbm_exists (hostdbf, k);
    xfree (k.dptr);
    return (r);
}

/*
 *  visit_server() - Determine if we should visit the server.  Return
 *  zero if we should not process the URL; otherwise, return non-zero.
 */
static int
visit_server (up)
URL *up;
{
    datum k, d;
#ifdef HOST_COUNT_IP
    Host *h = NULL;
#endif

    if (host_in_db (up->host))  /* Host is already in the db */
        return (1);
    if (++nhosts > host_max)
        return (0);
#ifdef HOST_COUNT_IP
    h = get_host (up->host);
    if (!h)
        return (0);
    k.dptr = xstrdup (h->dotaddr);
#else
    k.dptr = xstrdup (up->host);
#endif
    k.dsize = strlen (k.dptr) + 1;
    d.dptr = xstrdup (up->url);
    d.dsize = strlen (d.dptr) + 1;
    if (gdbm_store (hostdbf, k, d, GDBM_INSERT))
        fatal ("GDBM HOSTDB: %s: %s", k.dptr, gdbm_strerror (gdbm_errno));
    xfree (k.dptr);
    xfree (d.dptr);
    return (1);
}

int
url_is_allowed (up)
URL *up;
{
    int y;

    Debug (42, 1, ("Checking URL: %s\nHost: %s\n", up->url, up->host));
    if (url_in_db (up->url)) {  /* Have we been here? */
        Debug (42, 1, ("Already Visited URL: %s\n", up->url));
        return 0;
    }
    if (y = filter_selection (up)) {    /* Match the URL based on REs */
        Debug (42, 1, ("Removing Candidate: [%s] %s\n",
                       Filter_Type_Name[y], up->url));
        if (not_visited)
            fprintf (not_visited, "[FILTER] %s\n", up->url);
        return 0;
    }
    if (!visit_server (up)) {   /* Can we visit this server? */
        Debug (42, 1, ("Disallowed to Visit Server: %s\n", up->url));
        if (not_visited)
            fprintf (not_visited, "[SERVER] %s\n", up->url);
        return 0;
    }
    if (!RobotsTxtCheck (up)) { /* Is it disallowed by the robots.txt file? */
        Debug (42, 1, ("Disallowed by robots.txt file: %s\n", up->url));
        if (not_visited)
            fprintf (not_visited, "[ROBOTS.TXT] %s\n", up->url);
        return 0;
    }
    return 1;
}

/*
 *  http_enum() - Returns a linked list of all the URLs in this object,
 *  or NULL on error.  Checks for "text/html" and "text/x-soif" in MIME
 *  headers and then runs "HTMLurls" on the file or urldb_getrefs() on
 *  the URL.
 */
static list_t *
http_enum (up)
URL *up;
{
    list_t *head = NULL;
    list_t **Tail = NULL;
    list_t *l = NULL;
    FILE *fp = NULL;
    char *s = NULL;
    char *t0 = NULL;
    char *t1 = NULL;
    char *t2 = NULL;
    URL *tup = NULL;
    char buf[BUFSIZ];
    int pipefds[2];
    int pid;
    int err;
    int status;
    int count = 0;
    int nurls = 0;

    /* Check to see if we're allowed to visit the URL */
    if (!url_is_allowed (up))
        return (NULL);

    /*
     *  Ack.  Check for symbolic link loops in server-generated HTML
     *  listings.  Do this by comparing the last two pathname components.
     *  If they are the same, then guess it's a loop.
     */
    s = xstrdup (up->pathname);
    t0 = t1 = t2 = NULL;
    for (t0 = strtok (s, "/"); t0; t0 = strtok (NULL, "/")) {
        t2 = t1;
        t1 = t0;
    }
    if (t1 != NULL && t2 != NULL) {
        if (strcmp (t1, t2) == 0) {
            Debug (42, 1, ("Possible symlink loop: %s\n", up->url));
            xfree (s);
            s = NULL;
            return (NULL);
        }
    }
    xfree (s);
    s = NULL;

    /* Recurse over the redirect chain */
    while (((status = url_retrieve (up)) == -1)
           && count < HTTP_MAX_REDIRECTS) {
        count++;
        if (!url_is_allowed (up))
            return (NULL);
    }
    if (status) {               /* Grab the URL; success? */
        Debug (42, 1, ("Cannot Retrieve URL: %s\n", up->url));

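A note on the GDBM idiom above: mark_failed() and mark_retrieved() both build a datum whose dsize includes the terminating NUL, skip the write when gdbm_exists() already knows the key, and store with GDBM_INSERT so an existing entry is never overwritten. The sketch below compiles against libgdbm alone and reproduces just that pattern; the database name visited.db and the helper mark_once() are invented for this example, where the real code goes through urldb_filename, xstrdup(), and fatal().

#include <gdbm.h>
#include <stdio.h>
#include <string.h>

/*
 *  mark_once() mirrors the store pattern in mark_failed()/mark_retrieved():
 *  dsize counts the trailing NUL, and because gdbm_exists() is tested
 *  first, a non-zero return from gdbm_store() here is a real error, not
 *  a duplicate-key refusal.
 */
static void
mark_once (GDBM_FILE dbf, char *url, char *tag)
{
    datum k, d;

    k.dptr = url;
    k.dsize = strlen (url) + 1;
    d.dptr = tag;
    d.dsize = strlen (tag) + 1;
    if (!gdbm_exists (dbf, k) && gdbm_store (dbf, k, d, GDBM_INSERT))
        fprintf (stderr, "GDBM: %s: %s\n", url, gdbm_strerror (gdbm_errno));
}

int
main (void)
{
    datum q;
    /* "visited.db" stands in for the real urldb_filename */
    GDBM_FILE dbf = gdbm_open ("visited.db", 0, GDBM_WRCREAT, 0644, NULL);

    if (!dbf) {
        fprintf (stderr, "gdbm_open: %s\n", gdbm_strerror (gdbm_errno));
        return 1;
    }
    mark_once (dbf, "http://www.example.com/", "FailedAccess");
    mark_once (dbf, "http://www.example.com/", "FailedAccess");  /* no-op: already stored */
    q.dptr = "http://www.example.com/";
    q.dsize = strlen (q.dptr) + 1;
    printf ("seen: %d\n", gdbm_exists (dbf, q));  /* prints "seen: 1" */
    gdbm_close (dbf);
    return 0;
}

Including the NUL in dsize is a convention, not a GDBM requirement: keys written this way are only matched by lookups that add the same +1, which is why every gdbm_exists() call in the file does so.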
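The symlink-loop guard in http_enum() is also worth isolating, since it is the one piece of crawl heuristics visible on this page of the listing: tokenize the pathname on '/' and, when the last two components are identical, guess that a server-generated index is following a link back into itself. Below is the same logic with the Harvest wrappers replaced by plain libc; path_looks_looped() is a name invented for this sketch (the original operates on up->pathname with xstrdup()/xfree()).

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 *  Return 1 when the last two pathname components are identical -- the
 *  same guess http_enum() makes for listings like /pub/pub/pub/... that
 *  a self-referential symlink can generate.
 */
static int
path_looks_looped (const char *pathname)
{
    char *s = strdup (pathname);        /* strtok() modifies its argument */
    char *t0, *t1 = NULL, *t2 = NULL;
    int looped;

    for (t0 = strtok (s, "/"); t0; t0 = strtok (NULL, "/")) {
        t2 = t1;                        /* t2 trails t1 by one component */
        t1 = t0;
    }
    looped = (t1 != NULL && t2 != NULL && strcmp (t1, t2) == 0);
    free (s);
    return looped;
}

int
main (void)
{
    printf ("%d\n", path_looks_looped ("/pub/gnu/gnu/"));   /* 1: likely loop */
    printf ("%d\n", path_looks_looped ("/pub/gnu/gdbm/"));  /* 0: fine */
    return 0;
}

The heuristic is deliberately cheap: it needs no extra network traffic, and it trades occasional false positives (a legitimate /archive/archive/ path is skipped) for catching the most common self-referential listings before they recurse.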