⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 httpenum-breadth.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
static char rcsid[] =    "$Id: httpenum-breadth.c,v 2.9 2000/02/03 12:45:56 sxw Exp $";
/*
 *  httpenum-breadth.c - Breadth first RootNode URL enumerator for HTTP URLs
 *
 *  Usage: httpenum-breadth http-URL
 *
 *  Outputs the following format:
 *
 *      URL of tree root
 *      URL <tab> md5
 *      ...
 *      URL <tab> md5
 *
 *  DEBUG: section  42, level 1, 5, 9   Gatherer enumeration for HTTP
 *  AUTHOR: Harvest derived
 *
 *  Harvest Indexer http://harvest.sourceforge.net/
 *  -----------------------------------------------
 *
 *  The Harvest Indexer is a continued development of code developed by
 *  the Harvest Project. Development is carried out by numerous individuals
 *  in the Internet community, and is not officially connected with the
 *  original Harvest Project or its funding sources.
 *
 *  Please mail lee@arco.de if you are interested in participating
 *  in the development effort.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
/*  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *
 *  TERMS OF USE
 *
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *
 *  DERIVATIVE WORKS
 *
 *    Users may make derivative works from the Harvest software, subject
 *    to the following constraints:
 *
 *      - You must include the above copyright notice and these
 *        accompanying paragraphs in all forms of derivative works,
 *        and any documentation and other materials related to such
 *        distribution and use acknowledge that the software was
 *        developed at the above institutions.
 *
 *      - You must notify IRTF-RD regarding your distribution of
 *        the derivative work.
 *
 *      - You must clearly notify users that your are distributing
 *        a modified version and not the original Harvest software.
 *
 *      - Any derivative product is also subject to these copyright
 *        and use restrictions.
 *
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *
 *  HISTORY OF FREE SOFTWARE STATUS
 *
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.
 *
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <memory.h>
#include <string.h>
#include <signal.h>
#include <gdbm.h>
#include "util.h"
#include "url.h"
#define PUBLIC extern
#include "filter.h"

/*
 *  One node of the breadth-first work list of candidate URLs.
 *  ptr holds the URL string (heap-allocated), depth its distance from
 *  the tree root.
 */
typedef struct _list_t {
	void *ptr;		/* URL string (char *), owned by this node */
	int depth;		/* enumeration depth relative to the root URL */
	struct _list_t *next;
} list_t;

list_t *head = NULL;		/* front of the candidate list */
list_t **Tail = NULL;		/* pointer to the tail link; NOTE(review): file-scope
				 * value is NULL, so *Tail in add_to_list() assumes it
				 * is pointed at a link (presumably &head) during
				 * startup elsewhere in this file -- confirm on page 2 */

/* List of candidate URL's */
char *filename_candidates;	/* if non-NULL, spool candidates to this file
				 * instead of keeping the list in memory */
int fd_candidates_read = -1, fd_candidates_write = -1;

/*
 *  define HOST_COUNT_IP to 'count' visited hosts based on IP, not the
 *  given hostname.  This way aliased machines will be properly
 *  enumerated
 *  If you define HOST_COUNT_IP then support for HTTP/1.1 virtual hosts
 *  and for hosts that have more than one IP address (for round-robin
 *  load balancing) may not work
 */
#undef HOST_COUNT_IP

/* Global variables */
int max_depth = 0;		/* deepest level to enumerate */
int cur_depth = 0;		/* level currently being processed */
int depth_hist[100];		/* histogram of URLs seen per depth */

/* Local variables */
static int url_max = 0;		/* cap on LeafNode URLs before truncating */
static int nurls = 0;		/* URLs marked retrieved so far */
static int host_max = 0;	/* cap on distinct hosts to visit */
static int nhosts = 0;		/* hosts recorded so far */
static char *tree_root = NULL;	/* URL of the RootNode being enumerated */
static char *urldb_filename = NULL;
static char *hostdb_filename = NULL;
static char *md5db_filename = NULL;
static GDBM_FILE urldbf = NULL;	/* url -> md5 (or "FailedAccess") */
static GDBM_FILE hostdbf = NULL;	/* host (or IP) -> first URL seen there */
static GDBM_FILE md5dbf = NULL;	/* md5 -> url, for duplicate detection */
static FILE *not_visited = NULL;

/* Local functions */
static void usage ();
static void mark_failed ();
static void mark_retrieved ();
static void sigdie ();
static int url_in_db ();
static int md5_in_db ();
static int http_enum ();
extern int RobotsTxtCheck _PARAMS ((URL *));

/*
 *  free_from_list() - Release list node 'l' and return the next candidate.
 *
 *  With a candidates spool file, the next node is re-materialized from the
 *  file ('l' may then be NULL/0 -- see add_to_list()); otherwise the
 *  in-memory successor l->next is returned.  Records in the spool file are
 *  <depth:int><len:int><url:len bytes>.
 *
 *  NOTE(review): the second and third read() return values are unchecked;
 *  a truncated spool file would leave 'len' or the URL bytes
 *  uninitialized.  Only the first read() is checked for EOF/error.
 */
list_t *free_from_list (l)
list_t *l;
{
	list_t *r = NULL;
	int len;
	if (filename_candidates) {
		if (fd_candidates_read == -1)
			fd_candidates_read =
			    open (filename_candidates, O_RDONLY);
		if (fd_candidates_read > -1) {
			/* <= 0 covers both EOF and read error: end of spool */
			r = (list_t *) xmalloc (sizeof (list_t));
			if (read
			    (fd_candidates_read, &(r->depth), sizeof (r->depth))
			    <= 0) {
				xfree (r);
				return NULL;
			}
			read (fd_candidates_read, &(len), sizeof (len));
			r->ptr = (void *) xmalloc (len + 1);
			read (fd_candidates_read, r->ptr, len);
			((char *) r->ptr)[len] = '\0';
		}
	} else
		r = l->next;	/* in-memory mode: caller guarantees l != NULL here */
	if (l) {
		xfree (l->ptr);
		xfree (l);
	}
	return r;
}

/*
 *  add_to_list() - Append a candidate URL (with its depth) to the
 *  breadth-first work list.
 *
 *  Spool-file mode: the record is appended to filename_candidates and, if
 *  the list head is empty, the head is primed by reading the first record
 *  back (free_from_list(0)).  In-memory mode: a node is linked at *Tail.
 *
 *  NOTE(review): the spool file is opened O_TRUNC, so any pre-existing
 *  candidates file is clobbered on first write; write() return values are
 *  unchecked (short writes would corrupt the record stream); and the
 *  in-memory branch dereferences Tail, which must have been pointed at a
 *  link before the first call -- TODO confirm against the rest of the file.
 */
void
add_to_list (url, depth)
char *url;
int depth;
{
	list_t *l = NULL;
	int len;
	if (filename_candidates) {
		if (fd_candidates_write == -1)
			fd_candidates_write = open (filename_candidates,
						    O_WRONLY | O_CREAT |
						    O_TRUNC, 0600);
		if (fd_candidates_write > -1) {
			len = strlen (url);
			write (fd_candidates_write, &depth, sizeof (depth));
			write (fd_candidates_write, &len, sizeof (len));
			write (fd_candidates_write, url, len);
		}
		if (!head)
			head = free_from_list (0);
		return;
	}
	/* No candidates file - keep list in memory. */
	l = (list_t *) xmalloc (sizeof (list_t));
	l->ptr = (void *) xstrdup (url);
	l->next = (list_t *) NULL;
	l->depth = depth;
	*Tail = l;
	Tail = &(l->next);
	return;
}

/* ---------------------------------------------------------------------- */

/*
 *  mark_failed() - Mark that a URL failed to be retrieved, so that the
 *  enumerator doesn't try it again. This option may not be wanted by
 *  some users and so should be configurable.
 *
 *  Stores url -> "FailedAccess" in the URL db (keys include the
 *  terminating NUL).  fatal() aborts on a gdbm store error.
 */
static void
mark_failed (URL * up)
{
	datum k, d;
	Debug (42, 9, ("mark_failed: url='%s'", up->url));
	k.dptr = xstrdup (up->url);
	k.dsize = strlen (k.dptr) + 1;
	d.dptr = xstrdup ("FailedAccess");
	d.dsize = strlen (d.dptr) + 1;
	/* insert only if absent; gdbm_store != 0 here is a real error */
	if (!gdbm_exists (urldbf, k) && gdbm_store (urldbf, k, d, GDBM_INSERT))
		fatal ("GDBM URLDB: %s: %s", k.dptr,
		       gdbm_strerror (gdbm_errno));
	xfree (k.dptr);
	xfree (d.dptr);
}

/*
 *  mark_retrieved() - Mark that the given URL was successfully retrieved,
 *  so that the URL is not retrieved again.  This prevents cycles in the
 *  enumeration.
 *
 *  Records url -> md5 in the URL db and md5 -> url in the MD5 db; if the
 *  page was reached via a redirect, the originating URL is recorded too.
 *  Once url_max URLs have been recorded, the enumeration is truncated and
 *  the process shuts down via sigdie(0).
 */
static void
mark_retrieved (up)
URL *up;
{
	datum k, d;
	Debug (42, 9,
	       ("mark_retrieved: url='%s', md5='%s'\n", up->url, up->md5));
	k.dptr = xstrdup (up->url);
	k.dsize = strlen (k.dptr) + 1;
	d.dptr = xstrdup (up->md5);
	d.dsize = strlen (d.dptr) + 1;
	if (!gdbm_exists (urldbf, k) && gdbm_store (urldbf, k, d, GDBM_INSERT))
		fatal ("GDBM URLDB: %s: %s", k.dptr,
		       gdbm_strerror (gdbm_errno));
	/* reverse mapping: md5 is the key, url the value */
	if (!gdbm_exists (md5dbf, d) && gdbm_store (md5dbf, d, k, GDBM_INSERT))
		fatal ("GDBM MD5DB: %s: %s", k.dptr,
		       gdbm_strerror (gdbm_errno));
	xfree (k.dptr);
	xfree (d.dptr);
	if (up->redir_from_url != (char *) NULL) {
		/* also mark the pre-redirect URL with the same content md5 */
		Debug (42, 9, ("mark_retrieved: url='%s', md5='%s'\n",
			       up->redir_from_url, up->md5));
		k.dptr = xstrdup (up->redir_from_url);
		k.dsize = strlen (k.dptr) + 1;
		d.dptr = xstrdup (up->md5);
		d.dsize = strlen (d.dptr) + 1;
		if (!gdbm_exists (urldbf, k)
		    && gdbm_store (urldbf, k, d, GDBM_INSERT))
			fatal ("GDBM URLDB: %s: %s", k.dptr,
			       gdbm_strerror (gdbm_errno));
		xfree (k.dptr);
		xfree (d.dptr);
	}
	if (nurls++ >= url_max) {
		/* hit the LeafNode cap: log, release the URL, and exit */
		Log ("Truncating RootNode %s at %d LeafNode URLs\n",
		     tree_root, url_max);
		url_close (up);
		up = NULL;
		sigdie (0);
	}
}

/*
 *  url_in_db() - check to see if the URL is in the database
 *
 *  Returns non-zero if 'url' already has an entry (retrieved or failed).
 */
static int
url_in_db (url)
char *url;
{
	datum k;
	int r;
	Debug (42, 9, ("url_in_db: checking for url='%s'\n", url));
	k.dptr = xstrdup (url);
	k.dsize = strlen (k.dptr) + 1;
	r = gdbm_exists (urldbf, k);
	xfree (k.dptr);
	return (r);
}

/*
 *  md5_in_db() - check to see if the MD5 is in the database
 *
 *  Non-zero means content with this checksum was already seen (duplicate).
 */
static int
md5_in_db (md5)
char *md5;
{
	datum k;
	int r;
	k.dptr = xstrdup (md5);
	k.dsize = strlen (k.dptr) + 1;
	r = gdbm_exists (md5dbf, k);
	xfree (k.dptr);
	return (r);
}

/*
 *  host_in_db() - check to see if the host is in the database
 *
 *  With HOST_COUNT_IP the key is the host's dotted IP address (so DNS
 *  aliases collapse to one entry); otherwise the hostname as given.
 */
static int
host_in_db (host)
char *host;
{
	datum k;
	int r;
#ifdef HOST_COUNT_IP
	Host *h;
	h = get_host (host);
	if (!h)
		return 0;	/* unresolvable host: treat as not in db */
	k.dptr = xstrdup (h->dotaddr);
#else
	k.dptr = xstrdup (host);
#endif
	k.dsize = strlen (k.dptr) + 1;
	r = gdbm_exists (hostdbf, k);
	xfree (k.dptr);
	return (r);
}

/*
 *  visit_server() - Determine if we should visit the server.  Return
 *  zero if we should not process the URL; otherwise, return non-zero.
 *
 *  A known host is always allowed.  A new host is allowed only while the
 *  host_max budget lasts, and is then recorded in the host db keyed by
 *  hostname (or IP under HOST_COUNT_IP) with the triggering URL as value.
 */
static int
visit_server (up)
URL *up;
{
	datum k, d;
#ifdef HOST_COUNT_IP
	Host *h = NULL;
#endif
	if (host_in_db (up->host))	/* Host is already in the db */
		return (1);
	if (++nhosts > host_max)	/* new host, but budget exhausted */
		return (0);
#ifdef HOST_COUNT_IP
	h = get_host (up->host);
	if (!h)
		return (0);
	k.dptr = xstrdup (h->dotaddr);
#else
	k.dptr = xstrdup (up->host);
#endif
	k.dsize = strlen (k.dptr) + 1;
	d.dptr = xstrdup (up->url);
	d.dsize = strlen (d.dptr) + 1;
	if (gdbm_store (hostdbf, k, d, GDBM_INSERT))
		fatal ("GDBM HOSTDB: %s: %s", k.dptr,
		       gdbm_strerror (gdbm_errno));
	xfree (k.dptr);
	xfree (d.dptr);
	return (1);
}

/*
 *  url_is_allowed() - (definition continues beyond this chunk)
 */
int
url_is_allowed (url)
char *url;
{
	URL *tup = NULL;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -