⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 enum.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
static char rcsid[] = "$Id: enum.c,v 2.2 1997/08/27 18:06:28 sxw Exp $";/* *  enum.c - Enumerate RootNode URLs into LeafNode URLs.  Also produce *  Timestamps or MD5s for each LeafNode URL to support incremental *  Gatherering (using the dbcheck program). * *  Input: *    URL Option1 ... Option N * *  Output: *    URL\tMD5:xxx *    URL\tLast-Modification-Time:xxx * *  Usage:  enum [(-db | -del | -log | -tmpdb) file] [-Ds,l] [-delete] < url_enum_list * *  DEBUG: section  40, level 1, 5, 9   Gatherer URL enumeration *  AUTHOR: Harvest derived * *  Harvest Indexer http://harvest.sourceforge.net/ *  ----------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail lee@arco.de if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. 
* *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. 
* *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. 
* */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <memory.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <signal.h>
#include <gdbm.h>
#include "util.h"
#include "url.h"
#include "template.h"

/* Local variables */

/* Easily define how to enumerate with the various access methods */
#define Arg_URL         1	/* just give the url as the arg */
#define Arg_Host_File   2	/* 'host path' are the args */
#define Arg_Path        3	/* 'path' is the only arg */

/* One entry per URL scheme: which external enumerator program to run
 * (breadth- vs depth-first), which stamp attribute it yields, and how
 * its command-line arguments are formed. */
struct enumerators {		/* table is currently ignored */
	int url_type;		/* type this enumerator handles */
	char *url_enum_breadth;	/* the pgm for breadth first searches */
	char *url_enum_depth;	/* the pgm for depth first searches */
	char *special;		/* special flags if any */
	char *stamp_kind;	/* proddb label for the stamp */
	int arg_types;		/* sigh, need until all conform */
} enums[] = {
	{URL_FILE, "fileenum", "fileenum", "", T_TIMESTAMP, Arg_URL},
	{URL_FTP, "ftpenum", "ftpenum", "", T_TIMESTAMP, Arg_URL},
	{URL_GOPHER, "gopherenum-breadth", "gopherenum-depth", "", T_MD5,
		    Arg_URL},
	{URL_HTTP, "httpenum-breadth", "httpenum-depth", "", T_MD5, Arg_URL},
	{URL_NEWS, "newsenum", "newsenum", "", T_MD5, Arg_URL}
};

#define DBFILE  	"PRODUCTION.gdbm"	/* default proddb */
#define DBNEWFILE 	"tmpdb.gdbm"	/* tmp log of all urls enumed */
#define DELFILE 	"url.del"	/* Stale URLs in proddb */
#define LOGFILE   	"enum.log"	/* default debug logfile */

static char *tree_root = NULL;	/* not referenced in this chunk */
static char *addfile = NULL;	/* optional output file for adds (-add) */
static char *delfile = DELFILE;	/* output file for stale URLs (-del) */
static char *proddbfile = DBFILE;	/* production GDBM database (-db) */
static char *logfile = LOGFILE;	/* debug log file (-log) */
static char *tmpdbfile = DBNEWFILE;	/* temporary GDBM database (-tmpdb) */
static char *cur_attr;		/* set to md5 or time's label */
static char *root;		/* set to root url of enum space */
static int find_deleted = 0;	/* find any deleted objects */
static char url[BUFSIZ];	/* URL */
static int url_max;		/* Option 1 */
static char url_filter[BUFSIZ];	/* Option 2 */
static int host_max;		/* Option 3 */
static char host_filter[BUFSIZ];	/* Option 4 */
static int delay;		/* Option 5 */
static int max_depth;		/* Option 6 */
static int cur_depth;		/* Option 6 */
static char access_types[64];	/* Option 7 */
static char user_enum_pgm[BUFSIZ];	/* Option 8 */
static char search_type[BUFSIZ];	/* Option 9 */

FILE *fadd;			/* the add filehandles (stdout) */
FILE *fdel;			/* the deleted urls */
FILE *flog;			/* log output */
GDBM_FILE proddb;		/* The Gatherer's production database */
GDBM_FILE newdb;		/* A Temporary db to store timestamps */

/* Singly linked list node; ns_head/ns_tail form a FIFO of buffers */
typedef struct _nodespec {
	char *buf;
	struct _nodespec *next;
} nodespec;
static nodespec *ns_head = 0, *ns_tail = 0;

/* Child-process bookkeeping for a user-supplied enumerator program */
typedef struct _cpi {
	int childpid;
	char *result_file;
	FILE *wfp;
} cpi;

/* Local functions */
static int do_an_enumeration ();
static int get_data ();
static int do_kid ();
static void usage ();
static void init ();
static void finish ();
static void process_url_stamp ();
static void do_add ();
static void add_to_newdb ();
static void add_to_todo ();
static void enum_to_get_deleted ();
static void set_envs ();
static cpi *create_user_enum_pgm_pipes ();

/*
 *  init() - Initializes the overall Enumeration process.
 *  Parses the command-line flags, opens the log stream (falling back to
 *  $HARVEST_GATHERER_LOGFILE, then stderr), opens the production database
 *  read-only, and opens the add/delete output streams.
 */
static void
init (argc, argv)
int argc;
char *argv[];
{
	debug_init ();
	/* consume leading '-' options; each file-taking flag eats one
	 * extra argv entry, and a missing argument triggers usage() */
	for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) {
		if (strncmp (*argv, "-D", 2) == 0) {
			debug_flag (*argv);	/* -Ds,l debug section/level */
		} else if (strcmp (*argv, "-delete") == 0) {
			find_deleted = 1;
		} else if (strcmp (*argv, "-db") == 0) {
			if (--argc < 1)
				usage ();
			proddbfile = xstrdup (*++argv);
		} else if (strcmp (*argv, "-add") == 0) {
			if (--argc < 1)
				usage ();
			addfile = xstrdup (*++argv);
		} else if (strcmp (*argv, "-tmpdb") == 0) {
			if (--argc < 1)
				usage ();
			tmpdbfile = xstrdup (*++argv);
		} else if (strcmp (*argv, "-del") == 0) {
			if (--argc < 1)
				usage ();
			delfile = xstrdup (*++argv);
		} else if (strcmp (*argv, "-log") == 0) {
			if (--argc < 1)
				usage ();
			logfile = xstrdup (*++argv);
		} else {
			usage ();
		}
	}
	/* open the log; fall back to the env-named file, then stderr */
	if ((flog = fopen (logfile, "a+")) == NULL) {
		if (getenv ("HARVEST_GATHERER_LOGFILE") != (char *) NULL)
			flog =
			    fopen (getenv ("HARVEST_GATHERER_LOGFILE"), "a+");
		if (flog == (FILE *) NULL)
			flog = stderr;
	}
	init_log3 ("enum", flog, stderr);

	/*
	 *  open the main proddb and output files; If proddb file is not found,
	 *  we assume a new proddb (gdbm_open result may be NULL and is
	 *  tolerated elsewhere)
	 */
	Debug (40, 9, ("using: %s: for production db\n", proddbfile));
	proddb = gdbm_open (proddbfile, 0, GDBM_READER, 0644, 0);
	fadd = stdout;		/* adds go to stdout unless -add given */
	if (addfile != NULL) {
		Debug (40, 9, ("using: %s: for addfile\n", addfile));
		if ((fadd = fopen (addfile, "w")) == NULL) {
			log_errno (addfile);
			fatal ("Internal enum error 2.\n");
		}
	}
	fdel = NULL;		/* only opened when -delete was requested */
	if (find_deleted) {
		Debug (40, 9, ("using: %s: for delfile\n", delfile));
		if ((fdel = fopen (delfile, "w")) == NULL) {
			log_errno (delfile);
			fatal ("Internal enum error 3.\n");
		}
	}
}

/*
 *  finish() - close the production db and every output stream opened
 *  by init().  Note fadd may be stdout and flog may be stderr here.
 */
static void
finish ()
{
	if (proddb != NULL)
		gdbm_close (proddb);
	if (fdel != NULL)
		fclose (fdel);
	fclose (fadd);
	fclose (flog);
}

/*
 *  process_url_stamp() - Check in the database for the URL name with
 *  timestamp/md5 stamp.  If the URL is registered in the database
 *  and the timestamps are the same, then ignore.  If they're different
 *  then add the name to the todo list.
*/
static void
process_url_stamp (name, stamp)
char *name, *stamp;
{
	datum key, d;
	Template *template;
	AVPair *avp;

	/* look the URL up in the production database */
	key.dptr = name;
	key.dsize = strlen (name);
	d = gdbm_fetch (proddb, key);
	if (d.dptr != NULL) {
		/* parse the stored SOIF template and compare its stamp
		 * attribute (cur_attr: MD5 or Last-Modification-Time) */
		init_parse_template_string (d.dptr, d.dsize);
		if ((template = parse_template ()) == NULL)
			fatal ("Cannot parse template with %s\n", key.dptr);
		finish_parse_template ();
		free (d.dptr);
		avp = extract_AVPair (template->list, cur_attr);
		if ((avp != NULL) && !strcmp (stamp, avp->value)) {
			/* stamp unchanged -> object up to date; skip it */
			free_template (template);
			return;
		}
		free_template (template);
	}
	/* new or modified URL: emit it on the add stream */
	do_add (name, stamp);
}

/*
 *  do_add() - save the URL/timestamp/md5 data in the to-add pile
 *  (one "URL\tattr:stamp" line on fadd).
 */
static void
do_add (name, stamp)
char *name;			/* the url to 'add' */
char *stamp;			/* the md5/timestamp from enum */
{
	Debug (40, 9, ("do_add: Saving %s, %s\n", name, stamp));
	fprintf (fadd, "%s\t%s:%s\n", name, cur_attr, stamp);
	fflush (fadd);		/* MUST flush */
}

/*
 *  add_to_newdb() - record a visited URL in the temporary db so that
 *  enum_to_get_deleted() can later tell which proddb entries were NOT
 *  revisited during this run.
 */
static void
add_to_newdb (name, timestamp)	/* save to my proddb */
char *name, *timestamp;
{
	datum k;		/* the key  */
	datum d;		/* the data */

	k.dptr = name;		/* the key is url */
	k.dsize = strlen (name) + 1;	/* save the null too */
	d.dptr = timestamp;
	d.dsize = strlen (timestamp) + 1;
	Debug (40, 9, ("add_to_newdb: Marking %s\n", name));
	/* GDBM_INSERT returns non-zero if the key already exists */
	if (gdbm_store (newdb, k, d, GDBM_INSERT))
		Log ("Warning: Ignoring re-visited URL: %s\n", name);
}

/*
 *  enum_to_get_deleted() - walk every key of the production db that
 *  lies under the root URL; any key absent from the temporary db was
 *  not seen this run and is written to the delete file (if open).
 */
static void
enum_to_get_deleted ()
{				/* get urls to delete */
	datum k, nk;		/* current key in both proddb's */
	datum d;		/* data (if any) in new proddb */
	int rootlen;

	rootlen = strlen (root);	/* don't include null here */
	k = gdbm_firstkey (proddb);	/* get first key in ess proddb */
	while (k.dptr != NULL) {	/* while there are keys */
		if (strncmp (k.dptr, root, rootlen) == 0) {
			/* if in root url */
			d = gdbm_fetch (newdb, k);
			/* see if I've seen this url */
			if (d.dptr == NULL && fdel != NULL)
				/* if not, then it's old! */
				fprintf (fdel, "%s\n", k.dptr);
			if (d.dptr != NULL)
				free (d.dptr);
		}
		nk = gdbm_nextkey (proddb, k);	/* fetch next before freeing */
		free (k.dptr);
		k = nk;
	}
}

/*
 *  create_user_enum_pgm_pipes() - fork the user-supplied enumeration
 *  program with a pipe on its end and its output sent to a temp file.
 *  NOTE(review): this listing is truncated at the page break -- the
 *  remainder of this function is on the next page and is not visible
 *  here; only the start appears below.
 */
static cpi *
create_user_enum_pgm_pipes (enum_arg)
char *enum_arg;			/* the URL from which we are doing this enumeration */
{
	int pfd[2], pid, fd;
	char *argv[64], buf[BUFSIZ], *tmpfile;
	static cpi *cpinfo;	/* need static to return var */

	tmpfile = xstrdup (tempnam (NULL, "uep"));
	Debug (40, 1, ("RUNNING enumerator: %s %s > %s\n", user_enum_pgm,
		       enum_arg, tmpfile));
	cpinfo = NULL;
	if (pipe (pfd) < 0) {
		fatal_errno ("pipe");
	}
	/* need to use fork() since parse_argv() will cause mem leak */
	if ((pid = fork ()) < 0) {
		fatal_errno ("fork");
	}
	if (pid == 0) {
		/* child */
		if ((fd = creat (tmpfile, 0664)) < 0) {
			fatal_errno (tmpfile);
		}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -