⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 main.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
static char rcsid[] = "$Id: main.c,v 2.2 1997/08/27 18:09:31 sxw Exp $";/* *  main.c - User front-end for the Essence system. * *  DEBUG: section  62, level 1         Gatherer essence main *  AUTHOR: Harvest derived * *  Harvest Indexer http://www.tardis.ed.ac.uk/harvest/ *  --------------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail harvest@tardis.ed.ac.uk if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. 
*          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. 
* *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. * */#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <fcntl.h>#include <string.h>#include <memory.h>#include <signal.h>#include <sys/param.h>#include <sys/types.h>#include <netdb.h>#include <locale.h>#include "util.h"#define MAIN#include "essence.h"#undef MAIN#if defined(DEBUG) && defined(_HARVEST_OSF_)#include <malloc.h>#endif/* Local variables */static char byname[MAXPATHLEN + 1];	/* file naming config file */static char byurl[MAXPATHLEN + 1];	/* URL naming config file */static char bycontent[MAXPATHLEN + 1];	/* file content config file */static char magic[MAXPATHLEN + 1];	/* file(1) magic file */static char *dbdir = NULL;	/* directory to put database */static char *libpath = NULL;	/* directories to put config files */static char *logfile = NULL;	/* file to log messages to */static char *input_file = NULL;	/* file from which to get input files */static char *gname = NULL;	/* Gatherer name */static char *ghost = NULL;	/* Gatherer host */static char *gver = NULL;	/* Gatherer version */static int max_deletions = 0;	/* # of GDBM deletions before reorg 
*/static char *pp_rules_file = 0;	/* Rules for Post-processing *//* Local functions */static void init_gatherer_id();static void nested_feeder();static void process_object();static void do_shutdown();static void do_startup();static void usage();static char default_libpath[BUFSIZ];static void usage(){	fprintf(stderr, "\Usage: essence [options] -f input-URLs\n\    OR essence [options] URL ...\n\\n\Essence version %s\n\\n\Options:\n\\n\  --allowlist filename    File with list of types to allow\n\  --confirm-host          Explicitly confirm that host is valid\n\  --dbdir directory       Directory to place database\n\  --default-ttl seconds   Default time-to-live value\n\  --default-refresh secs  Default refresh-rate value\n\  --delete-duplicates     Delete duplicates during ``exploder''\n\  --fake-md5s             Computes fake MD5s for SOIF generated by unnesting\n\  --fast                  Use fast algorithms when possible. (May be risky.)\n\  --fast-summarizing      Use ``fast'' summarizer.  
(May be risky.)\n\  --full-text             Use entire file instead of summarizing\n\  --gatherer-host	  Gatherer-Host value\n\  --gatherer-name	  Gatherer-Name value\n\  --gatherer-version	  Gatherer-Version value\n\  --help		  Print usage information\n\  --libdir path           List of Directories to find configuration files\n\  --log logfile           Name of the file to log messages to\n\  --max-deletions n       Number of GDBM deletions before reorganization\n\  --max-refresh n         Maximum number of objects to refresh\n\  --memory-efficient      Try to be memory efficient at the expense of speed\n\  --minimal-bookkeeping   Generates a minimal amount of bookkeeping attrs\n\  --no-access		  Do not read contents of objects\n\  --no-keywords		  Do not automatically generate keywords\n\  --post-process filename Perform summary post-processing\n\  --quiet                 Minimize logging output\n\  --stoplist filename     File with list of types to remove\n\  --tmpdir directory      Name of directory to use for temporary files\n\  --type-only             Only type data; do not summarize objects\n\  --verbose               Verbose output (the default)\n\  --version               Version information\n",	    HARVEST_VERSION);	exit(1);}int main(argc, argv)     int argc;     char *argv[];{	DataObject *obj = NULL;	unsigned int object_flags = 0;	int nested;#ifdef HAVE_GETCWD	extern char *getcwd();#else	extern char *getwd();#endif	setlocale(LC_ALL, "");	/* Initialize Globals */	harvest_add_gatherer_path();	sprintf(default_libpath, "%s/gatherer", harvest_libdir());	verbose = 1;	do_dupremove = 0;	do_keywords = 1;	do_fulltext = 0;	do_typeonly = 0;	do_minimalbooks = 0;	do_fakemd5s = 0;	memefficient = 0;	do_confhost = 0;	do_cksumdups = 1;	do_fast = 0;	gatherer_id = NULL;	max_refresh = 0;	default_ttl = DEFAULT_TTL;	default_refresh = DEFAULT_REFRESH;	tmpdir = stoplist = allowlist = NULL;	topdir = xmalloc(MAXPATHLEN + 1);#ifdef HAVE_GETCWD	if (getcwd(topdir, MAXPATHLEN) == 
NULL) {		perror("getcwd");#else	if (getwd(topdir) == NULL) {		perror("getwd");#endif		exit(1);	}#ifdef USE_QUICKSUM	quicksum_file = NULL;#endif	/* Process command line */	if (argc < 2)		usage();	debug_init();	for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) {		if (!strcmp(*argv, "--help")) {			usage();		} else if (!strncmp(*argv, "-D", 2)) {			debug_flag(*argv);			verbose = 1;		} else if (!strcmp(*argv, "--fake-md5s")) {			do_fakemd5s = 1;		} else if (!strcmp(*argv, "--delete-duplicates")) {			do_dupremove = 1;		} else if (!strcmp(*argv, "--full-text")) {			do_fulltext = 1;		} else if (!strcmp(*argv, "--fast")) {			do_fast = 1;		} else if (!strcmp(*argv, "--fast-summarizing")) {			do_cksumdups = 0;		} else if (!strcmp(*argv, "--confirm-host")) {			do_confhost = 1;		} else if (!strcmp(*argv, "--minimal-bookkeeping")) {			do_minimalbooks = 1;		} else if (!strcmp(*argv, "--memory-efficient")) {			memefficient = 1;		} else if (!strcmp(*argv, "--no-keywords")) {			do_keywords = 0;		} else if (!strcmp(*argv, "--no-access")) {			object_flags |= F_NO_ACCESS;		} else if (!strcmp(*argv, "--type-only")) {			do_typeonly = 1;		} else if (!strcmp(*argv, "--verbose")) {			verbose = 1;		} else if (!strcmp(*argv, "--quiet")) {			verbose = 0;		} else if (!strcmp(*argv, "--post-process")) {			if (--argc < 1)				usage();			pp_rules_file = strdup(*++argv);		} else if (!strcmp(*argv, "--version")) {			printf("Version: %s\n", HARVEST_VERSION);			exit(0);		} else if (!strcmp(*argv, "--stoplist")) {			if (--argc < 1)				usage();			stoplist = strdup(*++argv);		} else if (!strcmp(*argv, "--allowlist")) {			if (--argc < 1)				usage();			allowlist = strdup(*++argv);		} else if (!strcmp(*argv, "--tmpdir")) {			if (--argc < 1)				usage();			tmpdir = strdup(*++argv);		} else if (!strcmp(*argv, "--log")) {			if (--argc < 1)				usage();			logfile = strdup(*++argv);		} else if (!strcmp(*argv, "--dbdir")) {			if (--argc < 1)				usage();			dbdir = strdup(*++argv);		} else if 
(!strcmp(*argv, "--libdir")) {			if (--argc < 1)				usage();			libpath = xmalloc(BUFSIZ);			sprintf(libpath, "%s:%s", *++argv, default_libpath);		} else if (!strcmp(*argv, "--max-deletions")) {			if (--argc < 1)				usage();			max_deletions = atoi(*++argv);		} else if (!strcmp(*argv, "--max-refresh")) {			if (--argc < 1)				usage();			max_refresh = atoi(*++argv);			if (max_refresh < 1)				max_refresh = 0;		} else if (!strcmp(*argv, "-f")) {			if (--argc < 1)				usage();			input_file = strdup(*++argv);		} else if (!strcmp(*argv, "--gatherer-host")) {			if (--argc < 1)				usage();			ghost = strdup(*++argv);		} else if (!strcmp(*argv, "--gatherer-name")) {			if (--argc < 1)				usage();			gname = strdup(*++argv);		} else if (!strcmp(*argv, "--gatherer-version")) {			if (--argc < 1)				usage();			gver = strdup(*++argv);		} else if (!strcmp(*argv, "--default-ttl")) {			if (--argc < 1)				usage();			default_ttl = atoi(*++argv);			if (default_ttl < 1)				default_ttl = DEFAULT_TTL;		} else if (!strcmp(*argv, "--default-refresh")) {			if (--argc < 1)				usage();			default_refresh = atoi(*++argv);			if (default_refresh < 1)				default_refresh = DEFAULT_REFRESH;		} else {			usage();		}	}	/* Set the fast algoritms */	if (do_fast) {		do_cksumdups = 0;	}	if (libpath == NULL)		libpath = strdup(default_libpath);	/* Do initializations */#ifdef HAVE_SETLINEBUF	setlinebuf(stdout);	setlinebuf(stderr);#else	setbuf(stdout, NULL);	setbuf(stderr, NULL);#endif	if (verbose) {		FILE *fp = stdout;		if (logfile != NULL) {			if ((fp = fopen(logfile, "a+")) == NULL) {				perror(logfile);				exit(1);			}			setbuf(fp, NULL);		}		init_log3("essence", fp, stderr);	} else {		init_log3("essence", NULL, stderr);	}	if (pp_rules_file)		if (pp_parse_rules(pp_rules_file)) {			errorlog("Unable to parse post-processing rules.\n");			exit(1);		}	/* Initialize and verify correct environment */#ifdef USE_QUICKSUM	if (quicksum_file == NULL) {		char buf[MAXPATHLEN + 1], *t, *s;		int fd;		s = strdup(libpath);		for 
(t = strtok(s, ":"); t; t = strtok(NULL, ":")) {			sprintf(buf, "%s/%s", t, USE_QUICKSUM_FILE);			if ((fd = open(buf, O_RDONLY)) >= 0) {				quicksum_file = strdup(buf);				close(fd);				break;			}		}		free(s);	}	if (quicksum_file == NULL) {		errorlog("Unable to locate %s in %s.\n",		    USE_QUICKSUM_FILE, libpath);		exit(1);	}#endif	if (stoplist == NULL) {		char buf[MAXPATHLEN + 1], *t, *s;		int fd;		s = strdup(libpath);		for (t = strtok(s, ":"); t; t = strtok(NULL, ":")) {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -