📄 main.c
字号:
static char rcsid[] = "$Id: main.c,v 2.2 1997/08/27 18:09:31 sxw Exp $";/* * main.c - User front-end for the Essence system. * * DEBUG: section 62, level 1 Gatherer essence main * AUTHOR: Harvest derived * * Harvest Indexer http://www.tardis.ed.ac.uk/harvest/ * --------------------------------------------------- * * The Harvest Indexer is a continued development of code developed by * the Harvest Project. Development is carried out by numerous individuals * in the Internet community, and is not officially connected with the * original Harvest Project or its funding sources. * * Please mail harvest@tardis.ed.ac.uk if you are interested in participating * in the development effort. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* ---------------------------------------------------------------------- * Copyright (c) 1994, 1995. All rights reserved. * * The Harvest software was developed by the Internet Research Task * Force Research Group on Resource Discovery (IRTF-RD): * * Mic Bowman of Transarc Corporation. * Peter Danzig of the University of Southern California. * Darren R. Hardy of the University of Colorado at Boulder. * Udi Manber of the University of Arizona. * Michael F. Schwartz of the University of Colorado at Boulder. * Duane Wessels of the University of Colorado at Boulder. * * This copyright notice applies to software in the Harvest * ``src/'' directory only. Users should consult the individual * copyright notices in the ``components/'' subdirectories for * copyright information about other software bundled with the * Harvest source code distribution. * * TERMS OF USE * * The Harvest software may be used and re-distributed without * charge, provided that the software origin and research team are * cited in any use of the system. Most commonly this is * accomplished by including a link to the Harvest Home Page * (http://harvest.cs.colorado.edu/) from the query page of any * Broker you deploy, as well as in the query result pages. These * links are generated automatically by the standard Broker * software distribution. * * The Harvest software is provided ``as is'', without express or * implied warranty, and with no support nor obligation to assist * in its use, correction, modification or enhancement. We assume * no liability with respect to the infringement of copyrights, * trade secrets, or any patents, and are not responsible for * consequential damages. Proper use of the Harvest software is * entirely the responsibility of the user. * * DERIVATIVE WORKS * * Users may make derivative works from the Harvest software, subject * to the following constraints: * * - You must include the above copyright notice and these * accompanying paragraphs in all forms of derivative works, * and any documentation and other materials related to such * distribution and use acknowledge that the software was * developed at the above institutions. * * - You must notify IRTF-RD regarding your distribution of * the derivative work. * * - You must clearly notify users that your are distributing * a modified version and not the original Harvest software. * * - Any derivative product is also subject to these copyright * and use restrictions. * * Note that the Harvest software is NOT in the public domain. We * retain copyright, as specified above. * * HISTORY OF FREE SOFTWARE STATUS * * Originally we required sites to license the software in cases * where they were going to build commercial products/services * around Harvest. In June 1995 we changed this policy. We now * allow people to use the core Harvest software (the code found in * the Harvest ``src/'' directory) for free. We made this change * in the interest of encouraging the widest possible deployment of * the technology. The Harvest software is really a reference * implementation of a set of protocols and formats, some of which * we intend to standardize. We encourage commercial * re-implementations of code complying to this set of standards. * */#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <fcntl.h>#include <string.h>#include <memory.h>#include <signal.h>#include <sys/param.h>#include <sys/types.h>#include <netdb.h>#include <locale.h>#include "util.h"#define MAIN#include "essence.h"#undef MAIN#if defined(DEBUG) && defined(_HARVEST_OSF_)#include <malloc.h>#endif/* Local variables */static char byname[MAXPATHLEN + 1]; /* file naming config file */static char byurl[MAXPATHLEN + 1]; /* URL naming config file */static char bycontent[MAXPATHLEN + 1]; /* file content config file */static char magic[MAXPATHLEN + 1]; /* file(1) magic file */static char *dbdir = NULL; /* directory to put database */static char *libpath = NULL; /* directories to put config files */static char *logfile = NULL; /* file to log messages to */static char *input_file = NULL; /* file from which to get input files */static char *gname = NULL; /* Gatherer name */static char *ghost = NULL; /* Gatherer host */static char *gver = NULL; /* Gatherer version */static int max_deletions = 0; /* # of GDBM deletions before reorg */static char *pp_rules_file = 0; /* Rules for Post-processing *//* Local functions */static void init_gatherer_id();static void nested_feeder();static void process_object();static void do_shutdown();static void do_startup();static void usage();static char default_libpath[BUFSIZ];static void usage(){ fprintf(stderr, "\Usage: essence [options] -f input-URLs\n\ OR essence [options] URL ...\n\\n\Essence version %s\n\\n\Options:\n\\n\ --allowlist filename File with list of types to allow\n\ --confirm-host Explicitly confirm that host is valid\n\ --dbdir directory Directory to place database\n\ --default-ttl seconds Default time-to-live value\n\ --default-refresh secs Default refresh-rate value\n\ --delete-duplicates Delete duplicates during ``exploder''\n\ --fake-md5s Computes fake MD5s for SOIF generated by unnesting\n\ --fast Use fast algorithms when possible. (May be risky.)\n\ --fast-summarizing Use ``fast'' summarizer. (May be risky.)\n\ --full-text Use entire file instead of summarizing\n\ --gatherer-host Gatherer-Host value\n\ --gatherer-name Gatherer-Name value\n\ --gatherer-version Gatherer-Version value\n\ --help Print usage information\n\ --libdir path List of Directories to find configuration files\n\ --log logfile Name of the file to log messages to\n\ --max-deletions n Number of GDBM deletions before reorganization\n\ --max-refresh n Maximum number of objects to refresh\n\ --memory-efficient Try to be memory efficient at the expense of speed\n\ --minimal-bookkeeping Generates a minimal amount of bookkeeping attrs\n\ --no-access Do not read contents of objects\n\ --no-keywords Do not automatically generate keywords\n\ --post-process filename Perform summary post-processing\n\ --quiet Minimize logging output\n\ --stoplist filename File with list of types to remove\n\ --tmpdir directory Name of directory to use for temporary files\n\ --type-only Only type data; do not summarize objects\n\ --verbose Verbose output (the default)\n\ --version Version information\n", HARVEST_VERSION); exit(1);}int main(argc, argv) int argc; char *argv[];{ DataObject *obj = NULL; unsigned int object_flags = 0; int nested;#ifdef HAVE_GETCWD extern char *getcwd();#else extern char *getwd();#endif setlocale(LC_ALL, ""); /* Initialize Globals */ harvest_add_gatherer_path(); sprintf(default_libpath, "%s/gatherer", harvest_libdir()); verbose = 1; do_dupremove = 0; do_keywords = 1; do_fulltext = 0; do_typeonly = 0; do_minimalbooks = 0; do_fakemd5s = 0; memefficient = 0; do_confhost = 0; do_cksumdups = 1; do_fast = 0; gatherer_id = NULL; max_refresh = 0; default_ttl = DEFAULT_TTL; default_refresh = DEFAULT_REFRESH; tmpdir = stoplist = allowlist = NULL; topdir = xmalloc(MAXPATHLEN + 1);#ifdef HAVE_GETCWD if (getcwd(topdir, MAXPATHLEN) == NULL) { perror("getcwd");#else if (getwd(topdir) == NULL) { perror("getwd");#endif exit(1); }#ifdef USE_QUICKSUM quicksum_file = NULL;#endif /* Process command line */ if (argc < 2) usage(); debug_init(); for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) { if (!strcmp(*argv, "--help")) { usage(); } else if (!strncmp(*argv, "-D", 2)) { debug_flag(*argv); verbose = 1; } else if (!strcmp(*argv, "--fake-md5s")) { do_fakemd5s = 1; } else if (!strcmp(*argv, "--delete-duplicates")) { do_dupremove = 1; } else if (!strcmp(*argv, "--full-text")) { do_fulltext = 1; } else if (!strcmp(*argv, "--fast")) { do_fast = 1; } else if (!strcmp(*argv, "--fast-summarizing")) { do_cksumdups = 0; } else if (!strcmp(*argv, "--confirm-host")) { do_confhost = 1; } else if (!strcmp(*argv, "--minimal-bookkeeping")) { do_minimalbooks = 1; } else if (!strcmp(*argv, "--memory-efficient")) { memefficient = 1; } else if (!strcmp(*argv, "--no-keywords")) { do_keywords = 0; } else if (!strcmp(*argv, "--no-access")) { object_flags |= F_NO_ACCESS; } else if (!strcmp(*argv, "--type-only")) { do_typeonly = 1; } else if (!strcmp(*argv, "--verbose")) { verbose = 1; } else if (!strcmp(*argv, "--quiet")) { verbose = 0; } else if (!strcmp(*argv, "--post-process")) { if (--argc < 1) usage(); pp_rules_file = strdup(*++argv); } else if (!strcmp(*argv, "--version")) { printf("Version: %s\n", HARVEST_VERSION); exit(0); } else if (!strcmp(*argv, "--stoplist")) { if (--argc < 1) usage(); stoplist = strdup(*++argv); } else if (!strcmp(*argv, "--allowlist")) { if (--argc < 1) usage(); allowlist = strdup(*++argv); } else if (!strcmp(*argv, "--tmpdir")) { if (--argc < 1) usage(); tmpdir = strdup(*++argv); } else if (!strcmp(*argv, "--log")) { if (--argc < 1) usage(); logfile = strdup(*++argv); } else if (!strcmp(*argv, "--dbdir")) { if (--argc < 1) usage(); dbdir = strdup(*++argv); } else if (!strcmp(*argv, "--libdir")) { if (--argc < 1) usage(); libpath = xmalloc(BUFSIZ); sprintf(libpath, "%s:%s", *++argv, default_libpath); } else if (!strcmp(*argv, "--max-deletions")) { if (--argc < 1) usage(); max_deletions = atoi(*++argv); } else if (!strcmp(*argv, "--max-refresh")) { if (--argc < 1) usage(); max_refresh = atoi(*++argv); if (max_refresh < 1) max_refresh = 0; } else if (!strcmp(*argv, "-f")) { if (--argc < 1) usage(); input_file = strdup(*++argv); } else if (!strcmp(*argv, "--gatherer-host")) { if (--argc < 1) usage(); ghost = strdup(*++argv); } else if (!strcmp(*argv, "--gatherer-name")) { if (--argc < 1) usage(); gname = strdup(*++argv); } else if (!strcmp(*argv, "--gatherer-version")) { if (--argc < 1) usage(); gver = strdup(*++argv); } else if (!strcmp(*argv, "--default-ttl")) { if (--argc < 1) usage(); default_ttl = atoi(*++argv); if (default_ttl < 1) default_ttl = DEFAULT_TTL; } else if (!strcmp(*argv, "--default-refresh")) { if (--argc < 1) usage(); default_refresh = atoi(*++argv); if (default_refresh < 1) default_refresh = DEFAULT_REFRESH; } else { usage(); } } /* Set the fast algoritms */ if (do_fast) { do_cksumdups = 0; } if (libpath == NULL) libpath = strdup(default_libpath); /* Do initializations */#ifdef HAVE_SETLINEBUF setlinebuf(stdout); setlinebuf(stderr);#else setbuf(stdout, NULL); setbuf(stderr, NULL);#endif if (verbose) { FILE *fp = stdout; if (logfile != NULL) { if ((fp = fopen(logfile, "a+")) == NULL) { perror(logfile); exit(1); } setbuf(fp, NULL); } init_log3("essence", fp, stderr); } else { init_log3("essence", NULL, stderr); } if (pp_rules_file) if (pp_parse_rules(pp_rules_file)) { errorlog("Unable to parse post-processing rules.\n"); exit(1); } /* Initialize and verify correct environment */#ifdef USE_QUICKSUM if (quicksum_file == NULL) { char buf[MAXPATHLEN + 1], *t, *s; int fd; s = strdup(libpath); for (t = strtok(s, ":"); t; t = strtok(NULL, ":")) { sprintf(buf, "%s/%s", t, USE_QUICKSUM_FILE); if ((fd = open(buf, O_RDONLY)) >= 0) { quicksum_file = strdup(buf); close(fd); break; } } free(s); } if (quicksum_file == NULL) { errorlog("Unable to locate %s in %s.\n", USE_QUICKSUM_FILE, libpath); exit(1); }#endif if (stoplist == NULL) { char buf[MAXPATHLEN + 1], *t, *s; int fd; s = strdup(libpath); for (t = strtok(s, ":"); t; t = strtok(NULL, ":")) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -