📄 enum.c
字号:
static char rcsid[] = "$Id: enum.c,v 2.2 1997/08/27 18:06:28 sxw Exp $";/* * enum.c - Enumerate RootNode URLs into LeafNode URLs. Also produce * Timestamps or MD5s for each LeafNode URL to support incremental * Gatherering (using the dbcheck program). * * Input: * URL Option1 ... Option N * * Output: * URL\tMD5:xxx * URL\tLast-Modification-Time:xxx * * Usage: enum [(-db | -del | -log | -tmpdb) file] [-Ds,l] [-delete] < url_enum_list * * DEBUG: section 40, level 1, 5, 9 Gatherer URL enumeration * AUTHOR: Harvest derived * * Harvest Indexer http://harvest.sourceforge.net/ * ----------------------------------------------- * * The Harvest Indexer is a continued development of code developed by * the Harvest Project. Development is carried out by numerous individuals * in the Internet community, and is not officially connected with the * original Harvest Project or its funding sources. * * Please mail lee@arco.de if you are interested in participating * in the development effort. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* * ---------------------------------------------------------------------- * Copyright (c) 1994, 1995. All rights reserved. * * The Harvest software was developed by the Internet Research Task * Force Research Group on Resource Discovery (IRTF-RD): * * Mic Bowman of Transarc Corporation. 
* Peter Danzig of the University of Southern California. * Darren R. Hardy of the University of Colorado at Boulder. * Udi Manber of the University of Arizona. * Michael F. Schwartz of the University of Colorado at Boulder. * Duane Wessels of the University of Colorado at Boulder. * * This copyright notice applies to software in the Harvest * ``src/'' directory only. Users should consult the individual * copyright notices in the ``components/'' subdirectories for * copyright information about other software bundled with the * Harvest source code distribution. * * TERMS OF USE * * The Harvest software may be used and re-distributed without * charge, provided that the software origin and research team are * cited in any use of the system. Most commonly this is * accomplished by including a link to the Harvest Home Page * (http://harvest.cs.colorado.edu/) from the query page of any * Broker you deploy, as well as in the query result pages. These * links are generated automatically by the standard Broker * software distribution. * * The Harvest software is provided ``as is'', without express or * implied warranty, and with no support nor obligation to assist * in its use, correction, modification or enhancement. We assume * no liability with respect to the infringement of copyrights, * trade secrets, or any patents, and are not responsible for * consequential damages. Proper use of the Harvest software is * entirely the responsibility of the user. * * DERIVATIVE WORKS * * Users may make derivative works from the Harvest software, subject * to the following constraints: * * - You must include the above copyright notice and these * accompanying paragraphs in all forms of derivative works, * and any documentation and other materials related to such * distribution and use acknowledge that the software was * developed at the above institutions. * * - You must notify IRTF-RD regarding your distribution of * the derivative work. 
*
 *   - You must clearly notify users that your are distributing
 *     a modified version and not the original Harvest software.
 *
 *   - Any derivative product is also subject to these copyright
 *     and use restrictions.
 *
 *  Note that the Harvest software is NOT in the public domain.  We
 *  retain copyright, as specified above.
 *
 *  HISTORY OF FREE SOFTWARE STATUS
 *
 *  Originally we required sites to license the software in cases
 *  where they were going to build commercial products/services
 *  around Harvest.  In June 1995 we changed this policy.  We now
 *  allow people to use the core Harvest software (the code found in
 *  the Harvest ``src/'' directory) for free.  We made this change
 *  in the interest of encouraging the widest possible deployment of
 *  the technology.  The Harvest software is really a reference
 *  implementation of a set of protocols and formats, some of which
 *  we intend to standardize.  We encourage commercial
 *  re-implementations of code complying to this set of standards.
 *
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <memory.h>
#include <limits.h>
#include <fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <signal.h>
#include <gdbm.h>
#include "util.h"
#include "url.h"
#include "template.h"

/* Local variables */

/* Easily define how to enumerate with the various access methods */
#define Arg_URL		1	/* just give the url as the arg */
#define Arg_Host_File	2	/* 'host path' are the args */
#define Arg_Path	3	/* 'path' is the only arg */

/*
 *  One row per URL access method: the enumerator program names for
 *  breadth-first and depth-first searches, the stamp label used for that
 *  method (T_TIMESTAMP or T_MD5) and the argument convention (Arg_*).
 *  Every row below uses Arg_URL.
 */
struct enumerators {		/* table is currently ignored */
    int url_type;		/* type this enumerator handles */
    char *url_enum_breadth;	/* the pgm for breadth first searches */
    char *url_enum_depth;	/* the pgm for depth first searches */
    char *special;		/* special flags if any */
    char *stamp_kind;		/* proddb label for the stamp */
    int arg_types;		/* sigh, need until all conform */
} enums[] = {
    { URL_FILE, "fileenum", "fileenum", "", T_TIMESTAMP, Arg_URL},
    { URL_FTP, "ftpenum", "ftpenum", "", T_TIMESTAMP, Arg_URL},
    { URL_GOPHER, "gopherenum-breadth", "gopherenum-depth", "", T_MD5, Arg_URL},
    { URL_HTTP, "httpenum-breadth", "httpenum-depth", "", T_MD5, Arg_URL},
    { URL_NEWS, "newsenum", "newsenum", "", T_MD5, Arg_URL}
};

/* Default file names; each can be replaced from the command line in init() */
#define DBFILE		"PRODUCTION.gdbm"	/* default proddb */
#define DBNEWFILE	"tmpdb.gdbm"	/* tmp log of all urls enumed */
#define DELFILE		"url.del"	/* Stale URLs in proddb */
#define LOGFILE		"enum.log"	/* default debug logfile */

/* File names in effect for this run (set by init() from -add/-del/-db/-log/-tmpdb) */
static char *tree_root = NULL;
static char *addfile = NULL;	/* NULL means the add list goes to stdout */
static char *delfile = DELFILE;
static char *proddbfile = DBFILE;
static char *logfile = LOGFILE;
static char *tmpdbfile = DBNEWFILE;

static char *cur_attr;		/* set to md5 or time's label */
static char *root;		/* set to root url of enum space */
static int find_deleted = 0;	/* find any deleted objects */

/*
 *  State for the RootNode currently being processed.  The "Option N"
 *  numbering presumably follows the input format "URL Option1 ... OptionN"
 *  described in the file header -- confirm against the input parser.
 */
static char url[BUFSIZ];	/* URL */
static int url_max;		/* Option 1 */
static char url_filter[BUFSIZ];	/* Option 2 */
static int host_max;		/* Option 3 */
static char host_filter[BUFSIZ];	/* Option 4 */
static int delay;		/* Option 5 */
static int max_depth;		/* Option 6 */
/* NOTE(review): labelled "Option 6" like max_depth; looks like a copied comment -- confirm */
static int cur_depth;		/* Option 6 */
static char access_types[64];	/* Option 7 */
static char user_enum_pgm[BUFSIZ];	/* Option 8 */
static char search_type[BUFSIZ];	/* Option 9 */

FILE *fadd;			/* the add filehandles (stdout) */
FILE *fdel;			/* the deleted urls */
FILE *flog;			/* log output */

GDBM_FILE proddb;		/* The Gatherer's production database */
GDBM_FILE newdb;		/* A Temporary db to store timestamps */

/* Singly linked list node carrying one buffer */
typedef struct _nodespec {
    char *buf;
    struct _nodespec *next;
} nodespec;

/* presumably the first and last nodes of the pending nodespec list -- confirm at the users */
static nodespec *ns_head = 0, *ns_tail = 0;

/*
 *  Record returned by create_user_enum_pgm_pipes().  Going by the field
 *  names: the child's pid, the file its results go to, and a stream for
 *  writing to it -- confirm against that function.
 */
typedef struct _cpi {
    int childpid;
    char *result_file;
    FILE *wfp;
} cpi;

/* Local functions */
/* These are old-style (non-prototype) declarations: argument lists are not checked */
static int do_an_enumeration ();
static int get_data ();
static int do_kid ();
static void usage ();
static void init ();
static void finish ();
static void process_url_stamp ();
static void do_add ();
static void add_to_newdb ();
static void add_to_todo ();
static void enum_to_get_deleted ();
static void
set_envs ();static cpi *create_user_enum_pgm_pipes ();/* * init() - Initializes the overall Enumeration process */static voidinit (argc, argv)int argc;char *argv[];{ debug_init (); for (argc--, argv++; argc > 0 && **argv == '-'; argc--, argv++) { if (strncmp (*argv, "-D", 2) == 0) { debug_flag (*argv); } else if (strcmp (*argv, "-delete") == 0) { find_deleted = 1; } else if (strcmp (*argv, "-db") == 0) { if (--argc < 1) usage (); proddbfile = xstrdup (*++argv); } else if (strcmp (*argv, "-add") == 0) { if (--argc < 1) usage (); addfile = xstrdup (*++argv); } else if (strcmp (*argv, "-tmpdb") == 0) { if (--argc < 1) usage (); tmpdbfile = xstrdup (*++argv); } else if (strcmp (*argv, "-del") == 0) { if (--argc < 1) usage (); delfile = xstrdup (*++argv); } else if (strcmp (*argv, "-log") == 0) { if (--argc < 1) usage (); logfile = xstrdup (*++argv); } else { usage (); } } if ((flog = fopen (logfile, "a+")) == NULL) { if (getenv ("HARVEST_GATHERER_LOGFILE") != (char *) NULL) flog = fopen (getenv ("HARVEST_GATHERER_LOGFILE"), "a+"); if (flog == (FILE *) NULL) flog = stderr; } init_log3 ("enum", flog, stderr); /* * open the main proddb and output files; If proddb file is not found, * we assume a new proddb */ Debug (40, 9, ("using: %s: for production db\n", proddbfile)); proddb = gdbm_open (proddbfile, 0, GDBM_READER, 0644, 0); fadd = stdout; if (addfile != NULL) { Debug (40, 9, ("using: %s: for addfile\n", addfile)); if ((fadd = fopen (addfile, "w")) == NULL) { log_errno (addfile); fatal ("Internal enum error 2.\n"); } } fdel = NULL; if (find_deleted) { Debug (40, 9, ("using: %s: for delfile\n", delfile)); if ((fdel = fopen (delfile, "w")) == NULL) { log_errno (delfile); fatal ("Internal enum error 3.\n"); } }}static voidfinish (){ if (proddb != NULL) gdbm_close (proddb); if (fdel != NULL) fclose (fdel); fclose (fadd); fclose (flog);}/* * process_url_stamp() - Check in the database for the URL name with * timestamp/md5 stamp. 
If the URL is registered in the database * and the timestamps are the same, then ignore. If they're different * then add the name to the todo list. */static voidprocess_url_stamp (name, stamp)char *name, *stamp;{ datum key, d; Template *template; AVPair *avp; key.dptr = name; key.dsize = strlen (name); d = gdbm_fetch (proddb, key); if (d.dptr != NULL) { init_parse_template_string (d.dptr, d.dsize); if ((template = parse_template ()) == NULL) fatal ("Cannot parse template with %s\n", key.dptr); finish_parse_template (); free (d.dptr); avp = extract_AVPair (template->list, cur_attr); if ((avp != NULL) && !strcmp (stamp, avp->value)) { free_template (template); return; } free_template (template); } do_add (name, stamp);}/* * do_add() - save the URL/timestamp/md5 data in the to-add pile */static voiddo_add (name, stamp)char *name; /* the url to 'add' */char *stamp; /* the md5/timestamp from enum */{ Debug (40, 9, ("do_add: Saving %s, %s\n", name, stamp)); fprintf (fadd, "%s\t%s:%s\n", name, cur_attr, stamp); fflush (fadd); /* MUST flush */}static voidadd_to_newdb (name, timestamp) /* save to my proddb */char *name, *timestamp;{ datum k; /* the key */ datum d; /* the data */ k.dptr = name; /* the key is url */ k.dsize = strlen (name) + 1; /* save the null too */ d.dptr = timestamp; d.dsize = strlen (timestamp) + 1; Debug (40, 9, ("add_to_newdb: Marking %s\n", name)); if (gdbm_store (newdb, k, d, GDBM_INSERT)) Log ("Warning: Ignoring re-visited URL: %s\n", name);}static voidenum_to_get_deleted (){ /* get urls to delete */ datum k, nk; /* current key in both proddb's */ datum d; /* data (if any) in new proddb */ int rootlen; rootlen = strlen (root); /* don't include null here */ k = gdbm_firstkey (proddb); /* get first key in ess proddb */ while (k.dptr != NULL) { /* while there are keys */ if (strncmp (k.dptr, root, rootlen) == 0) { /* if in root url */ d = gdbm_fetch (newdb, k); /* see if I've seen this url */ if (d.dptr == NULL && fdel != NULL) /* if not, then it's 
old! */ fprintf (fdel, "%s\n", k.dptr); if (d.dptr != NULL) free (d.dptr); } nk = gdbm_nextkey (proddb, k); free (k.dptr); k = nk; }}static cpi *create_user_enum_pgm_pipes (enum_arg)char *enum_arg; /* the URL from which we are doing this enumeration */{ int pfd[2], pid, fd; char *argv[64], buf[BUFSIZ], *tmpfile; static cpi *cpinfo; /* need static to return var */ tmpfile = xstrdup (tempnam (NULL, "uep")); Debug (40, 1, ("RUNNING enumerator: %s %s > %s\n", user_enum_pgm, enum_arg, tmpfile)); cpinfo = NULL; if (pipe (pfd) < 0) { fatal_errno ("pipe"); } /* need to use fork() since parse_argv() will cause mem leak */ if ((pid = fork ()) < 0) { fatal_errno ("fork"); } if (pid == 0) { /* child */ if ((fd = creat (tmpfile, 0664)) < 0) { fatal_errno (tmpfile); }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -