
📄 fileenum.c

📁 harvest is a robot that downloads HTML web pages
💻 C
static char rcsid[] = "$Id: fileenum.c,v 2.1 1997/03/21 19:21:29 sxw Exp $";

/*
 *  fileenum.c - Enumerates file: RootNode URLs into LeafNode URLs
 *
 *  Usage: fileenum [-n] pathname
 *         -n == don't expand directory name using chdir/getwd
 *
 *  Outputs the following format:
 *
 *      URL of tree root
 *      URL <tab> last-modification-time
 *      ...
 *      URL <tab> last-modification-time
 *
 *  Jim Guyton, University of Colorado, Boulder, May 1994
 *  Duane Wessels, University of Colorado, Boulder, Sept 1995
 *
 *  DEBUG: section  46, level 1         Gatherer enumeration for file:// URLs
 *  AUTHOR: Harvest derived
 *
 *  Harvest Indexer http://www.tardis.ed.ac.uk/harvest/
 *  ---------------------------------------------------
 *
 *  The Harvest Indexer is a continued development of code developed by
 *  the Harvest Project. Development is carried out by numerous individuals
 *  in the Internet community, and is not officially connected with the
 *  original Harvest Project or its funding sources.
 *
 *  Please mail harvest@tardis.ed.ac.uk if you are interested in participating
 *  in the development effort.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
/*  ----------------------------------------------------------------------
 *  Copyright (c) 1994, 1995.  All rights reserved.
 *
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *
 *  TERMS OF USE
 *
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *
 *  DERIVATIVE WORKS
 *
 *    Users may make derivative works from the Harvest software, subject
 *    to the following constraints:
 *
 *      - You must include the above copyright notice and these
 *        accompanying paragraphs in all forms of derivative works,
 *        and any documentation and other materials related to such
 *        distribution and use acknowledge that the software was
 *        developed at the above institutions.
 *
 *      - You must notify IRTF-RD regarding your distribution of
 *        the derivative work.
 *
 *      - You must clearly notify users that you are distributing
 *        a modified version and not the original Harvest software.
 *
 *      - Any derivative product is also subject to these copyright
 *        and use restrictions.
 *
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *
 *  HISTORY OF FREE SOFTWARE STATUS
 *
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying to this set of standards.
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <sys/types.h>
#include <dirent.h>
#include <signal.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/param.h>
#include <netdb.h>
#include "util.h"
#include "url.h"
#define PUBLIC extern
#include "filter.h"

/* Local variables */
static char *hostname = NULL;       /* host part taken from the file: URL */
static char *my_pgmname = NULL;
static char tree_root[BUFSIZ];      /* URL of the enumeration root */
static int max_depth = 0;           /* 0 == no recursion depth limit */
static int url_max = 0;             /* maximum number of LeafNode URLs to emit */
static int nurls = 0;

/* Local functions */
static void sigdie();
static void usage();
static void doit();

/* SIGTERM handler; also used to stop once url_max URLs have been printed */
static void sigdie()
{
    exit(0);
}

static void usage()
{
    fprintf(stderr, "usage: %s File-URL\n", my_pgmname);
    exit(1);
}

/* Recursively enumerate 'path', printing one "URL<tab>mtime" line per file */
static void doit(path, depth)
     char *path;
     int depth;
{
    int pathlen = 0;            /* Length of source pathname */
    struct stat statbuf;
    DIR *dirf = NULL;
    char *fullname = NULL;
    struct dirent *dp = NULL;
    int y = 0;

    fullname = xmalloc(MAXPATHLEN + 1);
    pathlen = strlen(path);
    if (max_depth > 0 && depth > max_depth) {
        Debug(46, 1, ("Maximum Depth of %d Reached: %s\n",
                max_depth, path));
        xfree(fullname);
        return;
    }
    strcpy(fullname, path);
    if (fullname[pathlen - 1] != '/') {     /* Add trailing / if needed */
        strcat(fullname, "/");
        pathlen++;
    }
    /* Do the enumeration */
    if ((dirf = opendir(path)) == NULL) {
        log_errno2(__FILE__, __LINE__, path);
        xfree(fullname);
        return;
    }
    while ((dp = readdir(dirf)) != NULL) {
        if (strcmp(dp->d_name, ".") == 0)
            continue;
        if (strcmp(dp->d_name, "..") == 0)
            continue;
        fullname[pathlen] = '\0';
        strcat(fullname, dp->d_name);
        /* Check to see if it passes the filter */
        if ((y = filter_match(fullname, url_filter, nurl_filter))) {
            Debug(46, 1, ("Removing Candidate: [%s] %s\n",
                    Filter_Type_Name[y], fullname));
            continue;
        }
        if (stat(fullname, &statbuf) < 0) {
            log_errno2(__FILE__, __LINE__, fullname);
            continue;
        }
        if (S_ISREG(statbuf.st_mode)) {
            fprintf(stdout, "file://%s%s\t%lu\n", hostname,
                rfc1738_escape(fullname), statbuf.st_mtime);
            fflush(stdout);
            if (nurls++ >= url_max) {
                Log("Truncating RootNode %s at %d LeafNode URLs\n", tree_root, url_max);
                sigdie();
            }
        }
        if ((statbuf.st_mode & S_IFMT) == S_IFDIR) {
            strcat(fullname, "/");
            doit(fullname, depth + 1);
        }
    }
    closedir(dirf);
    xfree(fullname);
}

int main(argc, argv)
     int argc;
     char *argv[];
{
    char *startpath = NULL;
    char *s = NULL;
    FILE *logfp = NULL;
    URL *up = NULL;
    int cur_depth = 0;

#ifdef USE_HOST_CACHE
    host_cache_init();
#endif
    debug_init();
    my_pgmname = xstrdup(argv[0]);
    while (argc > 1 && argv[1][0] == '-') {
        if (strncmp(argv[1], "-D", 2) == 0) {
            debug_flag(argv[1]);
            argv++;
            argc--;
        } else
            usage();
    }
    if (argc != 2)
        usage();
    signal(SIGTERM, sigdie);
    if (getenv("HARVEST_GATHERER_LOGFILE") != (char *) NULL)
        logfp = fopen(getenv("HARVEST_GATHERER_LOGFILE"), "a+");
    if (logfp == (FILE *) NULL)
        logfp = stderr;
    init_log3(my_pgmname, logfp, stderr);
    Debug(46, 1, ("Running File Enumeration: %s\n", argv[1]));
    up = url_open(argv[1]);
    if (up == (URL *) NULL) {
        Log("Bad URL: %s\n", argv[1]);
        exit(1);
    }
    if (up->type != URL_FILE) {
        Log("Not a File URL: %s\n", argv[1]);
        exit(1);
    }
    hostname = xstrdup(up->host);
    /* argh.  We should check here to see if the given host is the */
    /* local machine.  -DW */
    max_depth = url_max = 0;
    if ((s = getenv("HARVEST_URL_MAX")) != NULL)
        url_max = atoi(s);
    if ((s = getenv("HARVEST_DEPTH_MAX")) != NULL)
        max_depth = atoi(s);
    if ((s = getenv("HARVEST_DEPTH_CUR")) != NULL)
        cur_depth = atoi(s);
    if (url_max < 1)
        url_max = 250;          /* hard-coded maximum */
    if (max_depth < 1)
        max_depth = 0;          /* 0 == no depth limit */
    host_filterfile = NULL;
    url_filterfile = getenv("HARVEST_URL_FILTER");
    filter_initialize();
    startpath = up->pathname;
    /* first line is start of URL enumeration space */
    sprintf(tree_root, "file://%s%s", hostname,
        rfc1738_escape(startpath));
    fprintf(stdout, "%s\n", tree_root);
    fflush(stdout);
    doit(startpath, cur_depth);
    exit(0);
}
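For readers who want to see the enumeration technique in isolation, below is a minimal, self-contained sketch of the same recursive opendir()/readdir()/stat() walk that doit() performs. It is not the Harvest code: it drops the Harvest-specific helpers (url_open(), filter_match(), rfc1738_escape(), Debug()/Log(), xmalloc()) and the HARVEST_* environment handling, takes a plain directory path instead of a file: URL, and the "localhost" host name and fixed 4096-byte path buffer are assumptions made only for this sketch.

/*
 * enum_sketch.c - simplified illustration of the fileenum.c enumeration loop.
 * Prints the tree-root URL, then "file://<host><path>\t<mtime>" for every
 * regular file found under the starting directory, recursing into
 * subdirectories.  No URL filtering or escaping is performed here.
 */
#include <stdio.h>
#include <string.h>
#include <dirent.h>
#include <sys/stat.h>

#define SKETCH_HOST "localhost"   /* assumed host; fileenum.c takes it from the URL */

static void walk(const char *path, int depth, int max_depth)
{
    DIR *dirf;
    struct dirent *dp;
    struct stat sb;
    char fullname[4096];

    if (max_depth > 0 && depth > max_depth)   /* same depth cut-off as doit() */
        return;
    if ((dirf = opendir(path)) == NULL) {
        perror(path);
        return;
    }
    while ((dp = readdir(dirf)) != NULL) {
        if (strcmp(dp->d_name, ".") == 0 || strcmp(dp->d_name, "..") == 0)
            continue;
        snprintf(fullname, sizeof(fullname), "%s%s%s", path,
            path[strlen(path) - 1] == '/' ? "" : "/", dp->d_name);
        if (stat(fullname, &sb) < 0) {
            perror(fullname);
            continue;
        }
        if (S_ISREG(sb.st_mode))              /* LeafNode: URL plus mtime */
            printf("file://%s%s\t%lu\n", SKETCH_HOST, fullname,
                (unsigned long) sb.st_mtime);
        else if (S_ISDIR(sb.st_mode))         /* recurse into subdirectories */
            walk(fullname, depth + 1, max_depth);
    }
    closedir(dirf);
}

int main(int argc, char *argv[])
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s directory\n", argv[0]);
        return 1;
    }
    printf("file://%s%s\n", SKETCH_HOST, argv[1]);   /* tree root, as fileenum does */
    walk(argv[1], 0, 0);   /* 0 == no depth limit, matching the default */
    return 0;
}

As a usage note, and per the header comment and main() above, the real fileenum is invoked with a single file: URL, reads HARVEST_URL_MAX, HARVEST_DEPTH_MAX, HARVEST_DEPTH_CUR and HARVEST_URL_FILTER from the environment, logs to the file named by HARVEST_GATHERER_LOGFILE (or stderr), prints the tree-root URL on its first output line, then one "URL<tab>last-modification-time" line per regular file, truncating after url_max (default 250) LeafNode URLs.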
