⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 gopherenum-breadth.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
static char rcsid[] = "$Id: gopherenum-breadth.c,v 2.5 2000/02/03 12:45:56 sxw Exp $";/* *  gopherenum-breadth.c - RootNode URL enumerator for Gopher URLs * *  Usage: gopherenum-breadth gopher-URL * *  Outputs the following format: * *      URL of tree root *      URL <tab> md5 *      ... *      URL <tab> md5 * *  DEBUG: section  43, level 1, 5, 9   Gatherer enumeration for Gopher *  AUTHOR: Harvest derived * *  Harvest Indexer http://harvest.sourceforge.net/ *  ----------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail lee@arco.de if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. 
Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (gopher://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. 
* *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. * */#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <memory.h>#include <string.h>#include <signal.h>#include <gdbm.h>#include "util.h"#include "url.h"#define PUBLIC extern#include "filter.h"typedef struct _list_t {    void *ptr;    int depth;    struct _list_t *next;} list_t;list_t *head = NULL;list_t **Tail = NULL;/* define HOST_COUNT_IP to 'count' visited hosts based on IP, not the   *//* given hostname.  
This way aliased machines will be properly          *//* enumerated                                                           */#define HOST_COUNT_IP/* Global variables */int max_depth = 0;int cur_depth = 0;int depth_hist[100];/* Local variables */static int url_max = 0;static int nurls = 0;static int host_max = 0;static int nhosts = 0;static char *tree_root = NULL;static char *urldb_filename = NULL;static char *hostdb_filename = NULL;static char *md5db_filename = NULL;static GDBM_FILE urldbf = NULL;static GDBM_FILE hostdbf = NULL;static GDBM_FILE md5dbf = NULL;static FILE *not_visited = NULL;/* Local functions */static void usage();static void mark_failed();static void mark_retrieved();static void sigdie();static int url_in_db();static int md5_in_db();static int gopher_enum();extern int RobotsTxtCheck _PARAMS((URL *));list_t *add_to_list(url, depth)     char *url;     int depth;{    list_t *l = NULL;    l = (list_t *) xmalloc(sizeof(list_t));    l->ptr = (void *) xstrdup(url);    l->next = (list_t *) NULL;    l->depth = depth;    *Tail = l;    Tail = &(l->next);    return l;}list_t *free_from_list(l)     list_t *l;{    list_t *r = NULL;    r = l->next;    xfree(l->ptr);    xfree(l);    return r;}/* ---------------------------------------------------------------------- *//* *  mark_failed() - Mark that a URL failed to be retrieved, so that the *  enumerator doesn't try it again. This option may not be wanted by *  some users and so should be configurable. 
*/static void mark_failed(URL *up) {    datum k,d;    Debug(43, 9, ("mark_failed: url='%s'",up->url));    k.dptr = xstrdup(up->url);    k.dsize = strlen(k.dptr) + 1;    d.dptr = xstrdup("FailedAccess");    d.dsize = strlen(d.dptr) + 1;    if (!gdbm_exists(urldbf, k) && gdbm_store(urldbf, k, d, GDBM_INSERT))        fatal("GDBM URLDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));    xfree(k.dptr);    xfree(d.dptr);}/* *  mark_retrieved() - Mark that the given URL was successfully retrieved, *  so that the URL is not retrieved again.  This prevents cycles in the *  enumeration. */static void mark_retrieved(up)     URL *up;{    datum k, d;    Debug(43, 9, ("mark_retrieved: url='%s', md5='%s'\n", up->url, up->md5));    k.dptr = xstrdup(up->url);    k.dsize = strlen(k.dptr) + 1;    d.dptr = xstrdup(up->md5);    d.dsize = strlen(d.dptr) + 1;    if (!gdbm_exists(urldbf, k) && gdbm_store(urldbf, k, d, GDBM_INSERT))	fatal("GDBM URLDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));    if (!gdbm_exists(md5dbf, d) && gdbm_store(md5dbf, d, k, GDBM_INSERT))	fatal("GDBM MD5DB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));    xfree(k.dptr);    xfree(d.dptr);    /* Print URL to stdout to enumerate; flush to keep pipe moving */    fprintf(stdout, "%s\t%s\n", up->url, up->md5);	/* URL <tab> MD5 */    fflush(stdout);    if (nurls++ >= url_max) {	Log("Truncating RootNode %s at %d LeafNode URLs\n",	    tree_root, url_max);	url_close(up);	up = NULL;	sigdie(0);    }}/* *  url_in_db() - check to see if the URL is in the database */static int url_in_db(url)     char *url;{    datum k;    int r;    Debug(43, 9, ("url_in_db: checking for url='%s'\n", url));    k.dptr = xstrdup(url);    k.dsize = strlen(k.dptr) + 1;    r = gdbm_exists(urldbf, k);    xfree(k.dptr);    return (r);}/* *  md5_in_db() - check to see if the MD5 is in the database */static int md5_in_db(md5)     char *md5;{    datum k;    int r;    k.dptr = xstrdup(md5);    k.dsize = strlen(k.dptr) + 1;    r = gdbm_exists(md5dbf, k);    
xfree(k.dptr);    return (r);}/* *  host_in_db() - check to see if the host is in the database */static int host_in_db(host)     char *host;{    datum k;    int r;#ifdef HOST_COUNT_IP    Host *h;    h = get_host(host);    if (!h)	return 0;    k.dptr = xstrdup(h->dotaddr);#else    k.dptr = xstrdup(host);#endif    k.dsize = strlen(k.dptr) + 1;    r = gdbm_exists(hostdbf, k);    xfree(k.dptr);    return (r);}/* *  visit_server() - Determine if we should visit the server.  Return *  zero if we should not process the URL; otherwise, return non-zero. */static int visit_server(up)     URL *up;{    datum k, d;#ifdef HOST_COUNT_IP    Host *h = NULL;#endif    if (host_in_db(up->host))	/* Host is already in the db */	return (1);    if (++nhosts > host_max)	return (0);#ifdef HOST_COUNT_IP    h = get_host(up->host);    if (!h)	return (0);    k.dptr = xstrdup(h->dotaddr);#else    k.dptr = xstrdup(up->host);#endif    k.dsize = strlen(k.dptr) + 1;    d.dptr = xstrdup(up->url);    d.dsize = strlen(d.dptr) + 1;    if (gdbm_store(hostdbf, k, d, GDBM_INSERT))	fatal("GDBM HOSTDB: %s: %s", k.dptr, gdbm_strerror(gdbm_errno));    xfree(k.dptr);    xfree(d.dptr);    return (1);}int url_is_allowed(url)     char *url;{    URL *tup = NULL;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -