⭐ 虫虫下载站

📄 newsenum.c

📁 Harvest is a robot that downloads HTML web pages
💻 C
static char rcsid[] = "$Id: newsenum.c,v 2.1 1997/03/21 19:21:39 sxw Exp $";
/*
 *  newsenum.c - Enumerates news URLs
 *
 *  DEBUG: none
 *  AUTHOR: Harvest derived
 *
 *  Harvest Indexer http://harvest.sourceforge.net/
 *  -----------------------------------------------
 *
 *  The Harvest Indexer is a continued development of code developed by
 *  the Harvest Project. Development is carried out by numerous individuals
 *  in the Internet community, and is not officially connected with the
 *  original Harvest Project or its funding sources.
 *
 *  Please mail lee@arco.de if you are interested in participating
 *  in the development effort.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */
/*  Copyright (c) 1994, 1995.  All rights reserved.
 *
 *    The Harvest software was developed by the Internet Research Task
 *    Force Research Group on Resource Discovery (IRTF-RD):
 *
 *          Mic Bowman of Transarc Corporation.
 *          Peter Danzig of the University of Southern California.
 *          Darren R. Hardy of the University of Colorado at Boulder.
 *          Udi Manber of the University of Arizona.
 *          Michael F. Schwartz of the University of Colorado at Boulder.
 *          Duane Wessels of the University of Colorado at Boulder.
 *
 *    This copyright notice applies to software in the Harvest
 *    ``src/'' directory only.  Users should consult the individual
 *    copyright notices in the ``components/'' subdirectories for
 *    copyright information about other software bundled with the
 *    Harvest source code distribution.
 *
 *  TERMS OF USE
 *
 *    The Harvest software may be used and re-distributed without
 *    charge, provided that the software origin and research team are
 *    cited in any use of the system.  Most commonly this is
 *    accomplished by including a link to the Harvest Home Page
 *    (http://harvest.cs.colorado.edu/) from the query page of any
 *    Broker you deploy, as well as in the query result pages.  These
 *    links are generated automatically by the standard Broker
 *    software distribution.
 *
 *    The Harvest software is provided ``as is'', without express or
 *    implied warranty, and with no support nor obligation to assist
 *    in its use, correction, modification or enhancement.  We assume
 *    no liability with respect to the infringement of copyrights,
 *    trade secrets, or any patents, and are not responsible for
 *    consequential damages.  Proper use of the Harvest software is
 *    entirely the responsibility of the user.
 *
 *  DERIVATIVE WORKS
 *
 *    Users may make derivative works from the Harvest software, subject
 *    to the following constraints:
 *
 *      - You must include the above copyright notice and these
 *        accompanying paragraphs in all forms of derivative works,
 *        and any documentation and other materials related to such
 *        distribution and use acknowledge that the software was
 *        developed at the above institutions.
 *
 *      - You must notify IRTF-RD regarding your distribution of
 *        the derivative work.
 *
 *      - You must clearly notify users that you are distributing
 *        a modified version and not the original Harvest software.
 *
 *      - Any derivative product is also subject to these copyright
 *        and use restrictions.
 *
 *    Note that the Harvest software is NOT in the public domain.  We
 *    retain copyright, as specified above.
 *
 *  HISTORY OF FREE SOFTWARE STATUS
 *
 *    Originally we required sites to license the software in cases
 *    where they were going to build commercial products/services
 *    around Harvest.  In June 1995 we changed this policy.  We now
 *    allow people to use the core Harvest software (the code found in
 *    the Harvest ``src/'' directory) for free.  We made this change
 *    in the interest of encouraging the widest possible deployment of
 *    the technology.  The Harvest software is really a reference
 *    implementation of a set of protocols and formats, some of which
 *    we intend to standardize.  We encourage commercial
 *    re-implementations of code complying with this set of standards.
 */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <ctype.h>
#include <gdbm.h>
#include "util.h"
#include "url.h"

/* Local variables */
static char *url_file = NULL;
static GDBM_FILE urldbf = NULL;
static int url_max = 250, nurls = 0;

/* Local functions */
static void process_url();
static void usage();
static void cleanup();
static int url_in_db();
static void mark_retrieved();
static Buffer *news_enum();

/*
 *  mark_retrieved() - Mark that the given URL was successfully retrieved,
 *  so that the URL is not retrieved again.  This prevents cycles in the
 *  enumeration.
 */
static void mark_retrieved(up)
     URL *up;
{
    datum k, d;

    k.dptr = xstrdup(up->url);
    k.dsize = strlen(k.dptr) + 1;
    d.dptr = xstrdup(up->md5);
    d.dsize = strlen(d.dptr) + 1;
    if (gdbm_store(urldbf, k, d, GDBM_REPLACE)) {
        fatal("gdbm_store: %s: urldb: %s\n", k.dptr,
            gdbm_strerror(gdbm_errno));
    }
    xfree(k.dptr);
    xfree(d.dptr);
#ifdef DEBUG
    fprintf(stderr, "ADDING: %s\n", up->url);
    fprintf(stderr, "ADDING: %s\n", up->md5);
#endif
    if (up->type == URL_NEWS) {
        fprintf(stdout, "%s\t%s\n", up->url, up->md5);
        fflush(stdout);
        nurls++;
    } else {
        fprintf(stderr, "!!! URL=%s  TYPE=%d\n", up->url, up->type);
    }
}

/*
 *  url_in_db() - check to see if the URL is in the database
 */
static int url_in_db(url)
     char *url;
{
    datum k;
    int r;

    k.dptr = xstrdup(url);
    k.dsize = strlen(k.dptr) + 1;
    r = gdbm_exists(urldbf, k);
    xfree(k.dptr);
    return (r);
}

/*
 *  news_enum() - Returns all of the URLs found in the News file at url.
 *  The buffer that is returned has the URLs separated by \n's.  Returns
 *  NULL on error.
 */
static Buffer *news_enum(up)
     URL *up;
{
    FILE *fp = NULL;
    char *news_path = NULL;
    char *p = NULL;
    char *q = NULL;
    int i;
    static Buffer *b = NULL;
    static char buf[BUFSIZ];
    static char newurl[BUFSIZ];

    /* Verify that we haven't seen this before */
    if (url_in_db(up->url)) {
#ifdef DEBUG
        fprintf(stderr, "EXISTS (url): %s\n", up->url);
#endif
        return (NULL);
    }
#ifdef DEBUG
    fprintf(stderr, "RETRIEVING: %s\n", up->url);
#endif
    if (url_retrieve(up)) {
#ifdef DEBUG
        fprintf(stderr, "RETRIEVING FAILED: %s\n", up->url);
#endif
        return (NULL);
    }
    /* Remember that we've been here before */
    if (up->md5)
        mark_retrieved(up);

    /*
     *  For each pointer, convert it to a URL, and add it to
     *  the list of URLs to return.
     */
    b = create_buffer(BUFSIZ);
    if ((fp = fopen(up->filename, "r")) == NULL) {
        log_errno(up->filename);
        return (NULL);
    }
    while (fgets(buf, BUFSIZ, fp)) {
#ifdef DEBUG
        fprintf(stderr, "LINE: %s", buf);
#endif
        /* Here, we are parsing the output of an NNTP server XOVER command  */
        /* Each line is one message; fields separated by tabs:              */
        /* msgnum   subj    from    date    msgid   ??id n1   n2    Xref:   */
        /* NOTE: the msgid field is surrounded by brackets: <xxyzz@foo.com> */
        /* But a news URL looks like this:  news:xxyzz@foo.com              */
        for (q = buf, i = 0; i < 4; i++)
            if (q)
                q = strchr(q + 1, '\t');
        if (!q)
            continue;
        q += 2;                 /* skip tab and < */
        p = strchr(q, '>');
        if (!p)
            continue;
        *p = 0;
        news_path = xstrdup(q);
#ifdef DEBUG
        Log("News Path: %s\n", news_path);
#endif
        sprintf(newurl, "news:%s\n", news_path);
        add_buffer(b, newurl, strlen(newurl));
        if (news_path)
            xfree(news_path);
    }
    fclose(fp);
    return (b);
}

/*
 *  process_url() - Retrieves the given URL (News only), computes an MD5,
 *  and extracts the list of further News pointers within the document.
 *  If any of the HREF pointers are News and the same host as the given
 *  URL up, then it calls process_url() recursively.
 */
static void process_url(up)
     URL *up;
{
    Buffer *b;
    char buf[BUFSIZ], *p;
    URL *tup;

#ifdef DEBUG
    fprintf(stderr, "DOING: %s\n", up->url);
#endif
    if ((b = news_enum(up)) == NULL)
        return;
    add_buffer(b, "\0", 1);

    /*
     *  Now, for each URL in the buffer, call process_url() if
     *  the URL is a News URL and it is on the same host
     */
    p = b->data;
    while (sscanf(p, "%s\n", buf) == 1) {
        p = strchr(p, '\n') + 1;
#ifdef DEBUG
        fprintf(stderr, "PROCESSING: %s\n", buf);
#endif
        if ((tup = url_open(buf)) == NULL)
            continue;
        /* If it isn't a news URL, or the servers are different */
        if ((tup->type != URL_NEWS)) {
            url_close(tup);
            continue;
        }
        if (url_retrieve(tup)) {
#ifdef DEBUG
            fprintf(stderr, "RETRIEVING FAILED: %s\n", tup->url);
#endif
            url_close(tup);
            continue;
        }
        mark_retrieved(tup);
        url_close(tup);
        if (++nurls > url_max) {
            Log("Truncating RootNode URL at %d LeafNode URLs.\n",
                url_max);
            break;
        }
    }
    free_buffer(b);
}

static void usage()
{
    fprintf(stderr, "Usage: newsenum news-URL\n");
    exit(1);
}

int main(argc, argv)
     int argc;
     char **argv;
{
    char *s;
    URL *up;
    FILE *logfp = NULL;

    if (getenv("HARVEST_GATHERER_LOGFILE") != (char *) NULL)
        logfp = fopen(getenv("HARVEST_GATHERER_LOGFILE"), "a+");
    if (logfp == (FILE *) NULL)
        logfp = stderr;
    init_log3("newsenum", logfp, stderr);

    if (argc != 2)
        usage();
    debug_init();
    if ((s = getenv("HARVEST_URL_MAX")) != NULL)
        url_max = atoi(s);
#ifdef DEBUG
    Log("Running News enumeration: %s\n", argv[1]);
#endif
    url_file = xstrdup(tempnam(NULL, "Nurl"));
    urldbf = gdbm_open(url_file, 0, GDBM_NEWDB, 0644, NULL);
    if (urldbf == NULL) {
        errorlog("gdbm_open: %s: %s\n", url_file,
            gdbm_strerror(gdbm_errno));
        log_errno(url_file);
        exit(1);
    }
    init_url();
    if ((up = url_open(argv[1])) == NULL || up->type != URL_NEWS) {
        (void) unlink(url_file);
        usage();
    }
    printf("%s\n", up->url);    /* Print tree root */
    process_url(up);
    url_close(up);
    finish_url();
    cleanup();
    exit(0);
}

static void cleanup()
{
    gdbm_close(urldbf);
    (void) unlink(url_file);
}
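Usage note (inferred from the code above, not from separate documentation): newsenum takes a single news: URL on the command line, prints that root URL, then writes one URL<TAB>MD5 line to stdout for each article it enumerates. The HARVEST_URL_MAX environment variable caps the number of emitted URLs (250 by default), HARVEST_GATHERER_LOGFILE redirects logging away from stderr, and the temporary GDBM database created via tempnam() is unlinked on exit.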
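The core of news_enum() is the XOVER parsing loop: it skips the first four tab-separated fields of each overview line, then strips the angle brackets from the message-id to build a news: URL. Below is a minimal standalone sketch of just that step; the sample overview line and the file name xover_demo.c are hypothetical, and snprintf stands in for the original's sprintf.

/* xover_demo.c - standalone sketch of the message-id extraction in
 * news_enum().  The XOVER line below is a made-up example; a real NNTP
 * server returns one such tab-separated line per article.
 * Build: cc -o xover_demo xover_demo.c
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
    /* fields: msgnum, subject, from, date, msgid, references, bytes, lines */
    char buf[] =
        "123\tHello\talice@example.com\t21 Mar 1997\t<xxyzz@foo.com>\t\t100\t5\n";
    char newurl[BUFSIZ];
    char *p, *q;
    int i;

    /* Skip the first four tab-separated fields to reach the msgid */
    for (q = buf, i = 0; i < 4; i++)
        if (q)
            q = strchr(q + 1, '\t');
    if (!q)
        return 1;
    q += 2;                     /* skip the tab and the opening '<' */
    p = strchr(q, '>');         /* msgid ends at the closing '>' */
    if (!p)
        return 1;
    *p = 0;
    /* A news URL drops the brackets: news:xxyzz@foo.com */
    snprintf(newurl, sizeof(newurl), "news:%s", q);
    printf("%s\n", newurl);     /* prints "news:xxyzz@foo.com" */
    return 0;
}

Note that the original appends a '\n' to each URL before add_buffer(), which is what lets process_url() later walk the buffer as newline-separated entries.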
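The cycle prevention in mark_retrieved() and url_in_db() is simply a GDBM file used as a set keyed by URL. Here is a minimal sketch of that pattern; the scratch file name urldb.tmp and the placeholder value are assumptions (newsenum stores the article's MD5 as the value and names the file with tempnam()).

/* seen_demo.c - sketch of the GDBM "seen set" pattern used by newsenum
 * to avoid revisiting URLs.  Build: cc -o seen_demo seen_demo.c -lgdbm
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <gdbm.h>

static int seen(GDBM_FILE db, char *url)
{
    datum k;
    k.dptr = url;
    k.dsize = strlen(url) + 1;  /* key includes the NUL, as in newsenum */
    return gdbm_exists(db, k);
}

static void mark(GDBM_FILE db, char *url)
{
    datum k, d;
    k.dptr = url;
    k.dsize = strlen(url) + 1;
    d.dptr = "seen";            /* placeholder; newsenum stores the MD5 */
    d.dsize = 5;
    gdbm_store(db, k, d, GDBM_REPLACE);
}

int main(void)
{
    GDBM_FILE db = gdbm_open("urldb.tmp", 0, GDBM_NEWDB, 0644, NULL);
    if (db == NULL) {
        fprintf(stderr, "gdbm_open: %s\n", gdbm_strerror(gdbm_errno));
        return 1;
    }
    mark(db, "news:xxyzz@foo.com");
    printf("seen: %d\n", seen(db, "news:xxyzz@foo.com"));   /* nonzero */
    printf("seen: %d\n", seen(db, "news:other@bar.com"));   /* 0 */
    gdbm_close(db);
    unlink("urldb.tmp");        /* the database is only a scratch file */
    return 0;
}

Using an on-disk database rather than an in-memory table means the visited set survives however large the newsgroup is, at the cost of a temporary file that must be unlinked on every exit path, which is why main() and cleanup() both call unlink(url_file).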
