/* newsenum.c */
static char rcsid[] = "$Id: newsenum.c,v 2.1 1997/03/21 19:21:39 sxw Exp $";/* * newsenum.c - Enumerates news URLs * * DEBUG: none * AUTHOR: Harvest derived * * Harvest Indexer http://harvest.sourceforge.net/ * ----------------------------------------------- * * The Harvest Indexer is a continued development of code developed by * the Harvest Project. Development is carried out by numerous individuals * in the Internet community, and is not officially connected with the * original Harvest Project or its funding sources. * * Please mail lee@arco.de if you are interested in participating * in the development effort. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* Copyright (c) 1994, 1995. All rights reserved. * * The Harvest software was developed by the Internet Research Task * Force Research Group on Resource Discovery (IRTF-RD): * * Mic Bowman of Transarc Corporation. * Peter Danzig of the University of Southern California. * Darren R. Hardy of the University of Colorado at Boulder. * Udi Manber of the University of Arizona. * Michael F. Schwartz of the University of Colorado at Boulder. * Duane Wessels of the University of Colorado at Boulder. * * This copyright notice applies to software in the Harvest * ``src/'' directory only. 
Users should consult the individual * copyright notices in the ``components/'' subdirectories for * copyright information about other software bundled with the * Harvest source code distribution. * * TERMS OF USE * * The Harvest software may be used and re-distributed without * charge, provided that the software origin and research team are * cited in any use of the system. Most commonly this is * accomplished by including a link to the Harvest Home Page * (http://harvest.cs.colorado.edu/) from the query page of any * Broker you deploy, as well as in the query result pages. These * links are generated automatically by the standard Broker * software distribution. * * The Harvest software is provided ``as is'', without express or * implied warranty, and with no support nor obligation to assist * in its use, correction, modification or enhancement. We assume * no liability with respect to the infringement of copyrights, * trade secrets, or any patents, and are not responsible for * consequential damages. Proper use of the Harvest software is * entirely the responsibility of the user. * * DERIVATIVE WORKS * * Users may make derivative works from the Harvest software, subject * to the following constraints: * * - You must include the above copyright notice and these * accompanying paragraphs in all forms of derivative works, * and any documentation and other materials related to such * distribution and use acknowledge that the software was * developed at the above institutions. * * - You must notify IRTF-RD regarding your distribution of * the derivative work. * * - You must clearly notify users that your are distributing * a modified version and not the original Harvest software. * * - Any derivative product is also subject to these copyright * and use restrictions. * * Note that the Harvest software is NOT in the public domain. We * retain copyright, as specified above. 
* * HISTORY OF FREE SOFTWARE STATUS * * Originally we required sites to license the software in cases * where they were going to build commercial products/services * around Harvest. In June 1995 we changed this policy. We now * allow people to use the core Harvest software (the code found in * the Harvest ``src/'' directory) for free. We made this change * in the interest of encouraging the widest possible deployment of * the technology. The Harvest software is really a reference * implementation of a set of protocols and formats, some of which * we intend to standardize. We encourage commercial * re-implementations of code complying to this set of standards. */#include <stdio.h>#include <string.h>#include <stdlib.h>#include <unistd.h>#include <ctype.h>#include <gdbm.h>#include "util.h"#include "url.h"/* Local variables */static char *url_file = NULL;static GDBM_FILE urldbf = NULL;static int url_max = 250, nurls = 0;/* Local functions */static void process_url();static void usage();static void cleanup();static int url_in_db();static void mark_retrieved();static Buffer *news_enum();/* * mark_retrieved() - Mark that the given URL was successfully retrieved, * so that the URL is not retrieved again. This prevents cycles in the * enumeration. */static void mark_retrieved(up) URL *up;{ datum k, d; k.dptr = xstrdup(up->url); k.dsize = strlen(k.dptr) + 1; d.dptr = xstrdup(up->md5); d.dsize = strlen(d.dptr) + 1; if (gdbm_store(urldbf, k, d, GDBM_REPLACE)) { fatal("gdbm_store: %s: urldb: %s\n", k.dptr, gdbm_strerror(gdbm_errno)); } xfree(k.dptr); xfree(d.dptr);#ifdef DEBUG fprintf(stderr, "ADDING: %s\n", up->url); fprintf(stderr, "ADDING: %s\n", up->md5);#endif if (up->type == URL_NEWS) { fprintf(stdout, "%s\t%s\n", up->url, up->md5); fflush(stdout); nurls++; } else { fprintf(stderr, "!!! 
URL=%s TYPE=%d\n", up->url, up->type); }}/* * url_in_db() - check to see if the URL is in the database */static int url_in_db(url) char *url;{ datum k; int r; k.dptr = xstrdup(url); k.dsize = strlen(k.dptr) + 1; r = gdbm_exists(urldbf, k); xfree(k.dptr); return (r);}/* * news_enum() - Returns all of the URLs that News file at url. The * buffer that is returned has the URLs separated by \n's. Returns * NULL on error. */static Buffer *news_enum(up) URL *up;{ FILE *fp = NULL; char *news_path = NULL; char *p = NULL; char *q = NULL; int i; static Buffer *b = NULL; static char buf[BUFSIZ]; static char newurl[BUFSIZ]; /* Verify that we haven't seen this before */ if (url_in_db(up->url)) {#ifdef DEBUG fprintf(stderr, "EXISTS (url): %s\n", up->url);#endif return (NULL); }#ifdef DEBUG fprintf(stderr, "RETRIEVING: %s\n", up->url);#endif if (url_retrieve(up)) {#ifdef DEBUG fprintf(stderr, "RETRIEVING FAILED: %s\n", up->url);#endif return (NULL); } /* Remember that we've been here before */ if (up->md5) mark_retrieved(up); /* * For each pointer, convert it to a URL, and add it to * the list of URLs to return. 
*/ b = create_buffer(BUFSIZ); if ((fp = fopen(up->filename, "r")) == NULL) { log_errno(up->filename); return (NULL); } while (fgets(buf, BUFSIZ, fp)) {#ifdef DEBUG fprintf(stderr, "LINE: %s", buf);#endif /* Here, we are parsing the output of an NNTP server XOVER command */ /* Each line is one message; fields separated by tabs: */ /* msgnum subj from date msgid ??id n1 n2 Xref: */ /* NOTE: the msgid field is surrounded by brackets: <xxyzz@foo.com> */ /* But a news URL looks like this: news:xxyzz@foo.com */ for (q = buf, i = 0; i < 4; i++) if (q) q = strchr(q + 1, '\t'); if (!q) continue; q += 2; /* skip tab and < */ p = strchr(q, '>'); if (!p) continue; *p = 0; news_path = xstrdup(q);#ifdef DEBUG Log("News Path: %s\n", news_path);#endif sprintf(newurl, "news:%s\n", news_path); add_buffer(b, newurl, strlen(newurl)); if (news_path) xfree(news_path); } fclose(fp); return (b);}/* * process_url() - Retrieves the given URL (News only), computes an MD5, * and extracts the list of further News pointers within the document. * If any of the HREF pointers are News and the same host as the given * URL up, then it calls process_url() recursively. 
* */static void process_url(up) URL *up;{ Buffer *b; char buf[BUFSIZ], *p; URL *tup;#ifdef DEBUG fprintf(stderr, "DOING: %s\n", up->url);#endif if ((b = news_enum(up)) == NULL) return; add_buffer(b, "\0", 1); /* * Now, for each URL in the buffer, call process_url() if * the URL is an News url and it is on the same host */ p = b->data; while (sscanf(p, "%s\n", buf) == 1) { p = strchr(p, '\n') + 1;#ifdef DEBUG fprintf(stderr, "PROCESSING: %s\n", buf);#endif if ((tup = url_open(buf)) == NULL) continue; /* If it isn't a news url, or the servers are differnet */ if ((tup->type != URL_NEWS)) { url_close(tup); continue; } if (url_retrieve(tup)) {#ifdef DEBUG fprintf(stderr, "RETRIEVING FAILED: %s\n", tup->url);#endif url_close(tup); continue; } mark_retrieved(tup); url_close(tup); if (++nurls > url_max) { Log("Truncating RootNode URL at %d LeafNode URLs.\n", url_max); break; } } free_buffer(b);}static void usage(){ fprintf(stderr, "Usage: newsenum news-URL\n"); exit(1);}int main(argc, argv) int argc; char **argv;{ char *s; URL *up; FILE *logfp = NULL; if (getenv("HARVEST_GATHERER_LOGFILE") != (char *) NULL) logfp = fopen(getenv("HARVEST_GATHERER_LOGFILE"), "a+"); if (logfp == (FILE *) NULL) logfp = stderr; init_log3("newsenum", logfp, stderr); if (argc != 2) usage(); debug_init(); if ((s = getenv("HARVEST_URL_MAX")) != NULL) url_max = atoi(s);#ifdef DEBUG Log("Running News enumeration: %s\n", argv[1]);#endif url_file = xstrdup(tempnam(NULL, "Nurl")); urldbf = gdbm_open(url_file, 0, GDBM_NEWDB, 0644, NULL); if (urldbf == NULL) { errorlog("gdbm_open: %s: %s\n", url_file, gdbm_strerror(gdbm_errno)); log_errno(url_file); exit(1); } init_url(); if ((up = url_open(argv[1])) == NULL || up->type != URL_NEWS) { (void) unlink(url_file); usage(); } printf("%s\n", up->url); /* Print tree root */ process_url(up); url_close(up); finish_url(); cleanup(); exit(0);}static void cleanup(){ gdbm_close(urldbf); (void) unlink(url_file);}