⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mkindex.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "$Id: mkindex.c,v 2.1 1997/03/21 19:21:59 sxw Exp $";/* *  mkindex.c - Builds a (URL, Timestamp) and a (URL, MD5) hash table *  from a (URL, Template) hash table. * *  Usage: mkindex [indb tstmpdb md5db] * *  For example, mkindex *               mkindex PRODUCTION.gdbm INDEX.gdbm MD5.gdbm * *  DEBUG: none *  AUTHOR: Harvest derived * *  Harvest Indexer http://www.tardis.ed.ac.uk/harvest/ *  --------------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail harvest@tardis.ed.ac.uk if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. 
*          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. 
* *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. * */#include <stdio.h>#include <string.h>#include <time.h>#include <gdbm.h>#include "util.h"#include "template.h"/* Local functions */static void usage();static void usage(){	fprintf(stderr, "Usage: mkindex [indb timedb md5db]\n");	exit(1);}static GDBM_FILE indbf = NULL, tsdbf = NULL, mddbf = NULL;static void die(x)     int x;{	if (indbf != NULL)		gdbm_close(indbf);	if (tsdbf != NULL)		gdbm_close(tsdbf);	if (mddbf != NULL)		gdbm_close(mddbf);	exit(x);}int main(argc, argv)     int argc;     char *argv[];{	char *infile = "PRODUCTION.gdbm";	char *tsfile = "INDEX.gdbm";	char *mdfile = "MD5.gdbm";	datum k;	datum nextkey;	datum d;	datum newd;	Template *template = NULL;	AVPair *avp = NULL;	/*	 *  # of internal cache buckets in gdbm.  We can use a large number	 *  here because each bucket holds a d.dptr, but our d.dptrs are	 *  very small in this program (max 33 bytes).	 *  But cannot be too large because GDBM is sloppy in how it does it.	 
*/	int cache_size = 300;	/* 3 times normal amount */	init_log3("mkindex", stderr, stderr);	if (argc == 4) {		infile = strdup(argv[1]);		tsfile = strdup(argv[2]);		mdfile = strdup(argv[3]);	} else if (argc != 1)		usage();	indbf = gdbm_open(infile, 0, GDBM_READER, 0644, NULL);	if (indbf == NULL) {		errorlog("gdbm_open: %s: %s\n", infile,		    gdbm_strerror(gdbm_errno));		die(1);	}	tsdbf = gdbm_open(tsfile, 0, GDBM_NEWDB, 0644, NULL);	if (tsdbf == NULL) {		errorlog("gdbm_open: %s: %s\n", tsfile,		    gdbm_strerror(gdbm_errno));		die(1);	}	if (gdbm_setopt(tsdbf, GDBM_CACHESIZE, &cache_size, sizeof(int)))		    Log("WARNING: Cannot reset GDBM cache size to %d.\n",		    cache_size);	mddbf = gdbm_open(mdfile, 0, GDBM_NEWDB, 0644, NULL);	if (mddbf == NULL) {		errorlog("gdbm_open: %s: %s\n", mdfile,		    gdbm_strerror(gdbm_errno));		die(1);	}	if (gdbm_setopt(mddbf, GDBM_CACHESIZE, &cache_size, sizeof(int)))		    Log("WARNING: Cannot reset GDBM cache size to %d.\n",		    cache_size);	/*	 *  Extract the (URL, Template) from indbf, then extract the	 *  timestamp from Template, then build the (Timestamp, URL)	 *  in the tsdbf.	 
*/	k = gdbm_firstkey(indbf);	while (k.dptr) {		d = gdbm_fetch(indbf, k);		if (d.dptr == NULL) {			errorlog("gdbm_fetch: %s: %s\n", infile,			    gdbm_strerror(gdbm_errno));			die(1);		}		init_parse_template_string(d.dptr, d.dsize);		template = parse_template();		finish_parse_template();		if (template == NULL) {			Log("WARNING: %s is not parseable.\n", k.dptr);			goto next_item;		}		avp = extract_AVPair(template->list, T_UPDATE);		if (avp == NULL) {			errorlog("%s not in template %s\n", T_UPDATE,			    template->url);			die(1);		}		/* Store URL->timestamp mapping in the INDEX */		newd.dptr = xmalloc(avp->vsize + 1);		memcpy(newd.dptr, avp->value, avp->vsize);		newd.dptr[avp->vsize] = '\0';		newd.dsize = avp->vsize + 1;	/* include \0 */		if (gdbm_store(tsdbf, k, newd, GDBM_INSERT)) {			errorlog("gdbm_store: %s: %s\n", tsfile,			    gdbm_strerror(gdbm_errno));			die(1);		}		xfree(newd.dptr);		avp = extract_AVPair(template->list, T_MD5);		if (avp != NULL) {			/* Store URL->timestamp mapping in the INDEX */			newd.dptr = xmalloc(avp->vsize + 1);			memcpy(newd.dptr, avp->value, avp->vsize);			newd.dptr[avp->vsize] = '\0';			newd.dsize = avp->vsize + 1;	/* include \0 */			if (gdbm_store(mddbf, k, newd, GDBM_INSERT)) {				errorlog("gdbm_store: %s: %s\n", mdfile,				    gdbm_strerror(gdbm_errno));				die(1);			}			xfree(newd.dptr);		}	      next_item:		nextkey = gdbm_nextkey(indbf, k);		free(k.dptr);		free(d.dptr);		free_template(template);		k = nextkey;	}	die(0);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -