⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cleandb.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
static char rcsid[] = "cleandb.c,v 1.22 1996/01/17 10:07:46 duane Exp";/* *  cleandb - Cleans up a GDBM database to prepare it for production use. *  Verifies that each SOIF template is legal, verifies that each *  SOIF template is printed with the libtemplate routine, and verifies *  that each SOIF object contains an 'Update-Time', and 'Gatherer-*' attr. * *  Usage: cleandb [-truncate] file * *  Darren Hardy, hardy@cs.colorado.edu, May 1994 * *  Copyright (c) 1994, 1995.  All rights reserved. *   *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): *   *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. *   *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. *   *  TERMS OF USE *     *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. 
*     *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. *   *  DERIVATIVE WORKS *   *    Users may make derivative works from the Harvest software, subject  *    to the following constraints: *   *      - You must include the above copyright notice and these  *        accompanying paragraphs in all forms of derivative works,  *        and any documentation and other materials related to such  *        distribution and use acknowledge that the software was  *        developed at the above institutions. *   *      - You must notify IRTF-RD regarding your distribution of  *        the derivative work. *   *      - You must clearly notify users that your are distributing  *        a modified version and not the original Harvest software. *   *      - Any derivative product is also subject to these copyright  *        and use restrictions. *   *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. *   *  HISTORY OF FREE SOFTWARE STATUS *   *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  
We encourage commercial *    re-implementations of code complying to this set of standards.   *   */#include <stdio.h>#include <string.h>#include <ctype.h>#include <time.h>#include <gdbm.h>#include "util.h"#include "template.h"/* *  MAX_BYTES - Maximum number of bytes allowed in a field during -truncate. */#ifndef MAX_BYTES#define MAX_BYTES	(8 * 1024)#endif/* Local functions */static void usage();static void check_template();static int do_truncate = 0;static void usage(){	fprintf(stderr, "Usage: cleandb [-truncate] db\n");	exit(1);}static void remove_keywords(t)     Template *t;{	AVList *walker = t->list;	/* Remove any keyword data that's longer than MAX_BYTES */	while (walker) {		if (strstr(walker->data->attribute, "eyword")) {			if (walker->data->vsize > MAX_BYTES) {				Log("Trimmed %d bytes from %s attribute (%s)\n",				    walker->data->vsize - MAX_BYTES, t->url,				    walker->data->attribute);				walker->data->vsize = MAX_BYTES;			}		}		walker = walker->next;	}}static void check_template(dbf, k, d)     GDBM_FILE dbf;     datum k;     datum d;{	Template *t = NULL;	Buffer *b = NULL;	datum nd;	/* Parse the template to ensure correctness */	init_parse_template_string(d.dptr, d.dsize);	t = parse_template();	finish_parse_template();	if (t == NULL) {	/* Unparsable; delete it */		Log("Deleting invalid SOIF: Unparsable: %s\n", k.dptr);		gdbm_delete(dbf, k);		xfree(k.dptr);		xfree(d.dptr);		return;	}	if (extract_AVPair(t->list, T_UPDATE) == NULL) {		Log("Deleting invalid SOIF: No %s: %s\n", T_UPDATE, k.dptr);		gdbm_delete(dbf, k);		xfree(k.dptr);		xfree(d.dptr);		free_template(t);		return;	}	if (extract_AVPair(t->list, T_GHOST) == NULL) {		Log("Deleting invalid SOIF: No %s: %s\n", T_GHOST, k.dptr);		gdbm_delete(dbf, k);		xfree(k.dptr);		xfree(d.dptr);		free_template(t);		return;	}	if (extract_AVPair(t->list, T_GNAME) == NULL) {		Log("Deleting invalid SOIF: No %s: %s\n", T_GNAME, k.dptr);		gdbm_delete(dbf, k);		xfree(k.dptr);		xfree(d.dptr);		free_template(t);		return;	
}	if (extract_AVPair(t->list, T_GVERSION) == NULL) {		Log("Deleting invalid SOIF: No %s: %s\n", T_GVERSION, k.dptr);		gdbm_delete(dbf, k);		xfree(k.dptr);		xfree(d.dptr);		free_template(t);		return;	}	if (do_truncate)		remove_keywords(t);	/* Verify that the stored data is the same as the parsed template */	b = init_print_template(NULL);	print_template(t);	nd.dptr = b->data;	nd.dsize = b->length;	if (d.dsize != nd.dsize)	/* Different templates, replace */		(void) gdbm_store(dbf, k, nd, GDBM_REPLACE);	/* Clean up */	xfree(k.dptr);	xfree(d.dptr);	finish_print_template();	free_template(t);}int main(argc, argv)     int argc;     char *argv[];{	GDBM_FILE dbf;	datum d, k, nk;	init_log3("cleandb", stdout, stderr);	if (argc > 1 && !strcmp(argv[1], "-truncate")) {		argc--;		argv++;		do_truncate = 1;	}	if (argc != 2)		usage();	dbf = gdbm_open(argv[1], 0, GDBM_WRITER, 0644, NULL);	if (dbf == NULL) {		errorlog("gdbm_open: %s: %s\n", argv[1],		    gdbm_strerror(gdbm_errno));		log_errno(argv[1]);		usage();	}	k = gdbm_firstkey(dbf);	while (k.dptr) {		nk = gdbm_nextkey(dbf, k);		d = gdbm_fetch(dbf, k);		check_template(dbf, k, d);		k = nk;	}	gdbm_close(dbf);	exit(0);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -