📄 db.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
12 下一页
static char rcsid[] = "db.c,v 1.43 1996/01/05 20:28:53 duane Exp";/* *  db.c - Storage Manager for the Essence system * *  DEBUG: section  61, level 1         Gatherer essence database routines * *  Darren Hardy, hardy@cs.colorado.edu, February 1994 * *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  
We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. 
* */#include <stdio.h>#include <string.h>#include <stdlib.h>#include <unistd.h>#include <sys/param.h>#include <time.h>#include <gdbm.h>#include "util.h"#include "url.h"#include "template.h"#include "essence.h"/* Local variables */static char dbfile[MAXPATHLEN + 1];	/* WORKING.gdbm */static char prodbfile[MAXPATHLEN + 1];	/* PRODUCTION.gdbm */static char indexfile[MAXPATHLEN + 1];	/* INDEX.gdbm */static char md5file[MAXPATHLEN + 1];	/* MD5.gdbm */static char reffile[MAXPATHLEN + 1];	/* REFRESH.gdbm */static GDBM_FILE dbf = NULL;	/* WORKING.gdbm */static GDBM_FILE pdbf = NULL;	/* PRODUCTION.gdbm */static GDBM_FILE idbf = NULL;	/* INDEX.gdbm */static GDBM_FILE mdbf = NULL;	/* MD5.gdbm */static GDBM_FILE rdbf = NULL;	/* REFRESH.gdbm */static int ndeletions = 0;	/* num of deletion operations */static int null_filter = 0;	/* dbcheck is nop? */static int max_deletions;/* Local functions */static void dbcheck_refresh();static Buffer *soif_to_buffer();/* *  init_db() - Initialize database routines.  n is the number of deletion *  operations allowed before reorganizing the GDBM database.  If n == 0, *  don't reorganize GDBM database. */void init_db(dbdir, n)     char *dbdir;     int n;{	max_deletions = n;	ndeletions = 0;	sprintf(dbfile, "%s/WORKING.gdbm", dbdir ? dbdir : topdir);	sprintf(prodbfile, "%s/PRODUCTION.gdbm", dbdir ? dbdir : topdir);	sprintf(indexfile, "%s/INDEX.gdbm", dbdir ? dbdir : topdir);	sprintf(md5file, "%s/MD5.gdbm", dbdir ? dbdir : topdir);	sprintf(reffile, "%s/REFRESH.gdbm", dbdir ? 
dbdir : topdir);	/* Initialize WORKING.gdbm */	dbf = gdbm_open(dbfile, 0, GDBM_NEWDB, 0644, NULL);	if (dbf == NULL) {		/* Cannot run without the working db */		log_errno(dbfile);		fatal("gdbm_open: %s: %s\n", dbfile, gdbm_strerror(gdbm_errno));	}	pdbf = gdbm_open(prodbfile, 0, GDBM_READER, 0644, NULL);	idbf = gdbm_open(indexfile, 0, GDBM_READER, 0644, NULL);	mdbf = gdbm_open(md5file, 0, GDBM_READER, 0644, NULL);	rdbf = NULL;	if (pdbf == NULL || idbf == NULL || mdbf == NULL) {		/* Act as a nop filter */		Log("WARNING: Incremental Gatherering will NOT be supported on this run.\n");		Log("\tunable to locate these database(s) needed for incremental gatherering:\n");		if (pdbf == NULL)			Log("\t%s\n", prodbfile);		if (idbf == NULL)			Log("\t%s\n", indexfile);		if (mdbf == NULL)			Log("\t%s\n", md5file);		null_filter = 1;	}	if (null_filter)		return;	/* We don't need the refresh database if we have a null filter */	rdbf = gdbm_open(reffile, 0, GDBM_NEWDB, 0644, NULL);	if (rdbf == NULL) {		Log("WARNING: gdbm_open: %s: %s\n", reffile,		    gdbm_strerror(gdbm_errno));		log_errno(reffile);	}}/* *  finish_db() - Cleaned up after database routines. */void finish_db(){	if (dbf == NULL)		return;	gdbm_sync(dbf);		/* sync to disk */#ifdef GDBM_GROWTH_BUG	if (ndeletions > 0 && gdbm_reorganize(dbf))		Log("WARNING: gdbm_reorganize: %s: %s\n", dbfile, gdbm_strerror(gdbm_errno));#endif	ndeletions = 0;	if (dbf != NULL) {		gdbm_close(dbf);		dbf = NULL;	}	if (pdbf != NULL) {		gdbm_close(pdbf);		pdbf = NULL;	}	if (idbf != NULL) {		gdbm_close(idbf);		idbf = NULL;	}	if (mdbf != NULL) {		gdbm_close(mdbf);		mdbf = NULL;	}	if (rdbf != NULL) {		gdbm_close(rdbf);		rdbf = NULL;	}}/* *  duplicate_url() - Returns non-zero if the URL is already in the *  database; zero otherwise. 
*/int duplicate_url(url)     char *url;{	datum k;	int r;	k.dptr = url;	k.dsize = strlen(url) + 1;	r = gdbm_exists(dbf, k);	return (r);}/* *  duplicate_url() - Returns non-zero if the URL is already in any of *  the databases (WORKING or PRODUCTION); zero otherwise. */int duplicate_url_any(url)     char *url;{	datum k;	int r;	k.dptr = url;	k.dsize = strlen(url) + 1;	r = gdbm_exists(dbf, k);	if (r == 0 && pdbf != NULL)		r = gdbm_exists(pdbf, k);	return (r);}/* *  add_template() - Adds the template to the database.  If should_append *  is non-zero, then the template is appended to any existing *  template data for the URL. */void add_template(template, object)     Template *template;     DataObject *object;{	datum k, d;	Buffer *b = NULL;	Template *ct = NULL;	int appending = 0;	Debug(61, 1, ("add_template(%s)\n", template->url));	/* Set the key */	k.dptr = strdup(template->url);	k.dsize = strlen(k.dptr) + 1;	/* store terminating null char, too */	if (gdbm_exists(dbf, k)) {		datum curd;		/* If a template already exists, then check nested file.  */		if ((object->flags & F_NESTED) == 0) {			errorlog("Existing GDBM Entry for non-nested %s\n",			    template->url);			xfree(k.dptr);			return;		}		/* Grab the existing template and parse it into a Template */		curd = gdbm_fetch(dbf, k);		init_parse_template_string(curd.dptr, curd.dsize);		ct = parse_template();		finish_parse_template();		free(curd.dptr);		/* Verify that the template was parsable */		if (ct == NULL) {			errorlog("Template for %s in %s is malformed.\n",			    k.dptr, dbfile);			xfree(k.dptr);			return;		}		/* Embed the current template within old template. */		if (embed_template(template, ct) == NULL) {			errorlog("add_template: Failed to embed template: %s\n",
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -