⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cache.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
static char rcsid[] = "$Id: cache.c,v 2.1 1997/03/21 18:01:13 sxw Exp $";/* *  cache.c - Simple, local disk cache for liburl. *  Uses a GDBM file to map URLs to the cached files.  Uses links to copy *  files.  Locks out other processes that might make modifications to the *  cache by using the mutual exclusion protection of GDBM.  Maintains a *  Cache.size file that has the number of bytes in the cache. * *  DEBUG: section  22, level 1         Common liburl disk cache routines *  AUTHOR: Harvest derived * *  Harvest Indexer http://harvest.sourceforge.net/ *  ----------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail harvest@tardis.ed.ac.uk if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//* *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. 
* *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. 
* *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. 
* */#include <stdio.h>#include <string.h>#include <stdlib.h>#include <unistd.h>#include <errno.h>#include <fcntl.h>#include <time.h>#include <sys/time.h>#include <sys/types.h>#include <sys/stat.h>#include <gdbm.h>#include "util.h"#include "url.h"/* *  Try HAVE_SRAND48, then try HAVE_SRANDOM, otherwise assume HAVE_SRAND *//* *  CACHE_TTL - number of seconds that makes cached files invalid */#ifndef CACHE_TTL#define CACHE_TTL		(1 * 7 * 24 * 60 * 60)	/* 1 week */#endif/* *  USE_CACHE_TMPDIR is the default temporary directory of where to *  place the cache, or the environment variable TMPDIR is used. *  This directory MUST be on the same partition as TMPDIR, since we *  use link(2) for copying. */#ifndef USE_CACHE_TMPDIR#define USE_CACHE_TMPDIR	"/tmp"#endif/* Local variables */static char cachedir[BUFSIZ];static char cachetable[BUFSIZ];static char cachesize[BUFSIZ];static time_t watermark;static GDBM_FILE dbf = NULL;static int max_cache_size = (32 * 1024 * 1024);		/* 32 MBs */static int cache_ttl = CACHE_TTL;static GDBM_FILE lm_dbf = NULL;static char lmttable[BUFSIZ];/* Local functions */static void delete_cache_entry();static void get_access();static void release_access();static void die();static int get_cachesize();static void change_cachesize();static void delete_cache_url();static char *next_filename();static void init_next_filename();static void die(){	if (lm_dbf != NULL)		gdbm_close(lm_dbf);	lm_dbf = NULL;	if (dbf != NULL)		gdbm_close(dbf);	dbf = NULL;	exit(1);}/* *  finish_cache() - Cleanup the cache. */void finish_cache(){	if (lm_dbf != NULL)		gdbm_close(lm_dbf);	lm_dbf = NULL;	if (dbf != NULL)		gdbm_close(dbf);	dbf = NULL;}/* *  init_cache() - Startup the cache */void init_cache(){	char *s = getenv("TMPDIR");	struct stat sb;	/* Create a directory in which to cache the files */	sprintf(cachedir, "%s/cache-liburl", s ? 
s : USE_CACHE_TMPDIR);	(void) mkdir(cachedir, 0755);	if (access(cachedir, W_OK)) {		errorlog("Cannot use %s\n", cachedir);		die();	}	init_next_filename(cachedir);	sprintf(cachetable, "%s/Cache.gdbm", cachedir);	sprintf(cachesize, "%s/Cache.size", cachedir);	if (access(cachetable, F_OK)) {		dbf = gdbm_open(cachetable, 0, GDBM_NEWDB, 0664, NULL);		if (dbf == NULL) {			if ((gdbm_errno != GDBM_CANT_BE_WRITER) &&			    (gdbm_errno != GDBM_CANT_BE_READER)) {				errorlog("GDBM ERROR: gdbm_open: %s: %s\n",				    cachetable, gdbm_strerror(gdbm_errno));				die();			}		} else			gdbm_close(dbf);	}	dbf = NULL;	/*	 * watermark was used for comparing cached object time with time on	 * GDBM file.  Now we use the the current time instead so this could	 * go away -DW	 */	if (stat(cachetable, &sb) < 0) {		log_errno(cachetable);		watermark = 0;	} else {		watermark = sb.st_mtime;	}	watermark = watermark > 0 ? watermark : 0;	sprintf(lmttable, "%s/LMT.gdbm", cachedir);	if (access(lmttable, F_OK)) {		lm_dbf = gdbm_open(lmttable, 0, GDBM_NEWDB, 0664, NULL);		if (lm_dbf == NULL) {			if ((gdbm_errno != GDBM_CANT_BE_WRITER) &&			    (gdbm_errno != GDBM_CANT_BE_READER)) {				errorlog("GDBM ERROR: gdbm_open: %s: %s\n",				    lmttable, gdbm_strerror(gdbm_errno));				die();			}		} else			gdbm_close(lm_dbf);	}	lm_dbf = NULL;#if   defined(HAVE_SRAND48)	(void) srand48((long) time(NULL));#elif defined(HAVE_SRANDOM)	(void) srandom((unsigned) time(NULL));#else	(void) srand(time(NULL));#endif	max_cache_size = 32;	if ((s = getenv("HARVEST_MAX_LOCAL_CACHE")) != NULL)		max_cache_size = atoi(s);	if (max_cache_size < 0)		max_cache_size = 32;	max_cache_size *= 1024 * 1024;	cache_ttl = CACHE_TTL;	if ((s = getenv("GATHERER_CACHE_TTL")) != NULL)		cache_ttl = atoi(s);	if (cache_ttl < 0)		cache_ttl = CACHE_TTL;}/* *  get_access() - Obtains access to GDBM database table.  Blocks until *  it can obtain access.  Locks all other liburl's from the cache table. 
*/static void get_access(flag)     int flag;{	while (1) {		dbf = gdbm_open(cachetable, 0, flag, 0664, NULL);		if (dbf != NULL)			break;		if ((gdbm_errno != GDBM_CANT_BE_WRITER) &&		    (gdbm_errno != GDBM_CANT_BE_READER)) {			errorlog("GDBM ERROR: gdbm_open: %s: %s\n",			    cachetable, gdbm_strerror(gdbm_errno));			die();		}#ifdef HAVE_USLEEP#if   defined(HAVE_SRAND48)		(void) usleep((lrand48() % 200) + 10);	/* wait a random amount */#elif defined(HAVE_SRANDOM)		(void) usleep((random() % 200) + 10);	/* wait a random amount */#else		(void) usleep((rand() % 200) + 10);	/* wait a random amount */#endif#else		{			struct timeval sleep;			sleep.tv_sec = 0;#if   defined(HAVE_SRAND48)			sleep.tv_usec = (lrand48() % 200) + 10;#elif defined(HAVE_SRANDOM)			sleep.tv_usec = (random() % 200) + 10;#else			sleep.tv_usec = (rand() % 200) + 10;#endif#ifndef _HARVEST_HPUX_			select(0, (fd_set *) 0, (fd_set *) 0, (fd_set *) 0, &sleep);#else /* _HARVEST_HPUX_ */			select(0, (int *) 0, (int *) 0, (int *) 0, &sleep);#endif /* _HARVEST_HPUX_ */		}#endif	}	/*	 * this should be safe.  Only open this DB after the other has	 * been opened.	 */	lm_dbf = gdbm_open(lmttable, 0, flag, 0664, NULL);	if (lm_dbf == NULL) {		errorlog("GDBM ERROR: gdbm_open: %s: %s\n",		    lmttable, gdbm_strerror(gdbm_errno));		die();	}}/* *  release_access() - Releases access to the GDBM database. */static void release_access(){	if (lm_dbf != NULL)		gdbm_close(lm_dbf);	lm_dbf = NULL;	if (dbf != NULL)		gdbm_close(dbf);	dbf = NULL;}/* *  get_cache_filename() - Generates a unique filename to store in the cache */static char *get_cache_filename(){	static char *s;	while (1) {		if ((s = next_filename()) == NULL)			return (NULL);		if (access(s, F_OK))			return (s);		xfree(s);	}	return (NULL);}/* *  add_cache() - Add the URL,filename to the cache. 
*/void add_cache(url, filename, lmt)     char *url;     char *filename;     time_t lmt;{	datum k, d;	char *cfile;	struct stat sb;	int ndeletes = 0, current_size;	int status;	/* Find out some more about the file */	if (lstat(filename, &sb) < 0) {		log_errno(filename);		return;	}	if (!S_ISREG(sb.st_mode))		return;	get_access(GDBM_WRCREAT);	/* LOCK */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -