⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 stor_reg.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
static char rcsid[] = "$Id: stor_reg.c,v 2.2 1997/04/17 14:43:13 sxw Exp $";/*  *  stor_reg.c -- Primitives for Registry Storage on disk * *  Maintains the registry on disk. * *  DEBUG: none *  AUTHOR: Harvest derived (Darren Hardy) * *  Harvest Indexer http://www.tardis.ed.ac.uk/harvest/ *  --------------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. *  *  Please mail harvest@tardis.ed.ac.uk if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. *   *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. *   *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. *   *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): *   *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. 
*          Duane Wessels of the University of Colorado at Boulder. *   *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. *   *  TERMS OF USE *     *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. *     *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. *   *  DERIVATIVE WORKS *   *    Users may make derivative works from the Harvest software, subject  *    to the following constraints: *   *      - You must include the above copyright notice and these  *        accompanying paragraphs in all forms of derivative works,  *        and any documentation and other materials related to such  *        distribution and use acknowledge that the software was  *        developed at the above institutions. *   *      - You must notify IRTF-RD regarding your distribution of  *        the derivative work. 
*   *      - You must clearly notify users that your are distributing  *        a modified version and not the original Harvest software. *   *      - Any derivative product is also subject to these copyright  *        and use restrictions. *   *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. *   *  HISTORY OF FREE SOFTWARE STATUS *   *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards.   *   */#include "broker.h"#include "log.h"#include <unistd.h>static int rfd = -1;		/* Registry file descriptor */static char *registry_file = NULL;	/* Registry file name *//*  *  init_registry_file() - Initializes the Registry file.  MUST call *  this function before any other of the below functions. */int init_registry_file(){	char *s = UTIL_make_admin_filename("Registry");	int r;	r = set_registry_file(s);	xfree(s);	return (r);}int set_registry_file(filename)char *filename;{	registry_file = xstrdup(filename);	if ((rfd = open(registry_file, O_RDWR | O_CREAT, 0664)) < 0) {		log_errno(registry_file);		errorlog("Cannot write to Registry file: %s\n", registry_file);		return ERROR;	}	return SUCCESS;}char *get_registry_file(){	return (xstrdup(registry_file));}/* *  finish_registry_file() - Cleans up the registry file. 
*/void finish_registry_file(){	xfree(registry_file);	close(rfd);	registry_file = NULL;	rfd = -1;}#if defined(__FreeBSD__)long my_lseek(fd, offset, whence)int fd;long offset;int whence;{        off_t lseek(int, off_t, int);        return((long) lseek(fd, (off_t) offset, whence));}#endif/* *  seek_registry() - Interface to lseek(2) for the registry file. */off_t seek_registry(offset, mark)off_t offset;int mark;{	static off_t pos;#if defined(__FreeBSD__)	if ((pos = my_lseek(rfd, offset, mark)) < 0) {		log_errno("seek_registry: lseek");	}#else	if ((pos = lseek(rfd, offset, mark)) < 0) {		log_errno("seek_registry: lseek");	}#endif	return pos;}/* *  read_header() - Reads the header from the Registry file.  Must call *  init_registry_file() before calling this function.  Will return a  *  pointer to malloc'ed space that contains the registry header upon *  success.  If the header's magic number or version number aren't *  consistent, then returns NULL.  On error, returns NULL. *  If the registry file is empty, then it returns REGISTRY_EOF. 
*/REGISTRY_HEADER *read_header(){	static REGISTRY_HEADER *rhdr;	int n;	if (seek_registry((off_t)0, SEEK_SET) < 0) {		return (NULL);	}	rhdr = (REGISTRY_HEADER *) xmalloc(sizeof(REGISTRY_HEADER));	if ((n = read(rfd, rhdr, sizeof(REGISTRY_HEADER))) < 0) {		log_errno(registry_file);		xfree(rhdr);		return (NULL);	}	if (n == 0) {		xfree(rhdr);		return ((REGISTRY_HEADER *) REGISTRY_EOF);	}	rhdr->magic = ntohl(rhdr->magic);	rhdr->version = ntohl(rhdr->version);	rhdr->nrecords = ntohl(rhdr->nrecords);	rhdr->nrecords_deleted = ntohl(rhdr->nrecords_deleted);	rhdr->nrecords_valid = ntohl(rhdr->nrecords_valid);	/* Check consistency of the header */	if (rhdr->magic != REGISTRY_MAGIC) {		errorlog("read_header: Registry is corrupt.\n");		xfree(rhdr);		return (NULL);	}	if (rhdr->version != REGISTRY_VERSION) {		errorlog("read_header: Unknown Registry version %d\n",			rhdr->version);		xfree(rhdr);		return (NULL);	}	return (rhdr);}/* *  write_header() - Writes the header from the Registry file.  Must call *  init_registry_file() before calling this function.  If the header's  *  magic number or version number aren't consistent, then returns ERROR  *  and doesn't write to the file.  On error, returns ERROR. */int write_header(rhdr)REGISTRY_HEADER *rhdr;{	REGISTRY_HEADER tmp;	/* Check consistency of the header */	if (rhdr->magic != REGISTRY_MAGIC) {		errorlog("write_header: Registry is corrupt.\n");		return ERROR;	}	if (rhdr->version != REGISTRY_VERSION) {		errorlog("write_header: Unknown Registry version %d\n",			rhdr->version);		return ERROR;	}	tmp.magic = htonl(rhdr->magic);	tmp.version = htonl(rhdr->version);	tmp.nrecords = htonl(rhdr->nrecords);	tmp.nrecords_deleted = htonl(rhdr->nrecords_deleted);	tmp.nrecords_valid = htonl(rhdr->nrecords_valid);	if (seek_registry((off_t)0, SEEK_SET) < 0) {		return ERROR;	}	if (write(rfd, &tmp, sizeof(REGISTRY_HEADER)) < 0) {		log_errno(registry_file);		return ERROR;	}	return SUCCESS;}/* *  read_record() - Reads the next record from the registry.  
To reset *  to the beginning of the registry file, use read_header().  MUST *  call read_header() before the first read_record().  For example, * *              read_header() *              while (read_record() != REGISTRY_EOF) { }; *   *  Will return REGISTRY_EOF when finished reading the Registry. *  Will read the next record from the registry, parse it, writes the *  appropriate values to the allocated registry_entry, then returns SUCCESS.   *  If the record is marked deleted, then it returns ENTRY_DELETED and  *  doesn't write any data to registry_entry.  On error, it returns ERROR  *  and the data written to registry_entry is undefined. */int read_record(registry_entry)reg_t *registry_entry;{	RECORD_HEADER rhdr;	char *record, *rp;	GathererID MyGid;	int n;	num32 x;	/* Read and canonicalize the record header */	if ((n = read(rfd, &rhdr, sizeof(RECORD_HEADER))) < 0) {		log_errno(registry_file);		return ERROR;	}	if (n == 0)		return REGISTRY_EOF;	rhdr.record_size = ntohl(rhdr.record_size);	rhdr.magic = ntohl(rhdr.magic);	rhdr.flag = ntohl(rhdr.flag);	/* See if the header is corrupt */	if (rhdr.magic != REGISTRY_MAGIC) {		errorlog("Record Header is corrupt at offset %d: 0x%08x 0x%08x 0x%08x\n",			seek_registry((off_t)0, SEEK_CUR),			rhdr.record_size, rhdr.magic, rhdr.flag);		return ERROR;	}	/* Check to see if the record is deleted.  If so, skip nbytes */	if (IS_DELETED(rhdr.flag)) {		if (seek_registry((off_t)rhdr.record_size, SEEK_CUR) < 0) {			return ERROR;		}		return ENTRY_DELETED;	}	/* We're ready to read the record.  
malloc the space and read */	record = (char *) xmalloc(rhdr.record_size);	if ((n = read(rfd, record, rhdr.record_size)) < 0) {		log_errno(registry_file);		xfree(record);		return ERROR;	}	/* 	 *  Now parse the record into the registry_entry space 	 *	 *  Each record looks exactly like this:	 *	 *      4 network-order bytes of URL size	 *      n bytes of URL	 *      4 network-order bytes of Gatherer-Name size	 *      n bytes of Gatherer-Name	 *      4 network-order bytes of Gatherer-Host size

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -