⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 collector.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
static char rcsid[] = "$Id: collector.c,v 2.2 2000/01/21 17:37:33 sxw Exp $";/* *  collector.c -- Utility procs for add/delete/refresh objects in the Broker. * *  DEBUG: section  71, level 1         Broker collection routines *  AUTHOR: Harvest Derived (William G. Camargo, Darren Hardy) * *  Harvest Indexer http://harvest.sourceforge.net/ *  ----------------------------------------------- * *  The Harvest Indexer is a continued development of code developed by *  the Harvest Project. Development is carried out by numerous individuals *  in the Internet community, and is not officially connected with the *  original Harvest Project or its funding sources. * *  Please mail lee@arco.de if you are interested in participating *  in the development effort. * *  This program is free software; you can redistribute it and/or modify *  it under the terms of the GNU General Public License as published by *  the Free Software Foundation; either version 2 of the License, or *  (at your option) any later version. * *  This program is distributed in the hope that it will be useful, *  but WITHOUT ANY WARRANTY; without even the implied warranty of *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the *  GNU General Public License for more details. * *  You should have received a copy of the GNU General Public License *  along with this program; if not, write to the Free Software *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. *//*  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. 
Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. 
* *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. * */#include "broker.h"#include "log.h"#define LUPDATE_SIZE MAXHOSTNAMELEN+30/* Global variables */extern char *Gather;extern char *HName;extern char *DIRpath;extern char *ColConfig;extern char *obj_desc;extern int obj_desc_s;int new_nobjs, up_nobjs, del_nobjs, ref_nobjs, recv_nobjs, ign_nobjs;time_t max_update_time = 0;GathererID *COL_gid = NULL;/* Local functions */static int COL_put_last_update();/* ----------------------------------------------------------------- * COL_UPD_Obj_begin() -- initialize a new summary object. 
* ----------------------------------------------------------------- */FILE *COL_UPD_Obj_begin(entry)     reg_t *entry;{	fd_t fd;	if ((fd = SM_Create_Obj()) == ERROR) {		errorlog("Collector: Cannot create a new object.\n");		return NULL;	}	entry->FD = fd;	return (SM_Write_Obj(fd));}/* ----------------------------------------------------------------- * * COL_Fill_Entry() -- fill in refresh rate and expiration times based * on given/default info. * ----------------------------------------------------------------- */int COL_Fill_Entry(entry)     reg_t *entry;{	/*	 *  All Registry entries MUST have:	 *      URL, Gatherer-Name, Gatherer-Host, Gatherer-Version,	 *      and Update-Time	 *  MD5's are optional, but may used in elimination searches.	 */	if (entry->url == NULL) {		errorlog("%s%s: %s attribute is missing from object: %s\n",		    COLLECT, ENTRY_ERR, "URL", entry->url);		return ERROR;	}	if (entry->update_time == 0) {		errorlog("%s%s: %s attribute is missing from object: %s\n",		    COLLECT, ENTRY_ERR, "Update-Time", entry->url);		return ERROR;	}	if (COL_gid->gn == NULL) {		errorlog("%s%s: %s attribute is missing from object: %s\n",		    COLLECT, ENTRY_ERR, "Gatherer-Name", entry->url);		return ERROR;	}	if (COL_gid->gh == NULL) {		errorlog("%s%s: %s attribute is missing from object: %s\n",		    COLLECT, ENTRY_ERR, "Gatherer-Host", entry->url);		return ERROR;	}	if (COL_gid->gv == NULL) {		errorlog("%s%s: %s attribute is missing from object: %s\n",		    COLLECT, ENTRY_ERR, "Gatherer-Version", entry->url);		return ERROR;	}	COL_gid->GID = -1;	entry->GID = RG_gid_register(COL_gid);	if (entry->GID == -1) {		errorlog("%s%s: illegal Gatherer ID for object.\n",		    COLLECT, ENTRY_ERR);		return ERROR;	}	/* Set default values */	if (entry->lmt < 1)		entry->lmt = 0;	if (entry->refresh_rate < 1)		entry->refresh_rate = (time_t) WEEK;	return SUCCESS;}/* ----------------------------------------------------------------- * * COL_UPD_Obj_end() -- Add initialized summary object to 
the Broker. * ----------------------------------------------------------------- */int COL_UPD_Obj_end(entry)     reg_t *entry;{	reg_t *tmp;	int updating = 0;	/*	 *  We want to see if the new object already matches any objects	 *  in the current Registry.  If it does, then if the new object's	 *  Update-Time is older than or the same as the Registry object's	 *  Update-Time, then we ignore the new object.  Otherwise, we need	 *  to replace the Registry objects with the new object.  We do this	 *  by deleting the Registry objects, then adding the new object	 *  to the Registry.  The RG_Cleaner() will run periodically	 *  to compress the Registry.	 *	 *  If the new object is not in the Registry, then it's new	 *  so we add it to the Registry.	 */	while ((tmp = RG_Object_Search_Entry(entry)) != NULL) {		if (tmp->update_time >= entry->update_time) {			(void) SM_Destroy_Obj(entry->FD);			RG_Free_Entry(entry);			ign_nobjs++;			if (updating) del_nobjs++; /* There will be no update */			return SUCCESS;		} else {			(void) RG_Clean_Entry(tmp);			if (updating) del_nobjs++; /* One might be updated, */			updating = 1;		   /* others are deleted.   */		}	}	if (RG_Register(entry) == ERROR) {		RG_Free_Entry(entry);		return ERROR;	}	do_IND_New_Object(entry);	LOGUPDATE(entry);	if (updating) {		up_nobjs++;	} else		new_nobjs++;	return SUCCESS;}/* ----------------------------------------------------------------- * COL_DEL_Obj() -- remove an object from the Broker. * ----------------------------------------------------------------- */int COL_DEL_Obj(entry)     reg_t *entry;{	reg_t *tmp;	int err = SUCCESS;	if ((tmp = RG_Object_Search_Entry(entry)) != NULL) {		LOGDELETE(tmp);		if (RG_Clean_Entry(tmp) == ERROR)			err = ERROR;		del_nobjs++;	} else {		ign_nobjs++;	}	RG_Free_Entry(entry);	return (err);}/* ----------------------------------------------------------------- * COL_REF_Obj -- update expiration time of an object. 
* ----------------------------------------------------------------- */int COL_REF_Obj(entry)     reg_t *entry;{	reg_t *tmp;	/*	 *  When refreshing the object, all we need to do is save	 *  the new update_time, then write it to the Registry file.	 */	if ((tmp = RG_Object_Search_Entry(entry)) != NULL) {		/* save new expiration time on disk */		tmp->update_time = entry->update_time;		replace_record(tmp);		LOGREFRESH(entry);		ref_nobjs++;		RG_Free_Entry(entry);		return SUCCESS;	}	ign_nobjs++;	RG_Free_Entry(entry);	return ERROR;}/* ----------------------------------------------------------------- * * COL_Save_Att() -- decide which attributes are needed in the registry * and save/free it * ----------------------------------------------------------------- */int COL_Save_Att(wlk, entry)     AVPair *wlk;     reg_t *entry;{	num32 len;	char *field_name;	char *value;	field_name = wlk->attribute;	value = wlk->value;	len = (num32) wlk->vsize;	/* We can assume that these strcmp will only match once per object */	if (strcmp(field_name, GATH_HOST) == 0) {		COL_gid->gh = (char *) xmalloc(len + 1);		memcpy(COL_gid->gh, value, len);		COL_gid->gh[len] = '\0';		COL_gid->ghs = len;		return SUCCESS;	} else if (strcmp(field_name, GATH_NAME) == 0) {		COL_gid->gn = (char *) xmalloc(len + 1);		memcpy(COL_gid->gn, value, len);		COL_gid->gn[len] = '\0';		COL_gid->gns = len;		return SUCCESS;	} else if (strcmp(field_name, GATH_VER) == 0) {		COL_gid->gv = (char *) xmalloc(len + 1);		memcpy(COL_gid->gv, value, len);		COL_gid->gv[len] = '\0';		COL_gid->gvs = len;		return SUCCESS;	} else if (strcmp(field_name, MD5) == 0) {		entry->md5 = (char *) xmalloc(len + 1);		memcpy(entry->md5, value, len);		entry->md5[len] = '\0';		entry->md5s = len;		return SUCCESS;	} else if (strcmp(field_name, LMT_A) == 0) {		entry->lmt = (time_t) atol(value);		if (entry->lmt < 1)			entry->lmt = 0;		return SUCCESS;	} else if (strcmp(field_name, UPDATE_A) == 0) {		entry->update_time = (time_t) atol(value);		if 
(entry->update_time < 1)			entry->update_time = 0;		return SUCCESS;	} else if (strcmp(field_name, TTL) == 0) {		entry->ttl = (time_t) atol(value);		if (entry->ttl < 1)			entry->ttl = 0;		return SUCCESS;	} else if (strcmp(field_name, REFRESH_A) == 0) {		entry->refresh_rate = (time_t) atol(value);		if (entry->refresh_rate < 1)			entry->refresh_rate = 0;		return SUCCESS;	} else if (strcasecmp(field_name, obj_desc) == 0) {		if (entry->desc != NULL) {			/* Ignore duplicate description field */			return SUCCESS;		}#ifdef TRUNCATE_DESCRIPTIONS		{			/* Makes all descriptions one-line only */			int maxdesc = 70, x;			char *s;			/* don't malloc too much; we truncate at maxdesc */			x = ((maxdesc + 10) < len) ? (maxdesc + 10) : len;			entry->desc = (char *) xmalloc(x + 1);			memcpy(entry->desc, value, x);			entry->desc[x] = '\0';			/* See if chopping at the first newline will do it */			if ((s = strchr(entry->desc, '\n')) != NULL)				*s = '\0';			if (strlen(entry->desc) > maxdesc) {				/* we'd better just chop off the end */				entry->desc[maxdesc - 1] = '\0';				entry->desc[maxdesc - 2] = '.';				entry->desc[maxdesc - 3] = '.';				entry->desc[maxdesc - 4] = '.';			}			/* reassign buffer */			s = xstrdup(entry->desc);			xfree(entry->desc);			entry->desc = s;		}#else		entry->desc = (char *) xmalloc(len + 1);		memcpy(entry->desc, value, len);		entry->desc[len] = '\0';#endif		entry->descs = strlen(entry->desc);		return SUCCESS;	}	return ERROR;}/* ----------------------------------------------------------------- * * COL_Normalize() do some thesaurus, normalization-- changes all * field names to lower case * ----------------------------------------------------------------- */char *COL_Normalize_Name(name)     char *name;{	UTIL_Make_Lower(name);	return (name);}static int To_Gatherer[2];/* *  COL_Create_Read_Pipe - creates a read pipe from the gather process. 
*/FILE *COL_Create_Read_Pipe(cmd)     char *cmd;{	int pid;	static FILE *fp;	if (pipe(To_Gatherer) < 0) {		log_errno("pipe");		return (NULL);	}	/* need to use fork() rather than vfork() because of a memory leak */	if ((pid = fork()) < 0) {		log_errno("fork");		return (NULL);	}	if (pid == 0) {		/* child */		char *argv[64];		/* simple parsing of the command string */		memset(argv, '\0', sizeof(char *) * 64);		parse_argv(argv, cmd);		/* make 'gather' talk with the Broker */		close(To_Gatherer[0]);		dup2(To_Gatherer[1], 1);	/* stdout -> write pipe */		/* close to prevent gather from getting the broker sockets */		close_all_fds(3);		/* stdin is /dev/null, stdout is pipe, stderr is broker.out */		execvp(argv[0], argv);		perror(argv[0]);		_exit(1);	}	/* parent */	close(To_Gatherer[1]);	if ((fp = fdopen(To_Gatherer[0], "r")) == NULL) {		errorlog("COL_Create_Read_Pipe: fdopen(%d, \"r\") failed.\n",		    To_Gatherer[0]);		close(To_Gatherer[0]);		return (NULL);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -