⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 candidate.c

📁 harvest是一个下载HTML网页的机器人
💻 C
字号:
static char rcsid[] = "candidate.c,v 1.19 1996/01/05 20:28:52 duane Exp";/* *  candidate.c - Candidate Selection for the Essence system. * *  Darren Hardy, hardy@cs.colorado.edu, February 1994 * *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  
Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. 
* */#include <stdio.h>#include <string.h>#include <stdlib.h>#include "util.h"#include "url.h"#include "essence.h"static char *stoptypes[MAX_TYPES];static char *allowtypes[MAX_TYPES];/* *  init_stoplist() - Initializes candidate selection step */void init_stoplist(){	FILE *fp;	int i;	char buf[BUFSIZ], *s;	/* Grab the allow list */	i = 0;	if (stoplist != NULL && (fp = fopen(stoplist, "r")) != NULL) {		while (fgets(buf, BUFSIZ, fp)) {			if (buf[0] == '#')				continue;			s = strtok(buf, " \t\n");			if (s != NULL)				stoptypes[i++] = strdup(s);		}		fclose(fp);	}	for (; i < MAX_TYPES; i++)		stoptypes[i] = NULL;	/* Grab the allow list */	i = 0;	if (allowlist != NULL && (fp = fopen(allowlist, "r")) != NULL) {		while (fgets(buf, BUFSIZ, fp)) {			if (buf[0] == '#')				continue;			s = strtok(buf, " \t\n");			if (s != NULL)				allowtypes[i++] = strdup(s);		}		fclose(fp);	}	for (; i < MAX_TYPES; i++)		allowtypes[i] = NULL;#ifdef NO_UNIX_RECURSE	/* Add Directory by hand */	for (i = 0; i < MAX_TYPES && stoptypes[i]; i++)		if (!strcmp(stoptypes[i], "Directory"))			break;	if (stoptypes[i] == NULL)		stoptypes[i] = strdup("Directory");#endif}/* *  finish_stoplist() - Cleans up after candidate selection step */void finish_stoplist(){	int i;	for (i = 0; i < MAX_TYPES; i++) {		if (stoptypes[i])			xfree(stoptypes[i]);		if (allowtypes[i])			xfree(allowtypes[i]);	}}/* *  allow_bytype() - Candidate selection on an object determined by *  its type.  Only allows objects with matching types.  Returns non-zero *  if the object should be a candidate; returns zero otherwise. */int allow_bytype(object)     DataObject *object;{	int i;	if (!object || !object->type)		return (0);	for (i = 0; allowtypes[i] != NULL && i < MAX_TYPES; i++) {		if (!strcmp(allowtypes[i], object->type))			return (1);	}	return (0);}/* *  stop_bytype() - Candidate selection on an object determined by *  its type.  Returns non-zero if the object should be not be a *  candidate; returns zero otherwise. 
*/int stop_bytype(object)     DataObject *object;{	int i;	if (!object || !object->type)		return (0);	for (i = 0; stoptypes[i] != NULL && i < MAX_TYPES; i++) {		if (!strcmp(stoptypes[i], object->type))			return (1);	}	return (0);}/* *  stop_byname() - Candidate selection on an object determined by *  its name.  Returns non-zero if the object should be not be a *  candidate; returns zero otherwise. */int stop_byname(object)     DataObject *object;{	return (0);}/* *  stop_byduplicate() - Candidate selection on an object determined by *  a duplicate in the database.  A duplicate need not be an exact match; *  it could be another version of the object (like the compressed *  version).  Returns non-zero if the object should not be a candidate; *  returns zero otherwise. */int stop_byduplicate(object)     DataObject *object;{	char *s, *q, buf[BUFSIZ];	int r;	/*	 *  If the object is not nested, then check to see if it's in db	 */	if ((object->flags & F_NESTED) == 0) {		r = duplicate_url(object->url->url);		if (r)			return (r);	}	/*	 *  If the object is compressed then check to see if the	 *  uncompressed version has already been done.	 */	if (!strcmp(object->type, "BZIP2Compressed") ||	    !strcmp(object->type, "Compressed") ||	    !strcmp(object->type, "GNUCompressed") ||	    !strcmp(object->type, "BZIP2CompressedTar") ||	    !strcmp(object->type, "CompressedTar") ||	    !strcmp(object->type, "GNUCompressedTar")) {		s = strdup(object->url->url);		if ((q = strrchr(s, '.')) == NULL) {	/* strip .Z, .bz2, .gz, etc */			xfree(s);			return (0);		}		*q = '\0';		r = duplicate_url_any(s);		xfree(s);		return (r);	}	/*	 *  Now check to see if the compressed version was already in the	 *  database.	 */	sprintf(buf, "%s.Z", object->url->url);	r = duplicate_url_any(buf);	if (r)		return (r);	sprintf(buf, "%s.gz", object->url->url);	r = duplicate_url_any(buf);	if (r)		return (r);	/*	 *  If we have a PostScript file, prefer the Dvi or Text version.	 
*  This is a hack and doesn't work in all cases.  For example,	 *  won't remove .ps.Z + .dvi.Z.	 */	if (!strcmp(object->type, "PostScript")) {		s = strdup(object->url->url);		if ((q = strrchr(s, '.')) == NULL) {	/* strip .ps */			xfree(s);			return (0);		}		*q = '\0';		sprintf(buf, "%s.dvi", s);	/* use DVI instead */		r = duplicate_url_any(buf);		sprintf(buf, "%s.txt", s);	/* use Text instead */		xfree(s);		if (r)			return (r);		r = duplicate_url_any(buf);		if (r)			return (r);	}	return (r);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -