⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 index.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
static char rcsid[] = "index.c,v 1.5 1996/01/04 04:07:07 duane Exp";/*  *  index.c -- Broker indexing/search support using PLS, Inc.'s PLWeb 2.0 * *  Darren Hardy, U. Colorado - Boulder * *  DEBUG: section 101, level 1         Broker PLWeb indexing engine * *  You can define the following values in the broker.conf file: * *      PLS-Root                Directory in which PLWeb is installed *      PLS-DBgroup             Name of PLWeb database group *      PLS-DBname              Name of PLWeb database *      PLS-Num-Reorg           Number of deletions before plreorg is run * *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. *   *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): *   *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. *   *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. *   *  TERMS OF USE *     *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  
These *    links are generated automatically by the standard Broker *    software distribution. *     *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. *   *  DERIVATIVE WORKS *   *    Users may make derivative works from the Harvest software, subject  *    to the following constraints: *   *      - You must include the above copyright notice and these  *        accompanying paragraphs in all forms of derivative works,  *        and any documentation and other materials related to such  *        distribution and use acknowledge that the software was  *        developed at the above institutions. *   *      - You must notify IRTF-RD regarding your distribution of  *        the derivative work. *   *      - You must clearly notify users that your are distributing  *        a modified version and not the original Harvest software. *   *      - Any derivative product is also subject to these copyright  *        and use restrictions. *   *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. *   *  HISTORY OF FREE SOFTWARE STATUS *   *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  
The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards.   *   */#include "broker.h"#include "log.h"#include "PLWeb/index.h"#ifndef USE_PARENS_FOR_BOOLEAN#undef USE_PARENS_FOR_BOOLEAN#endif#define BADQ_STR \        "103 - ERROR: PLWeb cannot support your query.\n"#define PLS_RESULT_TAG	"PLWeb Results: "/* Global variables */extern char *DIRpath;extern char *brk_obj_url;extern int qsock;extern int IndexType;extern int QM_opaqueflag;extern int QM_gotphrase;	/* got a quoted phrase or not */extern int IndexServer_pid;extern reg_t *Registry;extern char *SM_Get_Obj_Filename();	/* only UNIX filesystem SM *//* Local variables */#define LOCAL staticLOCAL char *PLWeb_plsroot = NULL;LOCAL char *PLWeb_dbgroup = NULL;LOCAL char *PLWeb_dbname = NULL;LOCAL int PLWeb_nreorg = 0;LOCAL int PLWeb_NewObj = 0;LOCAL int PLWeb_maxresults;LOCAL int PLWeb_illegal_query = 0;LOCAL int PLWeb_ncalled = 0;	/* current number of queries against server */LOCAL int PLWeb_lifetime = 15 * 60;LOCAL int PLWeb_max_lifetime = 15 * 60;		/* 15 minutes *//* Local functions */LOCAL int PLWeb_bulk_query();LOCAL int PLWeb_del_query();LOCAL int PLWeb_user_query();LOCAL char *PLWeb_do_qlist();LOCAL char *PLWeb_build_select();LOCAL fd_t PLWeb_getfd();#define BIG_BUFSIZ	(8*BUFSIZ)	/* for very long lines *//* ----------------------------------------------------------------- * * PLWeb_bulk_query - do bulk transfer of all objects that match the query * ----------------------------------------------------------------- */LOCAL int PLWeb_bulk_query(rsock, indexfp, ptime)     int rsock;     FILE *indexfp;     time_t ptime;{	char ret[BIG_BUFSIZ];	fd_t qfd, oldfd = -1;	int cnt = 0;	reg_t *bentry;	FILE *fp;	if ((fp = fdopen(rsock, "w")) == NULL) {		log_errno("fdopen");		QM_send_bulk_err(rsock);		return ERROR;	}	QM_send_bulk_begin(rsock);	while 
(fgets(ret, BIG_BUFSIZ, indexfp) != NULL) {		if (((qfd = PLWeb_getfd(ret)) != ERROR) &&		    (qfd != oldfd) &&		    ((bentry = RG_Get_Entry(qfd)) != NULL) &&		    (bentry->update_time >= ptime) &&		    (QM_send_bulk_fd(qfd, fp, bentry) == SUCCESS)) {			cnt++;		}	}	fflush(fp);		/* critical, must flush before termination */	QM_send_bulk_end(rsock);	fclose(fp);	return SUCCESS;}/* ----------------------------------------------------------------- * * PLWeb_del_query -- delete all objects that match the query.  * ----------------------------------------------------------------- */LOCAL int PLWeb_del_query(rsock, indexfp)     int rsock;     FILE *indexfp;{	char ret[BIG_BUFSIZ];	fd_t qfd, oldfd = -1;	int cnt = 0;	reg_t *rme;	while (fgets(ret, BIG_BUFSIZ, indexfp) != NULL) {		if (((qfd = PLWeb_getfd(ret)) != ERROR) &&		    (qfd != oldfd) &&		    ((rme = RG_Get_Entry(qfd)) != NULL)) {			COL_DEL_Obj(rme);			cnt++;		}	}	Log("Deleted %d objects based on query.\n", cnt);	return SUCCESS;}/* ----------------------------------------------------------------- * * PLWeb_user_query -- Read the output of the PLWeb query on indexfp, then * send to rsock via protocol. * ----------------------------------------------------------------- */LOCAL int PLWeb_user_query(rsock, indexfp)     int rsock;     FILE *indexfp;{	fd_t fd1, fd2 = (fd_t) (-1);	char inb[BIG_BUFSIZ], opb[BUFSIZ], *opdata[BIG_BUFSIZ], *tmp, *s;	int opsize = 0, obcnt = 0, i, rank, sumsize;	/* If the query was illegal, give up quickly */	if (PLWeb_illegal_query) {		SWRITE(rsock, BADQ_STR, strlen(BADQ_STR));		return ERROR;	}	/*	 *  Before we return the query results, we perform 2 write's on	 *  the socket to the client to test whether or not the client	 *  will be able to receive the query results.	 *  We have to do two writes because the first will complete 	 *  even though the other side is gone.	 
*/	(void) write(rsock, PIPECHK, strlen(PIPECHK));	if (write(rsock, PIPECHK, strlen(PIPECHK)) == -1) {		errorlog("Client is gone -- aborting user query results.\n");		close(rsock);		return ERROR;	}	memset(opdata, '\0', BIG_BUFSIZ * sizeof(char *));	/* zero out opdata */	while (fgets(inb, BIG_BUFSIZ, indexfp) != NULL) {		if ((fd1 = PLWeb_getfd(inb)) == ERROR) {			continue;		}		(void) strtok(inb, "\t");	/* tag + URL */		rank = atoi(strtok(NULL, "\t"));	/* rank info */		sumsize = atoi(strtok(NULL, "\t"));	/* summary size */		sprintf(inb, "Rank: %d   Summary Size: %d bytes\n", rank, sumsize);		opdata[0] = xstrdup(inb);		opdata[1] = NULL;		opsize = 1;		if ((fd1 != fd2) && (fd2 != (fd_t) (-1))) {			/* return the previous object */			if (QM_user_object(rsock, fd2, opsize, opdata)			    == SUCCESS)				obcnt++;			/* free the opaque data */			for (i = 0; i < BUFSIZ; i++) {				if (opdata[i] != NULL) {					xfree(opdata[i]);					opdata[i] = NULL;				}			}			opsize = 0;		}		fd2 = fd1;	}	/* Get the last object */	if (fd2 != (fd_t) (-1)) {		if (QM_user_object(rsock, fd2, opsize, opdata) == SUCCESS)			obcnt++;	}	QM_user_done(rsock, obcnt);	/* Free memory */	for (i = 0; i < BUFSIZ; i++)		if (opdata[i] != NULL)			xfree(opdata[i]);	return SUCCESS;}/* ----------------------------------------------------------------- * * PLWeb_do_qlist -- Recursive function to build a query from the list. 
* ----------------------------------------------------------------- */LOCAL char *PLWeb_do_qlist(ql)     qlist_t *ql;{#ifdef USE_PARENS_FOR_BOOLEAN	char *ll, *rl;	static char *nl;	if (ql->type == LOGICAL) {		if (ql->op == NOT) {			return NULL;		}		if ((ll = PLWeb_do_qlist((qlist_t *) ql->llist)) == NULL) {			return NULL;		}		if ((rl = PLWeb_do_qlist((qlist_t *) ql->rlist)) == NULL) {			xfree(ll);			return NULL;		}		nl = (char *) xmalloc(BUFSIZ);		nl[0] = '(';		nl[1] = '\0';		strcat(nl, ll);		switch (ql->op) {		case AND:			strncat(nl, " AND ", 5);			break;		case OR:			strncat(nl, " OR ", 4);			break;		default:			xfree(nl);			xfree(rl);			xfree(ll);			return NULL;		}		strcat(nl, rl);		strcat(nl, ")");		xfree(ll);		xfree(rl);		return (nl);	}	return (PLWeb_build_select(ql));#else	char *ll, *rl;	if (ql->type == LOGICAL) {		if (ql->op == NOT) {			return NULL;		}		if ((ll = PLWeb_do_qlist((qlist_t *) ql->llist)) == NULL) {			return NULL;		}		if ((rl = PLWeb_do_qlist((qlist_t *) ql->rlist)) == NULL) {			xfree(ll);			return NULL;		}		switch (ql->op) {		case AND:			strncat(ll, " AND ", 5);			break;		case OR:			strncat(ll, " OR ", 4);			break;		default:			xfree(rl);			xfree(ll);			return NULL;		}		strcat(ll, rl);		xfree(rl);		return (ll);	}	return (PLWeb_build_select(ql));#endif}/* ----------------------------------------------------------------- * * PLWeb_build_select -- Build the basic PLWeb query.  * ----------------------------------------------------------------- */LOCAL char *PLWeb_build_select(ql)     qlist_t *ql;{	static char *tmp;	if (ql->op == EXACT) {		tmp = (char *) xmalloc(BUFSIZ);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -