⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 index.c

📁 Harvest 是一个下载 HTML 网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
static char rcsid[] = "index.c,v 1.72 1996/01/17 10:07:45 duane Exp";/*  *    wais.c -- Broker support for WAIS searching and indexing. *  *    William G. Camargo, Chanda Dharap,  Penn State Univ.   *    Darren Hardy, Duane Wessels, Univ. of Colorado - Boulder * *  ---------------------------------------------------------------------- *  *    You can define the following values in the broker.conf file: *  *    WAIS-Database        database name for waisindex and waissearch *    WAIS-Port            port number for waisserver and waissearch *    WAIS-Host            hostname for waisserver and waissearch *    WAIS-Log             log file for waisserver *    WAIS-Index           waisindex command *    WAIS-Parse           waisparse command (WAIS, Inc. only) *    WAIS-Server          waisserver command *    WAIS-Search          waissearch command *    WAIS-Lookup          waislookup command (WAIS, Inc. only) * *  ---------------------------------------------------------------------- *  *    When using freeWAIS-0.3 with boolean support, or TMC's wais-8-b5:  *  *      To index an objects directory: *  *        % waisindex -r [-a] -d index-name directory *  *      prints information to stdout.  Then, start up a server for the index: *  *        % waisserver -p port-num -e logfile -d index-directory & *  *      To search the index: *  *        % waissearch -h host -p port-num -d index-name keywords < /dev/null  *  *  ---------------------------------------------------------------------- *  *    When using WAIS, Inc. version 2.0:   *  *      To index an objects directory: *       *        % waisparse -parse soif -r dir | waisindex -d index-name [-append] *        % waisdelete -d index-name file *       *      prints information to stdout.  
Then, start up a server for the index: *       *        % waisserver -p port-num -e logfile -d index-directory & *       *      To search the index: *       *        % echo keyword-search | waislookup -d index-name@HOST:port-num * *      Or, if you have the WAIS, Inc. Client Toolkit, then you can *      use the in-line client query code in waisquery.c by setting *      WAIS-Lookup to ``inline'' and compiling the broker using *      waisquery.c and linking with the Client Toolkit libraries. *      Also, you need to #define USE_WAIS_INLINE in this index.c file. * *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. *   *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): *   *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. *   *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. *   *  TERMS OF USE *     *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  
These *    links are generated automatically by the standard Broker *    software distribution. *     *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. *   *  DERIVATIVE WORKS *   *    Users may make derivative works from the Harvest software, subject  *    to the following constraints: *   *      - You must include the above copyright notice and these  *        accompanying paragraphs in all forms of derivative works,  *        and any documentation and other materials related to such  *        distribution and use acknowledge that the software was  *        developed at the above institutions. *   *      - You must notify IRTF-RD regarding your distribution of  *        the derivative work. *   *      - You must clearly notify users that your are distributing  *        a modified version and not the original Harvest software. *   *      - Any derivative product is also subject to these copyright  *        and use restrictions. *   *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. *   *  HISTORY OF FREE SOFTWARE STATUS *   *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  
The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards.   *   */#include "broker.h"#include "log.h"#include "Wais/index.h"/* USE_WAIS_INLINE - define to include WAIS, Inc. in-line client support */#ifndef USE_WAIS_INLINE#undef USE_WAIS_INLINE#endif#define NOSTRUCT_STR \	"103 - ERROR: WAIS Indexer does not support structured queries.\n"#define BADQ_STR \	"103 - ERROR: WAIS Indexer cannot support your query.\n"/* Global variables */extern char *DIRpath;extern char *brk_obj_url;extern int IndexType;extern int IndexServer_pid;/* Global functions */extern char *SM_Get_Obj_Filename();	/* only UNIX filesystem SM *//* Local variables */static char *WAISdbname = NULL;static char *WAIShost = NULL;static char *WAISlog = NULL;static char *WAISindex = NULL;static char *WAISparse = NULL;static char *WAISserver = NULL;static char *WAISsearch = NULL;static char *WAISflavor = NULL;static char *WAISlookup = NULL;static char *WAISdelete = NULL;static char *WAISbin = NULL;static char *newobj_fn = NULL;static int WAISport = 0;static int WAIS_NewObj = 0;static int WAIS_illegal_query = 0;/* Local functions */static char *WAIS_do_qlist();static char *WAIS_build_select();static fd_t WAIS_getfd();/*  *  WAIS_Start_WAISserver - starts a waisserver, and sets IndexServer_pid */static void WAIS_Start_WAISserver(){	static char comm[BUFSIZ];	if (WAISport < 1)		return;	Log("Starting %s on port %d.\n", WAISserver, WAISport);	/* WAIS, Inc and freeWAIS have the same server command */	sprintf(comm, "%s -p %d -e %s -d %s",		WAISserver, WAISport, WAISlog, DIRpath);#if DEBUG1	Log("\t command :%s:\n", comm);#endif	/* must use fork() rather than vfork() which causes memory leaks */	if ((IndexServer_pid = fork()) == 0) {	/* child */		char *argv[64];		close_all_fds(3);		memset(argv, '\0', sizeof(argv));		parse_argv(argv, comm);		
execvp(argv[0], argv);		perror(argv[0]);		_exit(1);	}	/* parent */	/*	 *  leave IndexServer_pid negative so that it doesn't get	 *  restarted later in do_query.	 */	if (IndexServer_pid < 0) {		log_errno("fork");		return;	}	sleep(5);		/* give WAISserver a little time */	Log("%s (pid %d) is on-line...\n", WAISserver, IndexServer_pid);	return;			/* parent */}/*  *  WAIS_Start_Indexing - Performs an indexing using the given command */static int WAIS_Start_Indexing(comm)char *comm;{	int pid, status = 0;	static char buf[BUFSIZ];	char *cmd = comm;	/* If there's a waisserver running, kill it */	if (WAISport > 0 && IndexServer_pid > 0) {		Log("Killing waisserver (pid %d)...\n", IndexServer_pid);#ifdef USE_WAIS_INLINE		teardown_search();#endif		(void) kill(IndexServer_pid, SIGTERM);		sleep(10);		(void) kill(IndexServer_pid, SIGKILL);		sleep(5);		IndexServer_pid = 0;	}	/* Check if the command string has a pipe in it.  If so, wrap   */	/* /bin/sh -c ... around the command so that it gets executed   */	/* properly.   
-DW */	if (strchr(comm, '|') != NULL) {		sprintf(buf, "/bin/sh -c 'exec %s'", comm);		cmd = buf;	}#if DEBUG1	Log("\t command :%s:\n", cmd);#endif	/* must use fork() rather than vfork() which causes memory leaks */	if ((pid = fork()) < 0) {		log_errno("fork");		return ERROR;	}	if (pid == 0) {		/* child */		char *argv[64];		close_all_fds(3);		memset(argv, '\0', sizeof(argv));		parse_argv(argv, cmd);		execvp(argv[0], argv);		perror(argv[0]);		_exit(1);	}	/* parent */	Log("Waiting for waisindex to finish...\n");	/* while waisindex is running, explicitly wait for it */	while (waitpid(pid, &status, WNOHANG) != pid) {		select_loop(15, 0, 0);	/* deny outside connections */		if (kill(pid, 0) != 0)			break;	/* child died, and was caught by sigreap */	}	/* Restart waisserver if needed */	if (WAISport > 0) {		WAIS_Start_WAISserver();	}	return SUCCESS;}/* -----------------------------------------------------------------    WAIS_bulk_query -- send SOIF objects based on query   ----------------------------------------------------------------- */static int WAIS_bulk_query(rsock, indexfp, ptime)int rsock;FILE *indexfp;time_t ptime;{	static char ret[BUFSIZ];	fd_t qfd, oldfd = -1;	int cnt = 0;	reg_t *bentry;	FILE *fp;	if ((fp = fdopen(rsock, "w")) == NULL) {		perror("fdopen");		QM_send_bulk_err(rsock);		return ERROR;	}	QM_send_bulk_begin(rsock);	while (fgets(ret, BUFSIZ, indexfp) != NULL) {		if (((qfd = WAIS_getfd(ret)) != ERROR) &&		    (qfd != oldfd) &&		    ((bentry = RG_Get_Entry(qfd)) != NULL) &&		    (bentry->update_time >= ptime) &&		    (QM_send_bulk_fd(qfd, fp, bentry) == SUCCESS)) {			cnt++;		}	}	fflush(fp);		/* critical, must flush before termination */	QM_send_bulk_end(rsock);	fclose(fp);	return SUCCESS;}/* -----------------------------------------------------------------    WAIS_del_query -- delete objects based on query.   
----------------------------------------------------------------- */static int WAIS_del_query(rsock, indexfp)int rsock;FILE *indexfp;{	static char ret[BUFSIZ];	fd_t qfd, oldfd = -1;	reg_t *rme;	int cnt = 0;	while (fgets(ret, BUFSIZ, indexfp) != NULL) {		if (((qfd = WAIS_getfd(ret)) != ERROR) &&		    (qfd != oldfd) &&		    ((rme = RG_Get_Entry(qfd)) != NULL)) {			COL_DEL_Obj(rme);			cnt++;		}	}	Log("Deleted %d objects based on query.\n", cnt);	return SUCCESS;}/* -----------------------------------------------------------------   WAIS_user_query -- Read the output of the WAIS query on indexfp,   then send to rsock via protocol.   ----------------------------------------------------------------- */static int WAIS_user_query(rsock, indexfp)int rsock;FILE *indexfp;{	int obcnt;	int opsize;	int score;	int lines;	static char ret[BUFSIZ];	char *opdata[2];	static char t[BUFSIZ];	char *s = NULL;	fd_t qfd;#if DEBUG2	Log("\tparsing waissearch output:\n");#endif	if (WAIS_illegal_query) {		SWRITE(rsock, BADQ_STR, strlen(BADQ_STR));		return SUCCESS;	}	if (WAIS_gotstructured &&	    strcasecmp(WAISflavor, "commercial-wais") != 0) {		SWRITE(rsock, NOSTRUCT_STR, strlen(NOSTRUCT_STR));		return SUCCESS;	}	/* 	 *  Now, we read the result set and transfer the results to 	 *  the user.  The OID is embedded in the 'Score:' line, and 	 *  there's always only 1 object match per line, unlike Glimpse.	 
*/	obcnt = 0;	while (fgets(ret, BUFSIZ, indexfp) != NULL) {#if DEBUG3		Log("WAIS query returned: %s\n", ret);#endif		/* See if this line has a valid OID */		qfd = WAIS_getfd(ret);		if (qfd == ERROR || qfd < 0)			continue;	/* ignore */		opsize = 0;		opdata[0] = opdata[1] = NULL;	/* Reset opdata */		/* Grab the Score and # of lines if possible */		if (strcasecmp(WAISflavor, "commercial-wais") == 0) {			if (((s = strstr(ret, "score")) != NULL) &&			    (sscanf(s, "score %d len %d",				    &score, &lines) == 2)) {				t[0] = '\0';				sprintf(t, "WAIS Results: Score: %d, length: %d", score, lines);				opdata[opsize++] = t;			}		} else {			if (((s = strstr(ret, "Score:")) != NULL) &&			    (sscanf(s, "Score: %d, lines: %d",				    &score, &lines) == 2)) {				t[0] = '\0';				sprintf(t, "WAIS Results: Score: %d, lines: %d",					score, lines);				opdata[opsize++] = t;			}		}		if (QM_user_object(rsock, qfd, opsize, opdata) == SUCCESS)			obcnt++;	}	QM_user_done(rsock, obcnt);	return SUCCESS;}/* strips attr of all non-alpha-numeric characters */static void strip_attr(attr)char *attr;{        int i,j;        static char s[BUFSIZ];	if (strcasecmp(WAISflavor, "commercial-wais"))		return;        for (i = j = 0; attr[i]; i++)                 if (isalnum((unsigned char) attr[i]))                         s[j++] = attr[i];        s[j] = '\0';        strcpy(attr, s);}/* -----------------------------------------------------------------    WAIS_do_qlist -- Recursive function to build a query from the list   ----------------------------------------------------------------- */static char *WAIS_do_qlist(ql)qlist_t *ql;{	char *ll, *rl, *nl;	if (ql->type == LOGICAL) {		if (ql->op == NOT)			return NULL;		if ((ll = WAIS_do_qlist((qlist_t *) ql->llist)) == NULL)			return NULL;		if ((rl = WAIS_do_qlist((qlist_t *) ql->rlist)) == NULL) {			xfree(ll);			return NULL;		}		nl = xmalloc(SEL_SIZE);		nl[0] = '(';		nl[1] = '\0';		strcat(nl, ll);		xfree(ll);		switch (ql->op) {		case AND:			strcat(nl, " AND 
");			break;		case OR:			strcat(nl, " OR ");			break;		default:

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -