⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 index.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
		tmp[0] = '\0';		if (ql->llist) {			PLWeb_illegal_query = 1;			sprintf(tmp, "%s:%s", ql->llist, ql->rlist);			xfree(ql->rlist);			xfree(ql->llist);			return (tmp);		}		sprintf(tmp, "%s", ql->rlist);		xfree(ql->llist);		return (tmp);	}	if (ql->op == REGEX) {		tmp = (char *) xmalloc(BUFSIZ);		tmp[0] = '\0';		if (ql->llist) {			PLWeb_illegal_query = 1;			sprintf(tmp, "%s:%s", ql->llist, ql->rlist);			xfree(ql->rlist);			xfree(ql->llist);			return (tmp);		}		sprintf(tmp, "%s", ql->rlist);		xfree(ql->llist);		return (tmp);	}	return NULL;}/* ----------------------------------------------------------------- * * PLWeb_getfd -- Get the fd of the PLWeb return. * ----------------------------------------------------------------- */LOCAL fd_t PLWeb_getfd(instr)     char *instr;{	char *buf, *s;	reg_t *r;	if (!strncmp(instr, PLS_RESULT_TAG, strlen(PLS_RESULT_TAG)))		return ERROR;	/* grab the URL from the first tab delimited field */	buf = xstrdup(instr + strlen(PLS_RESULT_TAG) - 1);	for (s = buf; *s; s++) {		if (*s == '\t') {			*s = '\0';			break;		}	}	/* do an approx search by URL to get the OID */	if ((r = RG_Object_Search_ByURL(buf)) == NULL) {		Debug(101, 1, ("PLWeb_getfd: no match for URL '%s'\n", buf));		xfree(buf);		return ERROR;	}	xfree(buf);	return (r->FD);}/* XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX  * PUBLIC FUNCTIONS * XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX *//* ----------------------------------------------------------------- * * IND_New_Object -- index a new object * ----------------------------------------------------------------- */int PLWeb_IND_New_Object(entry)     reg_t *entry;{	int ret = SUCCESS;	if (IndexType == I_PER_OBJ)		ret = ERROR;	/*PLWeb_Index_Object(entry); */	if (ret == SUCCESS)		PLWeb_NewObj++;	return (ret);}/* ----------------------------------------------------------------- * * IND_Index_Full() -- perform a complete index of all objects. 
* ----------------------------------------------------------------- */int PLWeb_IND_Index_Full(){	char *tfn, *fn, cmd[4 * BUFSIZ], surfn[BUFSIZ];	FILE *fp;	reg_t *tmp;	Log("Begin PLWeb Full Indexing...\n");	if ((tfn = tempnam(NULL, "plweb")) == NULL) {		log_errno("tempnam");		errorlog("Cannot find tmp filename?\n");		return ERROR;	}	if ((fp = fopen(tfn, "w")) == NULL) {		log_errno(tfn);		errorlog("Cannot write tempfile?\n");		return ERROR;	}	/* Walk the entire Registry to generate cheat file for pladdsur.pl */	for (tmp = Registry; tmp != NULL; tmp = tmp->next) {		fn = SM_Get_Obj_Filename(tmp->FD);		sprintf(surfn, "%s.sur", fn);		(void) unlink(surfn);		if (tmp->desc == NULL) {			fprintf(fp, "%s.sur %s %s \"<URL:%s>\"\n",			    fn, fn, tmp->url, tmp->url);		} else {			fprintf(fp, "%s.sur %s %s \"%s\"\n",			    fn, fn, tmp->url, tmp->desc);		}		xfree(fn);	}	(void) fclose(fp);	sprintf(cmd, "pls_newdbgroup %s %s; pls_newdb %s %s %s; pls_index %s %s %s %s", PLWeb_dbgroup, PLWeb_plsroot, PLWeb_dbname, PLWeb_dbgroup, PLWeb_plsroot, PLWeb_dbname, PLWeb_dbgroup, PLWeb_plsroot, tfn);	Debug(101, 1, ("RUNNING PLWeb command: %s\n", cmd));	do_system(cmd);	(void) unlink(tfn);	xfree(tfn);	Log("Finished PLWeb Full Indexing.\n");	return SUCCESS;}/* ----------------------------------------------------------------- * * IND_Index_Incremental -- perform an incremental index * ----------------------------------------------------------------- */int PLWeb_IND_Index_Incremental(){	Log("Sorry, PLWeb Incremental Indexing is UNSUPPORTED!\n");	return ERROR;}/* ----------------------------------------------------------------- * * IND_Index_Start -- prepare for indexing a stream of objects. * ----------------------------------------------------------------- */int PLWeb_IND_Index_Start(){	PLWeb_NewObj = 0;	return SUCCESS;}/* ----------------------------------------------------------------- * * IND_Index_Flush -- finish indexing a stream of objects. 
* ----------------------------------------------------------------- */int PLWeb_IND_Index_Flush(){	if (PLWeb_NewObj > 0) {		/* Do the default indexing operation */		switch (IndexType) {		case I_FULL:			return (PLWeb_IND_Index_Full());		case I_INCR:			return (PLWeb_IND_Index_Incremental());		case I_PER_OBJ:			break;		default:			fatal("PLWeb_IND_Index_Flush: Internal error.\n");		}	}	return SUCCESS;}/* ----------------------------------------------------------------- * * IND_Destroy_Obj -- remove an object from the indexer. * ----------------------------------------------------------------- */int PLWeb_IND_Destroy_Obj(entry)     reg_t *entry;{	char *fn, *cmd;	/* this is slow since you need 1 fork per delete */	if (SM_Exist_Obj(entry->FD) == TRUE) {		Log("Removing PLWeb object %d from index.\n", entry->FD);		fn = SM_Get_Obj_Filename(entry->FD);		sprintf(cmd, "pls_delete %s %s %s %d %s",		    PLWeb_dbname, PLWeb_dbgroup, PLWeb_plsroot,		    PLWeb_nreorg, fn);		do_system(cmd);		xfree(fn);	}	return SUCCESS;}/* ----------------------------------------------------------------- * * IND_initialize -- initialize interface to indexer * ----------------------------------------------------------------- *//* * ** PURIFY: complains that all these strdup()'s are memory leaks. 
*/int PLWeb_IND_initialize(){	IndexType = I_FULL;	PLWeb_plsroot = xstrdup("/usr/local/pls");	PLWeb_dbgroup = xstrdup("Harvest");	PLWeb_dbname = xstrdup("MyBroker");	PLWeb_max_lifetime = 15 * 60;	PLWeb_nreorg = 64;	PLWeb_ncalled = 0;	return SUCCESS;}/* ----------------------------------------------------------------- * * IND_do_query -- process a query string  * ----------------------------------------------------------------- */int PLWeb_IND_do_query(ql, rsock, qflag, ptime)     qlist_t *ql;     int rsock, qflag;     time_t ptime;{	FILE *indexfp;	char commandstr[BUFSIZ], xbuf[BUFSIZ], *patstr, *tfn = NULL;	int err = SUCCESS;	PLWeb_ncalled++;	sprintf(commandstr, "pls_search %s %s %s ",	    PLWeb_dbname, PLWeb_dbgroup, PLWeb_plsroot);	/* Generate PLWeb pattern to search */	patstr = PLWeb_do_qlist(ql);	if (patstr != NULL) {		sprintf(commandstr + strlen(commandstr), " \'%s\' ", patstr);		xfree(patstr);		/* Need a tmpfile for PLWeb output */		if ((tfn = tempnam(NULL, "query")) != NULL) {			strcat(commandstr, " > ");			strcat(commandstr, tfn);		} else {			SWRITE(rsock, IND_FAIL, IND_FAIL_S);			return ERROR;	/* shouldn't really happen */		}		Debug(101, 1, ("PLWeb search command: %s\n", commandstr));		/* Run the user query, give only PLWeb_lifetime seconds */		do_system_lifetime(commandstr, PLWeb_lifetime);		/* Now process the tempfile that contains the results */		if ((indexfp = fopen(tfn, "r")) == NULL) {			log_errno(tfn);			(void) unlink(tfn);			xfree(tfn);	/* PURIFY */			if (qflag == UQUERY) {				SWRITE(rsock, IND_FAIL, IND_FAIL_S);			} else {				QM_send_bulk_err(rsock);			}			(void) close(rsock);			return ERROR;		}		/* Process the PLWeb results based on this query type */		switch (qflag) {		case QBULK:#ifdef FORK_ON_BULK			if (fork() == 0) {	/* child */				close(qsock);				(void) PLWeb_bulk_query(rsock, indexfp, ptime);				(void) fclose(indexfp);				(void) unlink(tfn);				(void) close(rsock);				_exit(0);			}			err = SUCCESS;#else			err = PLWeb_bulk_query(rsock, 
indexfp, ptime);#endif			break;		case UQUERY:			err = PLWeb_user_query(rsock, indexfp);			break;		case QDELETE:			err = PLWeb_del_query(rsock, indexfp);			break;		default:			break;		}		/* Clean up */		(void) fclose(indexfp);		(void) unlink(tfn);		xfree(tfn);	/* PURIFY */	} else if (qflag == QBULK) {		QM_send_bulk_err(rsock);		err = ERROR;	} else {		(void) write(rsock, ERR_MSG, strlen(ERR_MSG));		log(ERR_MSG);		err = ERROR;	}	(void) close(rsock);	/* close so that results are sent */	return err;}/* ----------------------------------------------------------------- * * IND_Init_Flags -- intialize query parser flags  * ----------------------------------------------------------------- */void PLWeb_IND_Init_Flags(){	PLWeb_lifetime = PLWeb_max_lifetime;	/* reset on each query */	PLWeb_maxresults = 0;	/* Max number of hits in the result set */	PLWeb_illegal_query = 0;	/* Is PLWeb capable of this query */}/* ----------------------------------------------------------------- * * IND_Set_Flags -- set query parser flag * ----------------------------------------------------------------- */void PLWeb_IND_Set_Flags(flag, val)     char *flag, *val;{	if (flag == NULL)		return;	if ((strcasecmp(flag, "maxresult") == 0) && val != NULL) {		PLWeb_maxresults = atoi(val);		if (PLWeb_maxresults < 1)			PLWeb_maxresults = 0;	} else if (strcasecmp(flag, "timeout") == 0) {		if (val != NULL)			PLWeb_lifetime = atoi(val);		if (PLWeb_lifetime < 10)			PLWeb_lifetime = 10;	/* at least 10 seconds */		if (PLWeb_lifetime > PLWeb_max_lifetime)			PLWeb_lifetime = PLWeb_max_lifetime;	} else {		Log("WARNING: unknown flag/val %s %s\n", flag ? flag :		    "null", val ? 
val : "null");		PLWeb_illegal_query = 0 /*1 */ ;	}}/* ----------------------------------------------------------------- * * IND_config -- configure indexer specific variables  * ----------------------------------------------------------------- */int PLWeb_IND_config(value, tag)     char *value;     char *tag;{	if (tag == NULL || value == NULL)		return ERROR;	Debug(101, 1, ("PLWeb Configuration: %s %s\n", value, tag));	if (strcasecmp(tag, PLS_ROOT) == 0) {		PLWeb_plsroot = xstrdup(value);	} else if (strcasecmp(tag, PLS_DBGROUP) == 0) {		PLWeb_dbgroup = xstrdup(value);	} else if (strcasecmp(tag, PLS_DBNAME) == 0) {		PLWeb_dbname = xstrdup(value);	} else if (strcasecmp(tag, PLS_NREORG) == 0) {		if (sscanf(value, "%d", &PLWeb_nreorg) != 1)			fatal("sscanf PLWeb_nreorg failed");	} else if (strcasecmp(tag, PLS_MAXLIFE) == 0) {		if (sscanf(value, "%d", &PLWeb_max_lifetime) != 1)			fatal("sscanf PLWeb_max_lifetime failed");	}	return SUCCESS;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -