⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 post_process.c

📁 harvest是一个下载html网页得机器人
💻 C
字号:
static char rcsid[] = "post_process.c,v 1.20 1996/03/26 04:35:14 wessels Exp";/* *  post_process.c - Post Processing routines for the Essence system * *  Duane Wessels, wessels@cs.colorado.edu, May 1995 * *  DEBUG: section  66, level 1         Gatherer essence post-summarizing * *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  
We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. 
* */#include <stdlib.h>#include <unistd.h>#include <fcntl.h>#include <sys/stat.h>#include "util.h"#include "template.h"#include <sys/types.h>#include "post_process.h"Rule *PPRules = NULL;/* * my_write() * * a persistent write() for sockets and pipes.  Don't return until * all bytes have been written, or an error condition. */static int my_write(fd, ptr, nbytes)     register int fd, nbytes;     register char *ptr;{	static int nleft, nwritten;	nleft = nbytes;	while (nleft > 0) {		nwritten = write(fd, ptr, nleft > 8192 ? 8192 : nleft);		if (nwritten <= 0) {			return (nwritten);		}		nleft -= nwritten;		ptr += nwritten;	}	return (nbytes - nleft);}/* * my_read() * * a persistent read() for sockets and pipes.  Don't return until * all bytes have been read, or an error condition. */static int my_read(fd, ptr, nbytes)     register int fd, nbytes;     register char *ptr;{	static int nleft, nread;	nleft = nbytes;	while (nleft > 0) {		nread = read(fd, ptr, nleft);		if (nread < 0)			return (nread);		else if (nread == 0)			break;		nleft -= nread;		ptr += nread;	}	return (nbytes - nleft);}/* *    do_command_io (argv, writebuf, bytesout, bytesin) * *      Writes 'bytesout' of 'writebuf' to a forked processes which *      executes the command in 'argv'.  The command will read from *      stdin and write to stdout, which will be a tmpfile. *      The function returns a malloc'd buffer that contains the *      command output, and sets *bytesin accordingly. 
*/
/*
 * Implementation notes for do_command_io():
 *
 *  - The child gets the read end of a pipe as stdin and a temporary
 *    file as stdout.  The parent feeds 'writebuf' into the pipe, waits
 *    for the child, then reads the temporary file back.
 *  - The returned buffer holds *bytesin bytes of command output plus
 *    one extra terminating NUL byte (not counted in *bytesin), so it
 *    can safely be passed to string functions.
 *  - NULL is returned, and *bytesin is set to 0, when anything fails
 *    or when the command wrote no output.  The caller releases the
 *    buffer with xfree().
 *
 * NOTE(review): tempnam() only chooses a name; the file itself is
 * created later by the child without O_EXCL.  That is the classic
 * temp-file race -- consider mkstemp() if the temp directory may be
 * shared with untrusted users.
 */
static char *do_command_io(argv, writebuf, bytesout, bytesin)
     char **argv;
     char *writebuf;
     int bytesout;
     int *bytesin;
{
	char *tfile = NULL;
	char *inbuf = NULL;
	struct stat sb;
	int p[2];
	int fd = -1;		/* -1: nothing to close during cleanup */
	int pid;
	int status = 0;		/* stays 0 if waitpid() cannot report */
	int n;

	*bytesin = 0;

	Debug(66, 5, ("do_command_io: Running '%s'\n", *argv));

	if ((tfile = tempnam(0, 0)) == (char *) 0)
		goto do_cmd_done;
	if (pipe(p) < 0) {
		log_errno2(__FILE__, __LINE__, "pipe");
		goto do_cmd_done;
	}

	pid = fork();
	if (pid < 0) {
		/*
		 * No child exists: writing into the pipe would have no
		 * reader and waitpid(-1) would wait for an unrelated child.
		 */
		log_errno2(__FILE__, __LINE__, "fork");
		close(p[0]);
		close(p[1]);
		goto do_cmd_done;
	}
	if (pid == 0) {		/* child */
		fd = open(tfile, O_WRONLY | O_TRUNC | O_CREAT, 0660);
		if (fd < 0) {
			log_errno2(__FILE__, __LINE__, tfile);
			_exit(1);
		}
		dup2(fd, 1);	/* stdout -> temporary file */
		close(fd);
		dup2(p[0], 0);	/* stdin <- pipe from parent */
		close(p[0]);
		close(p[1]);
		execvp(*argv, argv);
		log_errno2(__FILE__, __LINE__, *argv);
		_exit(1);
	}

	/* parent: feed the command, then wait for it to finish */
	close(p[0]);
	my_write(p[1], writebuf, bytesout);
	close(p[1]);		/* EOF on the child's stdin */
	waitpid(pid, &status, 0);
	Debug(66, 5, ("do_command_io: '%s' returned %d\n", *argv, status >> 8));

	if (stat(tfile, &sb) < 0) {
		log_errno2(__FILE__, __LINE__, tfile);
		goto do_cmd_done;
	}
	if (sb.st_size <= 0) {
		Debug(66, 1, ("do_command_io: '%s' wrote no data\n", *argv));
		goto do_cmd_done;
	}
	fd = open(tfile, O_RDONLY);
	if (fd < 0) {
		log_errno2(__FILE__, __LINE__, tfile);
		goto do_cmd_done;
	}
	/* one spare byte so the result is always NUL-terminated */
	inbuf = (char *) xmalloc(sb.st_size + 1);
	n = my_read(fd, inbuf, sb.st_size);
	if (n < 0) {
		log_errno2(__FILE__, __LINE__, "read");
		xfree(inbuf);
		inbuf = 0;
		goto do_cmd_done;
	}
	inbuf[n] = '\0';
	*bytesin = n;

      do_cmd_done:
	if (fd >= 0)
		close(fd);
	if (tfile) {
		unlink(tfile);
		xfree(tfile);
	}
	return inbuf;
}

/*
 * check_condition()
 *
 * Check a single condition from the rules.  Return 1 if the condition
 * holds, or 0 if it fails.  Supported conditions are string equals
 * and regular expression matching.
*/
/*
 * Implementation notes for check_condition():
 *
 *  - The attribute name "url" (compared case-insensitively) selects
 *    T->url; any other name is looked up in T->list.  A missing
 *    attribute makes the condition fail.
 *  - EQUALS / NOTEQ compare the two strings case-insensitively.
 *  - REGEX / NOTRE compile the rule value as an extended regular
 *    expression.  If the pattern does not compile the condition fails
 *    (returns 0) for both operators, because a regex_t left over from
 *    a failed regcomp() must not be passed to regexec() or regfree().
 *  - Any incomplete condition (NULL pointers) or unknown operator
 *    yields 0.
 */
static int check_condition(c, T)
     Cond *c;
     Template *T;
{
	char *attr = NULL;
	char *c_val = NULL;
	char *t_val = NULL;
	AVPair *pair = NULL;
	int ret = 0;
	int matched;
	regex_t compiled_pattern;

	if (!c || !c->attr || !c->value)
		goto finish_check_cond;
	if (!c->attr->word || !c->value->word)
		goto finish_check_cond;

	/* The strings are only read here, so no private copies are made. */
	attr = c->attr->word;
	c_val = c->value->word;

	if (!strcasecmp(attr, "url")) {
		t_val = T->url;
	} else {
		pair = extract_AVPair(T->list, attr);
		if (pair == NULL)
			goto finish_check_cond;
		t_val = pair->value;
	}
	if (t_val == NULL)	/* nothing to compare against */
		goto finish_check_cond;

	Debug(66, 5, ("check_condition: attr=%s\n", attr));
	Debug(66, 5, ("check_condition: c_val=%s\n", c_val));
	Debug(66, 5, ("check_condition: t_val=%s\n", t_val));

	switch (c->op) {
	case EQUALS:
		ret = (strcasecmp(t_val, c_val) == 0);
		break;
	case NOTEQ:
		ret = (strcasecmp(t_val, c_val) != 0);
		break;
	case REGEX:
	case NOTRE:
		if (regcomp(&compiled_pattern, c_val, REG_EXTENDED) != 0) {
			Debug(66, 1, ("check_condition: bad regular expression '%s'\n", c_val));
			ret = 0;
			break;
		}
		matched = (regexec(&compiled_pattern, t_val, 0, 0, 0) == 0);
		regfree(&compiled_pattern);
		ret = (c->op == REGEX) ? matched : !matched;
		break;
	default:
		ret = 0;
		break;
	}

      finish_check_cond:
	Debug(66, 1, ("check_condition: returning %d\n", ret));
	return ret;
}

/*
 * check_conditions()
 *
 * Check a group of conditions from the rules.  Return 1 if the conditions
 * hold, or 0 if they do not.  Conditions can be joined with AND, OR.
 * Individual conditions are evaluated in left->right order.  Complex
 * AND/OR groupings are not possible.
*/static int check_conditions(C, T)     Cond *C;     Template *T;{	Cond *c = NULL;	int this_val;	int running_val;	int lastop = -1;	for (c = C; c; c = c->next) {		this_val = check_condition(c, T);		if (lastop != -1)			switch (lastop) {			case AND:				running_val = running_val && this_val;				break;			case OR:				running_val = running_val || this_val;				break;			default:				fprintf(stderr, "Unknown condition op: %d\n", lastop);				break;		} else {			running_val = this_val;		}		lastop = c->nextop;	}	return running_val;}/* * do_assign_inst (T, attrs, args) * * attrs->word is an attribute name * args->word is the attribute value * * Simply add or replace this A/V pair in the Template */static int do_assign_inst(T, attrs, args)     Template *T;     Word *attrs;     Word *args;{	if (!attrs)		return 0;	if (!attrs->word)		return 0;	if (!args)		return 0;	if (!args->word)		return 0;	Debug(66, 5, ("do_assign_inst: %s = %s\n", attrs->word, args->word));	add_AVList(T->list, attrs->word, args->word, strlen(args->word));	return 1;}/* * do_pipe_inst (T, attrs, args) * * attrs->word is an attribute name * args is a list of words that make up a command. * * Open a pipe to the command and write the attribute value.  The command * output replaces the attribute value in the Template. 
*/static int do_pipe_inst(T, attrs, args)     Template *T;     Word *attrs;     Word *args;{	AVPair *pr = NULL;	int argc;	int i, n;	char **argv = NULL;	Word *w = NULL;	int ret = 0;	char *inbuf = NULL;	char *t = NULL;	if (!attrs)		return 0;	if (!attrs->word)		return 0;	if (!args)		return 0;	if (!args->word)		return 0;	Debug(66, 5, ("do_pipe_inst: %s | %s ...\n", attrs->word, args->word));	if (strcasecmp(attrs->word, "url") == 0) {		pr = (AVPair *) xmalloc(sizeof(AVPair));		pr->vsize = strlen(T->url) + 2;		pr->value = xmalloc(pr->vsize);		sprintf(pr->value, "%s\n", T->url);	} else {		pr = extract_AVPair(T->list, attrs->word);	}	if (!pr) {		Debug(66, 5, ("Attribute '%s' not found.\n", attrs->word));		return 0;	}	for (argc = 0, w = args; w; w = w->next)		argc++;	argv = (char **) xmalloc((argc + 1) * sizeof(char *));	for (i = 0, w = args; w; i++, w = w->next)		*(argv + i) = xstrdup(w->word);	*(argv + argc) = NULL;	inbuf = do_command_io(argv, pr->value, pr->vsize, &n);	if (inbuf == (char *) NULL)		goto do_pipe_done;	ret = 1;	if (strcasecmp(attrs->word, "url") == 0) {		if ((t = strchr(inbuf, '\n')))			*t = '\0';		xfree(T->url);		T->url = xstrdup(inbuf);	} else {		add_AVList(T->list, attrs->word, inbuf, n);	}      do_pipe_done:	for (i = 0; i < argc; i++)		xfree(*(argv + i));	xfree(argv);	xfree(inbuf);	if (strcasecmp(attrs->word, "url") == 0)		free_AVPair(pr);	return ret;}/* * do_bang_inst (T, attrs, args) * * attrs is a list of attribute names. * args is a list of words that make up a command. * * Open a pipe to the command and write the SOIF A/V pairs for the * given attributes.  The output of the command is also SOIF A/V pairs * which is incoprorated into the template.  Existing attributes will * be overwritten. 
*/static int do_bang_inst(T, attrs, args)     Template *T;     Word *attrs;     Word *args;{	AVPair *pr = NULL;	int argc;	int i, n;	char **argv = NULL;	Word *w = NULL;	int ret = 0;	char *inbuf = NULL;	Buffer *outb = NULL;	Template *N = NULL;	if (!attrs)		return 0;	if (!attrs->word)		return 0;	if (!args)		return 0;	if (!args->word)		return 0;	Debug(66, 5, ("do_bang_inst: %s ... ! %s ...\n", attrs->word, args->word));	N = create_template(0, T->url);	for (w = attrs; w; w = w->next) {		pr = extract_AVPair(T->list, w->word);		if (!pr)			continue;		if (!N->list) {			N->list = create_AVList(pr->attribute, pr->value, pr->vsize);		} else {			add_AVList(N->list, pr->attribute, pr->value, pr->vsize);		}	}	outb = init_print_template(0);	print_template(N);	free_template(N);	for (argc = 0, w = args; w; w = w->next)		argc++;	argv = (char **) xmalloc((argc + 1) * sizeof(char *));	for (i = 0, w = args; w; i++, w = w->next)		*(argv + i) = xstrdup(w->word);	*(argv + argc) = NULL;	Debug(66, 1, ("Writing this data (%d bytes) to %s:%s\n",		outb->length, *argv, outb->data));	inbuf = do_command_io(argv, outb->data, outb->length, &n);	finish_print_template();	if (!inbuf)		goto do_bang_done;	init_parse_template_string(inbuf, n);	N = parse_template();	finish_parse_template();	if (!N)		goto do_bang_done;	merge_AVList(T->list, N->list);	free_template(N);	ret = 1;      do_bang_done:	for (i = 0; i < argc; i++)		xfree(*(argv + i));	xfree(argv);	xfree(inbuf);	return ret;}/* * do_instructions() * * Run the instructions from a rule */static int do_instructions(I, T)     Inst *I;     Template *T;{	Inst *i = NULL;	int ret = 0;	Debug(66, 1, ("do_instructions: %s\n", T->url));	for (i = I; i; i = i->next) {		Debug(66, 1, ("Doing instruction type %d\n", i->op));		switch (i->op) {		case ASSIGN:			ret = do_assign_inst(T, i->attrs, i->args);			break;		case PIPE:			ret = do_pipe_inst(T, i->attrs, i->args);			break;		case BANG:			ret = do_bang_inst(T, i->attrs, i->args);			break;		case DELETE:			ret = 
SUMMARIZE_DONT_ADD_OBJECT;			break;		default:			ret = 0;			break;		}	}	return ret;}/* * post_process() - Post Process a SOIF template. */int post_process(T)     Template *T;{	Rule *r = NULL;	int ret = 0;	Debug(66, 1, ("post_process:  Starting: %s\n", T->url));	for (r = PPRules; r; r = r->next) {		if (!check_conditions(r->cond, T))			continue;		Debug(66, 2, ("post_process: Munging: %s\n", T->url));		ret = do_instructions(r->inst, T);	}	Debug(66, 2, ("post_process: Returning: %d\n", ret));	return ret;}extern int yyparse();extern FILE *yyin;/* * pp_parse_rules () * * opens and parses a file of post-processing rules. * * return 1 on failure, 0 on success */int pp_parse_rules(filename)     char *filename;{	FILE *fp = NULL;	int ret;	Log("reading post-processing rules from %s\n", filename);	fp = fopen(filename, "r");	if (!fp) {		log_errno2(__FILE__, __LINE__, filename);		return 1;	}	yyin = fp;	ret = yyparse();	fclose(fp);	yyin = (FILE *) NULL;	Debug(66, 1, ("returning %d from yyparse\n", ret));	return ret;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -