⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 summarize.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
static char rcsid[] = "summarize.c,v 1.87 1996/01/05 20:28:59 duane Exp";/* *  summarize.c - Summarizing for the Essence system. * *  DEBUG: section  64, level 1         Gatherer essence object summarizing * *  Darren Hardy, hardy@cs.colorado.edu, February 1994 * *  ---------------------------------------------------------------------- *  Copyright (c) 1994, 1995.  All rights reserved. * *    The Harvest software was developed by the Internet Research Task *    Force Research Group on Resource Discovery (IRTF-RD): * *          Mic Bowman of Transarc Corporation. *          Peter Danzig of the University of Southern California. *          Darren R. Hardy of the University of Colorado at Boulder. *          Udi Manber of the University of Arizona. *          Michael F. Schwartz of the University of Colorado at Boulder. *          Duane Wessels of the University of Colorado at Boulder. * *    This copyright notice applies to software in the Harvest *    ``src/'' directory only.  Users should consult the individual *    copyright notices in the ``components/'' subdirectories for *    copyright information about other software bundled with the *    Harvest source code distribution. * *  TERMS OF USE * *    The Harvest software may be used and re-distributed without *    charge, provided that the software origin and research team are *    cited in any use of the system.  Most commonly this is *    accomplished by including a link to the Harvest Home Page *    (http://harvest.cs.colorado.edu/) from the query page of any *    Broker you deploy, as well as in the query result pages.  These *    links are generated automatically by the standard Broker *    software distribution. * *    The Harvest software is provided ``as is'', without express or *    implied warranty, and with no support nor obligation to assist *    in its use, correction, modification or enhancement.  
We assume *    no liability with respect to the infringement of copyrights, *    trade secrets, or any patents, and are not responsible for *    consequential damages.  Proper use of the Harvest software is *    entirely the responsibility of the user. * *  DERIVATIVE WORKS * *    Users may make derivative works from the Harvest software, subject *    to the following constraints: * *      - You must include the above copyright notice and these *        accompanying paragraphs in all forms of derivative works, *        and any documentation and other materials related to such *        distribution and use acknowledge that the software was *        developed at the above institutions. * *      - You must notify IRTF-RD regarding your distribution of *        the derivative work. * *      - You must clearly notify users that your are distributing *        a modified version and not the original Harvest software. * *      - Any derivative product is also subject to these copyright *        and use restrictions. * *    Note that the Harvest software is NOT in the public domain.  We *    retain copyright, as specified above. * *  HISTORY OF FREE SOFTWARE STATUS * *    Originally we required sites to license the software in cases *    where they were going to build commercial products/services *    around Harvest.  In June 1995 we changed this policy.  We now *    allow people to use the core Harvest software (the code found in *    the Harvest ``src/'' directory) for free.  We made this change *    in the interest of encouraging the widest possible deployment of *    the technology.  The Harvest software is really a reference *    implementation of a set of protocols and formats, some of which *    we intend to standardize.  We encourage commercial *    re-implementations of code complying to this set of standards. 
* */#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <string.h>#include <memory.h>#include <ctype.h>#include <sys/types.h>#include <sys/param.h>#include <time.h>#include "util.h"#include "essence.h"#include "post_process.h"/* Local Functions */static int summarize_file();static int read_structured_summary();static void grab_fulltext();static void mkkeywords();static void mkdescription();static void mkgid();#ifdef USE_QUICKSUMstatic int can_quicksum();static void init_quicksum();static void generate_quicksum();static void finish_quicksum();#endif/* *  summarize() - Summarizes an object and adds the generated template *  to the storage manager.  Returns 0 on success; non-zero otherwise. */int summarize(object)     DataObject *object;{	Debug(64, 1, ("summarize(%s, %s)\n", object->url->url, object->type));#ifdef NO_UNIX_RECURSE	if (!strcmp(object->type, "Directory")) {		Log("Skipping %s (%s)\n", object->url->url, object->type);		return (0);	/* skip directories */	}#endif	switch (object->url->type) {	case URL_FILE:		/* Supported Types */	case URL_FTP:	case URL_GOPHER:	case URL_NEWS:	case URL_HTTP:	case URL_NOP:		return (summarize_file(object));	default:		errorlog("Internal summarize() error.  Unsupported type.\n");	}	return (1);}/* *  init_summarize() - Initializes the Summarize step. */void init_summarize(){#ifdef USE_QUICKSUM	init_quicksum();#endif}/* *  finish_summarize() - Cleans up after the Summarize step */void finish_summarize(){#ifdef USE_QUICKSUM	finish_quicksum();#endif}/* *  summarize_file() - Summarizes a file and adds the generated template *  to the storage manager.  Returns 0 on success; non-zero otherwise. 
*/static int summarize_file(obj)     DataObject *obj;{	Template *template = NULL;	FILE *ifp = NULL;	struct OID *oid = NULL;	char buf[BUFSIZ], *s, *q;	int pipefds[2], pid = 0, err;	int localobj = 0;	int pp_code = 0;	/*	 *  We don't really need the object to do a full summary, so	 *  set a flag to say if we got it or not to make this section	 *  more clear.	 */	localobj = !object_retrieve(obj);	/*	 *  Check to see if this object is a nested object.	 *  If so, change the URL of the template to the URL	 *  of the parent object, and include an Attribute for	 *  the name of the nested file (using only the relative pathname)	 */	if ((obj->flags & F_NESTED) && obj->parent_url) {		Debug(64, 1, ("Creating Nested object for %s\n", obj->parent_url));		template = create_template(obj->ttype, obj->parent_url);		s = strstr(obj->url->url, tmpdir);		s = (s != NULL) ? s + strlen(tmpdir) + 1 : obj->url->url;		q = strchr(s, '/');		q = (q == NULL) ? s : q + 1;		template->list = create_AVList(T_NESTED, q, strlen(q));	} else {		oid = generate_oid(obj->url->url, gatherer_id, obj);		template = create_template_with_oid(obj->ttype,		    obj->url->url, oid);	}	/* Add some other known Attributes */	add_AVList(template->list, T_FILETYPE, obj->type, strlen(obj->type));	/* We can't look at the object so finish up */	if (obj->flags & F_NO_ACCESS || obj->flags & F_MANUAL || !localobj) {		goto finish_summarizing;	}	/* below here localobj == non-zero; add a few more attributes... */	sprintf(buf, "%u", (unsigned int) obj->s->st_size);	add_AVList(template->list, T_FILESIZE, buf, strlen(buf));#ifdef USE_T_URI	add_AVList(template->list, T_URI, obj->url->url, strlen(obj->url->url));#endif#ifdef USE_MD5	{		/* If the file is local, we have its MD5 value */		if (obj->url->md5) {			add_AVList(template->list, T_MD5, obj->url->md5,			    strlen(obj->url->md5));		}	}#endif	/* If we don't know its type then we can do no more */	if (!strcmp(obj->type, "Unknown")) {		goto finish_summarizing;	/* goto? aack! 
oh well */	}	/* For full-text indexing try to grab all the data and finish */	if (do_fulltext) {		grab_fulltext(template, obj);		goto finish_summarizing;	}	/*	 *  Summarize the Object	 *	 *  Check to see if we want to summarize.	 *  Check to see if we can access its contents.	 *  Check to see if we're to summarize the contents or to	 *  simply use the full-text of the file.  Then, check to see	 *  if we can use the fast, internal summarizer (quicksum())	 *  (or the semi-fast, external summarizer) that uses regular	 *  expressions to define the values for the attributes.	 *  If all else fails, run the standard, external summarizer.	 */	/*	 *  The quicksum mechanism lets some easy types be summarized	 *  very quickly using regular expressions; we save a fork()	 *  and it's easier to specify how to summarize the object.	 *  See quicksum.cf for the regular expression syntax.	 *	 *  If we have POSIX regular expressions, then we can do	 *  all of the quick summing in a procedure; otherwise, we	 *  need to call the hacked perl script to do it for us...	 */#ifdef USE_QUICKSUM	if (can_quicksum(obj->type)) {		generate_quicksum(template, obj);		goto finish_summarizing;	}#else	buf[0] = '\0';	sprintf(buf, "quick-sum \"%s\" \"%s\" < /dev/null",	    quicksum_file, obj->type);	if (do_system(buf) == 0) {	/* Make sure this works */		buf[0] = '\0';		sprintf(buf, "quick-sum \"%s\" \"%s\" \"%s\"",		    quicksum_file, obj->type, obj->url->filename);	}#endif	else {		buf[0] = '\0';	/* in case sprintf fails */		sprintf(buf, "%s.sum \"%s\"", obj->type, obj->url->filename);	}	Debug(64, 1, ("Summarizer: RUNNING: %s\n", buf));	/*	 *  Run the external summarizer.  We could use popen(3), but it	 *  exec's a shell to process the command line.  We build our	 *  own pipeline and fork/exec to save this extra process.	 */	if (pipe(pipefds) < 0) {		log_errno("pipe");		goto finish_summarizing;	}	/*	 *  We can't use vfork() here, because otherwise parse_argv	 *  introduces a memory leak.	 
*/	if ((pid = fork()) < 0) {		log_errno("fork");		goto finish_summarizing;	}	if (pid == 0) {		/* child: summarizer */		char *argv[64], *urlbuf;		close(pipefds[0]);	/* child wont read from pipe */		dup2(pipefds[1], 1);	/* stdout -> write:pipe */		close(pipefds[1]);	/* close pipe, its now stdout */		/* parse_argv may barf, so initialize */		memset(argv, '\0', sizeof(argv));		parse_argv(argv, buf);		/* add an environment variable for the child */		urlbuf = xmalloc(strlen(obj->url->url) + 32);		urlbuf[0] = '\0';		sprintf(urlbuf, "SUMMARIZER_URL=%s", obj->url->url);		if (putenv(urlbuf) < 0) {			log_errno("putenv");		}		execvp(argv[0], argv);		sprintf(buf, "execvp: %s", argv[0]);		log_errno(buf);		_exit(1);	}	/* parent */	close(pipefds[1]);	if ((ifp = fdopen(pipefds[0], "r")) == NULL) {		errorlog("summarize: Running external summarizer: ");		log_errno(buf);		close(pipefds[0]);		goto finish_summarizing;	}	if (!read_structured_summary(ifp, template)) {		errorlog("Invalid output from %s.sum (url=%s)\n",		    obj->type, obj->url->url);	}	/* Write the Template to the Database */      finish_summarizing:	Debug(64, 1, ("Finish building summary for %s\n", obj->url->url));	if (obj->avl) {		/* add "hardcoded" AVPairs to Template */		merge_AVList(template->list, obj->avl);		Debug(64, 1, ("Merging AVList for obj: %s\n", obj->url->url));	}	if (do_keywords) {		mkkeywords(template);	}	mkdescription(template);	/* only do description for main tmpl */	mkgid(template);	pp_code = post_process(template);	if (pp_code == SUMMARIZE_DONT_ADD_OBJECT) {		Debug(64, 1, ("NOT adding %s to the database\n", obj->url->url));	} else {		add_template(template, obj);	}	/* clean up */	free_template(template);	if (oid)		free_oid(oid);	if (ifp) {		/* some people came from early on so check */		fclose(ifp);		close(pipefds[0]);		/* explicitly wait for the summarizer to exit */		err = waitpid(pid, (int *) NULL, (int) NULL);		if (err != pid) {			Debug(64, 1, ("WARNING: waiting for child %d got %d...\n",				pid, 
err));		}	}	return (0);}/* *  summarize_nested_object() - Summarizes a nested object and adds *  the generated template to the storage manager.  Returns 0 on *  success; non-zero otherwise. */int summarize_nested_object(object)     DataObject *object;{	Template *template = NULL;	struct OID *oid = NULL;	char buf[BUFSIZ], *s, *q;	int localobject = 0;	int pp_code = 0;	localobject = !object_retrieve(object);		/* Force retrieval of object */	/*	 *  Check to see if this object is a nested object.	 *  If so, change the URL of the template to the URL	 *  of the parent object, and include an Attribute for	 *  the name of the nested file (using only the last	 *  component of the relative pathname)	 */	if ((object->flags & F_NESTED) && object->parent_url) {		Debug(64, 1, ("Creating Nested object for %s\n", object->parent_url));		template = create_template(object->ttype, object->parent_url);		s = strstr(object->url->url, tmpdir);		s = (s != NULL) ? s + strlen(tmpdir) + 1 : object->url->url;		q = strchr(s, '/');		q = (q == NULL) ? 
s : q + 1;		template->list = create_AVList(T_NESTED, q, strlen(q));	} else {		Debug(64, 1, ("Creating Nested object for %s\n", object->url->url));		oid = generate_oid(object->url->url, gatherer_id, object);		template = create_template_with_oid(object->ttype,		    object->url->url, oid);	}	/* Add some other known Attributes */	add_AVList(template->list, T_FILETYPE, object->type, strlen(object->type));	if (localobject) {		sprintf(buf, "%u", (unsigned int) object->s->st_size);		add_AVList(template->list, T_FILESIZE, buf, strlen(buf));	}#ifdef USE_MD5	{		/* The file is local, so generate its MD5 value */		if (localobject && object->url->md5) {			add_AVList(template->list, T_MD5, object->url->md5, strlen(object->url->md5));		}	}#endif	/* We don't need to do any summarizing, a stubby template is enough */	/* Write the Template to the Database */	if (do_keywords) {		mkkeywords(template);	}	mkgid(template);	pp_code = post_process(template);	if (pp_code == SUMMARIZE_DONT_ADD_OBJECT) {		Debug(64, 1, ("NOT adding %s to the database\n",			object->url->url));	} else {		add_template(template, object);	}	/* clean up */	free_template(template);	if (oid)		free_oid(oid);	return (0);}/* Local functions */#define skip_whitespace()	\	while (1) { \		c = fgetc(fp); \		if (c == EOF) return 1; \		if (c == '}') return 1; \		if (!isspace(c)) { ungetc(c, fp); break; } \	}#define grab_attribute() \	p = buf; \	while (1) { \		c = fgetc(fp); \		if (c == EOF) return 1; \		if (c == '\n') return 0; \		if (c == '{') break; \		if (c == '}') break; \		*p++ = c; \		if (p == &buf[BUFSIZ-1]) return 1; \	} \	*p = '\0';#define grab_ttype() \	do { \		int i; \		if ((c = getc(fp)) == '@') { \			memset(buf,'\0',BUFSIZ); \			i = 0; \			for (c = getc(fp); (!isspace(c) && (c != '{')); c = getc(fp)) { \				if (i >= BUFSIZ -1) return(1); \				buf[i++] = c; \			} \			xfree (template->template_type);  \			template->template_type = xstrdup(buf);  \			while (isspace(c) || (c == '{')) c = getc(fp); \

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -