📄 summarize.c
字号:
static char rcsid[] = "summarize.c,v 1.87 1996/01/05 20:28:59 duane Exp";/* * summarize.c - Summarizing for the Essence system. * * DEBUG: section 64, level 1 Gatherer essence object summarizing * * Darren Hardy, hardy@cs.colorado.edu, February 1994 * * ---------------------------------------------------------------------- * Copyright (c) 1994, 1995. All rights reserved. * * The Harvest software was developed by the Internet Research Task * Force Research Group on Resource Discovery (IRTF-RD): * * Mic Bowman of Transarc Corporation. * Peter Danzig of the University of Southern California. * Darren R. Hardy of the University of Colorado at Boulder. * Udi Manber of the University of Arizona. * Michael F. Schwartz of the University of Colorado at Boulder. * Duane Wessels of the University of Colorado at Boulder. * * This copyright notice applies to software in the Harvest * ``src/'' directory only. Users should consult the individual * copyright notices in the ``components/'' subdirectories for * copyright information about other software bundled with the * Harvest source code distribution. * * TERMS OF USE * * The Harvest software may be used and re-distributed without * charge, provided that the software origin and research team are * cited in any use of the system. Most commonly this is * accomplished by including a link to the Harvest Home Page * (http://harvest.cs.colorado.edu/) from the query page of any * Broker you deploy, as well as in the query result pages. These * links are generated automatically by the standard Broker * software distribution. * * The Harvest software is provided ``as is'', without express or * implied warranty, and with no support nor obligation to assist * in its use, correction, modification or enhancement. We assume * no liability with respect to the infringement of copyrights, * trade secrets, or any patents, and are not responsible for * consequential damages. 
Proper use of the Harvest software is * entirely the responsibility of the user. * * DERIVATIVE WORKS * * Users may make derivative works from the Harvest software, subject * to the following constraints: * * - You must include the above copyright notice and these * accompanying paragraphs in all forms of derivative works, * and any documentation and other materials related to such * distribution and use acknowledge that the software was * developed at the above institutions. * * - You must notify IRTF-RD regarding your distribution of * the derivative work. * * - You must clearly notify users that your are distributing * a modified version and not the original Harvest software. * * - Any derivative product is also subject to these copyright * and use restrictions. * * Note that the Harvest software is NOT in the public domain. We * retain copyright, as specified above. * * HISTORY OF FREE SOFTWARE STATUS * * Originally we required sites to license the software in cases * where they were going to build commercial products/services * around Harvest. In June 1995 we changed this policy. We now * allow people to use the core Harvest software (the code found in * the Harvest ``src/'' directory) for free. We made this change * in the interest of encouraging the widest possible deployment of * the technology. The Harvest software is really a reference * implementation of a set of protocols and formats, some of which * we intend to standardize. We encourage commercial * re-implementations of code complying to this set of standards. 
* */#include <stdio.h>#include <stdlib.h>#include <unistd.h>#include <string.h>#include <memory.h>#include <ctype.h>#include <sys/types.h>#include <sys/param.h>#include <time.h>#include "util.h"#include "essence.h"#include "post_process.h"/* Local Functions */static int summarize_file();static int read_structured_summary();static void grab_fulltext();static void mkkeywords();static void mkdescription();static void mkgid();#ifdef USE_QUICKSUMstatic int can_quicksum();static void init_quicksum();static void generate_quicksum();static void finish_quicksum();#endif/* * summarize() - Summarizes an object and adds the generated template * to the storage manager. Returns 0 on success; non-zero otherwise. */int summarize(object) DataObject *object;{ Debug(64, 1, ("summarize(%s, %s)\n", object->url->url, object->type));#ifdef NO_UNIX_RECURSE if (!strcmp(object->type, "Directory")) { Log("Skipping %s (%s)\n", object->url->url, object->type); return (0); /* skip directories */ }#endif switch (object->url->type) { case URL_FILE: /* Supported Types */ case URL_FTP: case URL_GOPHER: case URL_NEWS: case URL_HTTP: case URL_NOP: return (summarize_file(object)); default: errorlog("Internal summarize() error. Unsupported type.\n"); } return (1);}/* * init_summarize() - Initializes the Summarize step. */void init_summarize(){#ifdef USE_QUICKSUM init_quicksum();#endif}/* * finish_summarize() - Cleans up after the Summarize step */void finish_summarize(){#ifdef USE_QUICKSUM finish_quicksum();#endif}/* * summarize_file() - Summarizes a file and adds the generated template * to the storage manager. Returns 0 on success; non-zero otherwise. */static int summarize_file(obj) DataObject *obj;{ Template *template = NULL; FILE *ifp = NULL; struct OID *oid = NULL; char buf[BUFSIZ], *s, *q; int pipefds[2], pid = 0, err; int localobj = 0; int pp_code = 0; /* * We don't really need the object to do a full summary, so * set a flag to say if we got it or not to make this section * more clear. 
*/ localobj = !object_retrieve(obj); /* * Check to see if this object is a nested object. * If so, change the URL of the template to the URL * of the parent object, and include an Attribute for * the name of the nested file (using only the relative pathname) */ if ((obj->flags & F_NESTED) && obj->parent_url) { Debug(64, 1, ("Creating Nested object for %s\n", obj->parent_url)); template = create_template(obj->ttype, obj->parent_url); s = strstr(obj->url->url, tmpdir); s = (s != NULL) ? s + strlen(tmpdir) + 1 : obj->url->url; q = strchr(s, '/'); q = (q == NULL) ? s : q + 1; template->list = create_AVList(T_NESTED, q, strlen(q)); } else { oid = generate_oid(obj->url->url, gatherer_id, obj); template = create_template_with_oid(obj->ttype, obj->url->url, oid); } /* Add some other known Attributes */ add_AVList(template->list, T_FILETYPE, obj->type, strlen(obj->type)); /* We can't look at the object so finish up */ if (obj->flags & F_NO_ACCESS || obj->flags & F_MANUAL || !localobj) { goto finish_summarizing; } /* below here localobj == non-zero; add a few more attributes... */ sprintf(buf, "%u", (unsigned int) obj->s->st_size); add_AVList(template->list, T_FILESIZE, buf, strlen(buf));#ifdef USE_T_URI add_AVList(template->list, T_URI, obj->url->url, strlen(obj->url->url));#endif#ifdef USE_MD5 { /* If the file is local, we have its MD5 value */ if (obj->url->md5) { add_AVList(template->list, T_MD5, obj->url->md5, strlen(obj->url->md5)); } }#endif /* If we don't know its type then we can do no more */ if (!strcmp(obj->type, "Unknown")) { goto finish_summarizing; /* goto? aack! oh well */ } /* For full-text indexing try to grab all the data and finish */ if (do_fulltext) { grab_fulltext(template, obj); goto finish_summarizing; } /* * Summarize the Object * * Check to see if we want to summarize. * Check to see if we can access its contents. * Check to see if we're to summarize the contents or to * simply use the full-text of the file. 
Then, check to see * if we can use the fast, internal summarizer (quicksum()) * (or the semi-fast, external summarizer) that uses regular * expressions to define the values for the attributes. * If all else fails, run the standard, external summarizer. */ /* * The quicksum mechanism lets some easy types be summarized * very quickly using regular expressions; we save a fork() * and it's easier to specify how to summarize the object. * See quicksum.cf for the regular expression syntax. * * If we have POSIX regular expressions, then we can do * all of the quick summing in a procedure; otherwise, we * need to call the hacked perl script to do it for us... */#ifdef USE_QUICKSUM if (can_quicksum(obj->type)) { generate_quicksum(template, obj); goto finish_summarizing; }#else buf[0] = '\0'; sprintf(buf, "quick-sum \"%s\" \"%s\" < /dev/null", quicksum_file, obj->type); if (do_system(buf) == 0) { /* Make sure this works */ buf[0] = '\0'; sprintf(buf, "quick-sum \"%s\" \"%s\" \"%s\"", quicksum_file, obj->type, obj->url->filename); }#endif else { buf[0] = '\0'; /* in case sprintf fails */ sprintf(buf, "%s.sum \"%s\"", obj->type, obj->url->filename); } Debug(64, 1, ("Summarizer: RUNNING: %s\n", buf)); /* * Run the external summarizer. We could use popen(3), but it * exec's a shell to process the command line. We build our * own pipeline and fork/exec to save this extra process. */ if (pipe(pipefds) < 0) { log_errno("pipe"); goto finish_summarizing; } /* * We can't use vfork() here, because otherwise parse_argv * introduces a memory leak. 
*/ if ((pid = fork()) < 0) { log_errno("fork"); goto finish_summarizing; } if (pid == 0) { /* child: summarizer */ char *argv[64], *urlbuf; close(pipefds[0]); /* child wont read from pipe */ dup2(pipefds[1], 1); /* stdout -> write:pipe */ close(pipefds[1]); /* close pipe, its now stdout */ /* parse_argv may barf, so initialize */ memset(argv, '\0', sizeof(argv)); parse_argv(argv, buf); /* add an environment variable for the child */ urlbuf = xmalloc(strlen(obj->url->url) + 32); urlbuf[0] = '\0'; sprintf(urlbuf, "SUMMARIZER_URL=%s", obj->url->url); if (putenv(urlbuf) < 0) { log_errno("putenv"); } execvp(argv[0], argv); sprintf(buf, "execvp: %s", argv[0]); log_errno(buf); _exit(1); } /* parent */ close(pipefds[1]); if ((ifp = fdopen(pipefds[0], "r")) == NULL) { errorlog("summarize: Running external summarizer: "); log_errno(buf); close(pipefds[0]); goto finish_summarizing; } if (!read_structured_summary(ifp, template)) { errorlog("Invalid output from %s.sum (url=%s)\n", obj->type, obj->url->url); } /* Write the Template to the Database */ finish_summarizing: Debug(64, 1, ("Finish building summary for %s\n", obj->url->url)); if (obj->avl) { /* add "hardcoded" AVPairs to Template */ merge_AVList(template->list, obj->avl); Debug(64, 1, ("Merging AVList for obj: %s\n", obj->url->url)); } if (do_keywords) { mkkeywords(template); } mkdescription(template); /* only do description for main tmpl */ mkgid(template); pp_code = post_process(template); if (pp_code == SUMMARIZE_DONT_ADD_OBJECT) { Debug(64, 1, ("NOT adding %s to the database\n", obj->url->url)); } else { add_template(template, obj); } /* clean up */ free_template(template); if (oid) free_oid(oid); if (ifp) { /* some people came from early on so check */ fclose(ifp); close(pipefds[0]); /* explicitly wait for the summarizer to exit */ err = waitpid(pid, (int *) NULL, (int) NULL); if (err != pid) { Debug(64, 1, ("WARNING: waiting for child %d got %d...\n", pid, err)); } } return (0);}/* * summarize_nested_object() - 
Summarizes a nested object and adds
 *  the generated template to the storage manager.  Returns 0 on
 *  success; non-zero otherwise.
 *
 *  No summarizer is run for these objects: the template only carries
 *  the file type, plus the file size (and the MD5 value when USE_MD5
 *  is defined and one is available) if the object could be retrieved.
 *  As written, the function returns 0 on every path.
 */
int summarize_nested_object(object)
     DataObject *object;
{
    Template *tmpl = NULL;
    struct OID *obj_oid = NULL;
    char sizebuf[BUFSIZ];
    char *rel;
    char *leaf;
    int have_local;
    int verdict;

    /* Force retrieval of object; remember whether it worked */
    have_local = !object_retrieve(object);

    if (!(object->flags & F_NESTED) || !object->parent_url) {
        /* Not nested (or no parent known): template gets its own OID */
        Debug(64, 1, ("Creating Nested object for %s\n", object->url->url));
        obj_oid = generate_oid(object->url->url, gatherer_id, object);
        tmpl = create_template_with_oid(object->ttype, object->url->url,
            obj_oid);
    } else {
        /*
         *  Nested object: the template is filed under the parent's
         *  URL and records the nested file's name in a T_NESTED
         *  attribute.
         */
        Debug(64, 1, ("Creating Nested object for %s\n", object->parent_url));
        tmpl = create_template(object->ttype, object->parent_url);

        /* step past "<tmpdir>" and the character after it, if present */
        rel = strstr(object->url->url, tmpdir);
        if (rel != NULL) {
            rel += strlen(tmpdir) + 1;
        } else {
            rel = object->url->url;
        }

        /* then drop everything up to and including the first '/' */
        leaf = strchr(rel, '/');
        if (leaf != NULL) {
            leaf++;
        } else {
            leaf = rel;
        }
        tmpl->list = create_AVList(T_NESTED, leaf, strlen(leaf));
    }

    /* Attributes that are known without running a summarizer */
    add_AVList(tmpl->list, T_FILETYPE, object->type, strlen(object->type));
    if (have_local) {
        sprintf(sizebuf, "%u", (unsigned int) object->s->st_size);
        add_AVList(tmpl->list, T_FILESIZE, sizebuf, strlen(sizebuf));
    }
#ifdef USE_MD5
    /* The MD5 value is only added for a retrieved object that has one */
    if (have_local && object->url->md5) {
        add_AVList(tmpl->list, T_MD5, object->url->md5,
            strlen(object->url->md5));
    }
#endif

    /* A stub template is all we need, so go straight to storing it */
    if (do_keywords) {
        mkkeywords(tmpl);
    }
    mkgid(tmpl);

    verdict = post_process(tmpl);
    if (verdict != SUMMARIZE_DONT_ADD_OBJECT) {
        add_template(tmpl, object);
    } else {
        Debug(64, 1, ("NOT adding %s to the database\n", object->url->url));
    }

    /* release what was allocated above */
    free_template(tmpl);
    if (obj_oid) {
        free_oid(obj_oid);
    }
    return (0);
}

/* Local functions */

/*
 *  skip_whitespace() - reads characters from fp into c until a
 *  non-whitespace character is found, which is pushed back.  Makes the
 *  ENCLOSING function return 1 on EOF or on '}'.
 */
#define skip_whitespace() \
    while (1) { \
        c = fgetc(fp); \
        if (c == EOF) return 1; \
        if (c == '}') return 1; \
        if (!isspace(c)) { ungetc(c, fp); break; } \
    }

/*
 *  grab_attribute() - copies characters from fp into buf (through p)
 *  up to a '{' or '}', then NUL-terminates buf.  Makes the ENCLOSING
 *  function return 1 on EOF or when buf fills up, and 0 on a newline.
 */
#define grab_attribute() \
    p = buf; \
    while (1) { \
        c = fgetc(fp); \
        if (c == EOF) return 1; \
        if (c == '\n') return 0; \
        if (c == '{') break; \
        if (c == '}') break; \
        *p++ = c; \
        if (p == &buf[BUFSIZ-1]) return 1; \
    } \
    *p = '\0';

#define grab_ttype() \
    do { \
        int i; \
        if ((c = getc(fp)) == '@') { \
            memset(buf,'\0',BUFSIZ); \
            i = 0; \
            for (c = getc(fp); (!isspace(c) && (c != '{')); c = getc(fp)) { \
                if (i >= BUFSIZ -1) return(1); \
                buf[i++] = c; \
            } \
            xfree (template->template_type); \
            template->template_type = xstrdup(buf); \
            while (isspace(c) || (c == '{')) c = getc(fp); \
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -