📄 summarize.c
字号:
ungetc(c,fp); \ memset(buf,'\0',BUFSIZ); \ i = 0; \ for (c = getc(fp); !isspace(c); c = getc(fp)) { \ if (i >= BUFSIZ - 1) return(1); \ buf[i++] = c; \ } \ xfree(template->url); \ template->url = xstrdup(buf); \ while (c != '\n') c = getc(fp); \ } else ungetc(c, fp); \ } while(0);/* * read_structured_summary() - Reads the output of a structured * summarizer that outputs its data as attribute value pairs. It * parses the attribute value pairs and adds them to the given template. * * XXX: NOTE that this doesn't work if the summarizer crashes. For * example, if it gets a segmentation fault and prints that to stderr, * then the text to stderr will get caught in the attribute name. Need * to use fork/exec and check error code to make sure that it returns 0 * as the exit code, otherwise throw away the data that was generated. * grab_attribute() dies on '\n' so if there's an error message from * the summarizer that goes to ifp, then the parser dies. */static int read_structured_summary(fp, template) FILE *fp; Template *template;{ static char buf[BUFSIZ]; char *attr = NULL; char *value = NULL; char *p = NULL; int vsize; int c; grab_ttype(); /* @TYPE { is optional */ while (1) { skip_whitespace(); grab_attribute(); attr = strdup(buf); /* Read Attribute */ grab_attribute(); vsize = atoi(buf); /* Read Value Size */ c = fgetc(fp); if (c != ':') { xfree(attr); return 0; /* expecting : */ } c = fgetc(fp); if (c != '\t') { xfree(attr); return 0; /* expecting <TAB> */ } value = xmalloc(vsize + 1); /* Read Value */ if (fread(value, 1, vsize, fp) != vsize) { xfree(attr); xfree(value); return 0; } value[vsize] = '\0'; if (do_cksumdups) add_AVList(template->list, attr, value, vsize); else FAST_add_AVList(template->list, attr, value, vsize); xfree(attr); xfree(value); }}#undef skip_whitespace#undef grab_attribute/* * grab_fulltext() - Adds the contents of an entire file to the template. */static void grab_fulltext(template, object) Template *template; DataObject *object;{ FILE *fp; char *value = NULL; if (!object->url->filename || !object->s) /* Object is not local... */ return; if ((fp = fopen(object->url->filename, "r")) == NULL) { log_errno(object->url->filename); return; } /* We have the file, so slurp the whole thing in at once */ value = xmalloc(object->s->st_size + 1); if (fread(value, 1, object->s->st_size, fp) != object->s->st_size) { log_errno(object->url->filename); xfree(value); return; } fclose(fp); value[object->s->st_size] = '\0'; /* be nice */ add_AVList(template->list, T_FULLTEXT, value, object->s->st_size); xfree(value); /* don't need this memory anymore */}#ifdef USE_QUICKSUM#define MAX_REGEX 32 /* max number of regular expressions per type *//* * For each type, define all of the regular expressions and their * associated attributes. */struct quicksums { char *type; char *attribute[MAX_REGEX]; char *regex[MAX_REGEX]; regex_t compiled[MAX_REGEX]; /* compiled version of regex */};struct quicksums *qs[MAX_TYPES];/* * When summarizing data, allocate a structure for each attribute that * lets us control the value buffer. */struct avbuf { char *attribute; Buffer *b;};struct avbuf vbuf[MAX_REGEX]; /* value buffers */#if defined(USE_POSIX_REGEX)#define do_match(s, c) (regexec((c), (s), 0, 0, 0) == 0)#else#error "unsupported"#endifstatic void init_quicksum(){ FILE *fp; char buf[BUFSIZ], *type, *attr, *regex, *p, *s; int i, j, done; memset(qs, '\0', MAX_TYPES * sizeof(struct quicksums *)); if ((fp = fopen(quicksum_file, "r")) == NULL) { log_errno(quicksum_file); return; } while (fgets(buf, BUFSIZ, fp)) { if (buf[0] == '#') continue; /* skip comments */ type = p = buf; while (*++p != '\t'); /* skip type name */ *p++ = '\0'; while (isspace(*p)) p++; /* skip whitespace */ attr = p; while (!isspace(*p)) p++; /* skip whitespace */ *p++ = '\0'; while (isspace(*p)) p++; /* skip whitespace */ regex = p; if ((s = strrchr(regex, '\n')) != NULL) *s = '\0'; /* remove newline */ for (i = done = 0; qs[i] != NULL; i++) { if (!strcmp(qs[i]->type, type)) { /* add to type */ /* find attribute's place */ for (j = 0; qs[i]->attribute[j] != NULL; j++); qs[i]->attribute[j] = strdup(attr); qs[i]->regex[j] = strdup(regex);#if defined(USE_POSIX_REGEX) if (regcomp(&qs[i]->compiled[j], qs[i]->regex[j], USE_RE_SYNTAX)) { fatal("init_quicksum: FILE: %s TYPE: %s COMPILE_ERROR: %s\n", quicksum_file, buf, qs[i]->regex[j]); }#endif if (j + 1 < MAX_REGEX) { qs[i]->attribute[j + 1] = NULL; qs[i]->regex[j + 1] = NULL; } else errorlog("WARNING! INCREASE MAX_REGEX"); done = 1; } } if (!done) { /* new type */ qs[i] = xmalloc(sizeof(struct quicksums)); qs[i]->type = strdup(type); qs[i]->attribute[0] = strdup(attr); qs[i]->regex[0] = strdup(regex);#if defined(USE_POSIX_REGEX) if (regcomp(&qs[i]->compiled[0], qs[i]->regex[0], USE_RE_SYNTAX)) { fatal("init_quicksum: FILE: %s TYPE: %s COMPILE_ERROR: %s\n", quicksum_file, buf, qs[i]->regex[0]); }#endif qs[i]->attribute[1] = NULL; qs[i]->regex[1] = NULL; } } fclose(fp); /* Reset the buffers, then allocate the buffers */ for (i = 0; i < MAX_REGEX; i++) { vbuf[i].attribute = NULL; vbuf[i].b = create_buffer(BUFSIZ); } if (debug_ok(64, 1)) { for (i = 0; qs[i] != NULL; i++) { Log("Type: %s\n", qs[i]->type); for (j = 0; qs[i]->attribute[j]; j++) Log("Attribute: %s --> RE: %s\n", qs[i]->attribute[j], qs[i]->regex[j]); } }}static void finish_quicksum(){ int i, j; for (i = 0; qs[i] != NULL; i++) { if (qs[i]->type) xfree(qs[i]->type);#if defined(USE_POSIX_REGEX) for (j = 0; qs[i]->attribute[j] != NULL; j++) regfree(&qs[i]->compiled[j]);#endif for (j = 0; qs[i]->attribute[j] != NULL; j++) xfree(qs[i]->attribute[j]); for (j = 0; qs[i]->regex[j] != NULL; j++) xfree(qs[i]->regex[j]); xfree(qs[i]); } memset(qs, '\0', MAX_TYPES * sizeof(struct quicksums *)); for (i = 0; i < MAX_REGEX; i++) { if (vbuf[i].attribute) { xfree(vbuf[i].attribute); vbuf[i].attribute = NULL; } if (vbuf[i].b) { free_buffer(vbuf[i].b); vbuf[i].b = NULL; } }}/* * generate_quicksum() - Quickly summarizes object and addes attributes to * template. */static void generate_quicksum(template, object) Template *template; DataObject *object;{ char buf[BUFSIZ]; FILE *fp; int i, j, curqs, found; Debug(64, 1, ("generate_quicksum(%s, %s)\n", object->type, object->url->url)); /* Can we quicksum the object? If so, find the object's type */ for (i = 0; qs[i] != NULL; i++) { if (!strcmp(qs[i]->type, object->type)) break; } if (qs[i] == NULL) return; curqs = i; /* Try opening the file to summarize */ if ((fp = fopen(object->url->filename, "r")) == NULL) { log_errno(object->url->filename); return; } /* Reset the buffers */ for (i = 0; i < MAX_REGEX; i++) { vbuf[i].attribute = NULL; } for (i = 0; qs[curqs]->attribute[i] != NULL; i++) { /* check to see if attribute is in value buffer */ for (found = j = 0; vbuf[j].attribute; j++) { if (!strcmp(vbuf[j].attribute, qs[curqs]->attribute[i])) { found = 1; break; } } if (!found) { for (j = 0; vbuf[j].attribute != NULL; j++); /* Find first spot */ vbuf[j].attribute = strdup(qs[curqs]->attribute[i]); } } /* Now summarize the file and write saved lines to vbuf */ while (fgets(buf, BUFSIZ, fp)) { for (j = 0; qs[curqs]->attribute[j] != NULL; j++) { if (do_match(buf, &qs[curqs]->compiled[j])) { for (found = i = 0; vbuf[i].attribute; i++) { if (!strcmp(qs[curqs]->attribute[j], vbuf[i].attribute)) { found = 1; break; /* Find vbuf to use */ } } if (found) { add_buffer(vbuf[i].b, buf, strlen(buf)); } } } } fclose(fp); /* Add values to template */ for (i = 0; vbuf[i].attribute; i++) { if (vbuf[i].b->length > 0) { add_AVList(template->list, vbuf[i].attribute, vbuf[i].b->data, vbuf[i].b->length); } } /* Clean up */ for (i = 0; i < MAX_REGEX; i++) { if (vbuf[i].attribute) { xfree(vbuf[i].attribute); vbuf[i].attribute = NULL; } shrink_buffer(vbuf[i].b); }}/* * can_quicksum() - Returns non-zero if generate_quicksum() can process type; * returns 0 otherwise; */static int can_quicksum(type) char *type;{ int i; for (i = 0; qs[i] != NULL; i++) if (!strcmp(qs[i]->type, type)) return (1); return (0);}#endif /* USE_QUICKSUM *//* * mkdescription() - Generates a Description for the Template. */static void mkdescription(t) Template *t;{ AVPair *avp; int i, j, gotdata, n; char *s; if (t == NULL || t->list == NULL) return; /* See if the Summarizer already generated one */ if (extract_AVPair(t->list, T_DESCRIPTION) != NULL) return; /* Try to build a Description attribute based on other fields */ /* These heuristics only apply if the Summarizer failed to make one */ avp = extract_AVPair(t->list, T_ABSTRACT); if (avp == NULL) avp = extract_AVPair(t->list, T_PARTTEXT); if (avp == NULL) avp = extract_AVPair(t->list, "body"); /* By SGML summarizer */ if (avp == NULL) avp = extract_AVPair(t->list, T_TITLE); /* Last resort */ /* Cannot find any data to use, or not enough */ if (avp == NULL || avp->vsize < 2) return; /* Extract a reasonable chunk of the data */ if ( (s = xmalloc(BUFSIZ / 4 + 1)) == NULL) return; for (i = 0, j = 0, gotdata = 0; i < avp->vsize; i++) { if (j >= BUFSIZ / 4) break; if (isspace(avp->value[i])) { /* Squash white space into a single blank / newline */ if (gotdata) s[j++] = (avp->value[i] == '\n' ? '\n' : ' '); gotdata = 0; } else { s[j++] = avp->value[i]; gotdata = 1; } } s[j] = '\0'; if (j > 0) { /* We know for sure that T_DESCRIPTION isn't in t->list */ FAST_add_AVList(t->list, T_DESCRIPTION, s, j); } xfree(s);}/* * mkkeywords() - Generates a keywords list for the Template. */static void mkkeywords(t) Template *t;{ AVPair *avp; char *s; int have_keys = 0; if (t == NULL || t->list == NULL) return; /* * Make canonical Keywords list, using attribute Keywords * or Partial-Text, or Description, Abstract, or Title. */ if ((avp = extract_AVPair(t->list, T_KEYS)) != NULL) have_keys = 1; if (avp == NULL) avp = extract_AVPair(t->list, T_PARTTEXT); if (avp == NULL) avp = extract_AVPair(t->list, "Description"); if (avp == NULL) avp = extract_AVPair(t->list, T_ABSTRACT); if (avp == NULL) avp = extract_AVPair(t->list, T_TITLE); if (avp == NULL) return; /* don't make any modifications */ if ((s = mkwordlist(avp->value, avp->vsize)) == NULL) return; /* don't make any modifications */ if (have_keys) { xfree(avp->value); avp->value = strdup(s); avp->vsize = strlen(s); } else { /* We know for sure that T_KEYS isn't in t->list */ FAST_add_AVList(t->list, T_KEYS, s, strlen(s)); } xfree(s); return;}/* * mkgid() - Verifies that the Template contains the Gatherer * Identification attributes: Gatherer-Name, Gatherer-Host, and * Gatherer-Version. */static void mkgid(t) Template *t;{ AVPair *avp; if (t == NULL || t->list == NULL || gatherer_id == NULL) return; if ((avp = extract_AVPair(t->list, T_GHOST)) == NULL) { add_AVList(t->list, T_GHOST, gatherer_id->host, strlen(gatherer_id->host)); } if ((avp = extract_AVPair(t->list, T_GVERSION)) == NULL) { add_AVList(t->list, T_GVERSION, gatherer_id->version, strlen(gatherer_id->version)); } if ((avp = extract_AVPair(t->list, T_GNAME)) == NULL) { add_AVList(t->list, T_GNAME, gatherer_id->name, strlen(gatherer_id->name)); } return;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -