⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 summarize.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
			ungetc(c,fp); \			memset(buf,'\0',BUFSIZ); \			i = 0; \			for (c = getc(fp); !isspace(c); c = getc(fp)) { \				if (i >= BUFSIZ - 1) return(1); \				buf[i++] = c; \			} \			xfree(template->url); \			template->url = xstrdup(buf); \			while (c != '\n') c = getc(fp); \		} else ungetc(c, fp);  \	} while(0);/* *  read_structured_summary() - Reads the output of a structured *  summarizer that outputs its data as attribute value pairs.  It *  parses the attribute value pairs and adds them to the given template. * *  XXX: NOTE that this doesn't work if the summarizer crashes.  For *  example, if it gets a segmentation fault and prints that to stderr, *  then the text to stderr will get caught in the attribute name.  Need *  to use fork/exec and check error code to make sure that it returns 0 *  as the exit code, otherwise throw away the data that was generated. *  grab_attribute() dies on '\n' so if there's an error message from *  the summarizer that goes to ifp, then the parser dies. */static int read_structured_summary(fp, template)     FILE *fp;     Template *template;{	static char buf[BUFSIZ];	char *attr = NULL;	char *value = NULL;	char *p = NULL;	int vsize;	int c;	grab_ttype();		/* @TYPE { is optional */	while (1) {		skip_whitespace();		grab_attribute();		attr = strdup(buf);	/* Read Attribute */		grab_attribute();		vsize = atoi(buf);	/* Read Value Size */		c = fgetc(fp);		if (c != ':') {			xfree(attr);			return 0;	/* expecting : */		}		c = fgetc(fp);		if (c != '\t') {			xfree(attr);			return 0;	/* expecting <TAB> */		}		value = xmalloc(vsize + 1);	/* Read Value */		if (fread(value, 1, vsize, fp) != vsize) {			xfree(attr);			xfree(value);			return 0;		}		value[vsize] = '\0';		if (do_cksumdups)			add_AVList(template->list, attr, value, vsize);		else			FAST_add_AVList(template->list, attr, value, vsize);		xfree(attr);		xfree(value);	}}#undef skip_whitespace#undef grab_attribute/* *  grab_fulltext() - Adds the contents of an entire file to the template. 
*/static void grab_fulltext(template, object)     Template *template;     DataObject *object;{	FILE *fp;	char *value = NULL;	if (!object->url->filename || !object->s)	/* Object is not local... */		return;	if ((fp = fopen(object->url->filename, "r")) == NULL) {		log_errno(object->url->filename);		return;	}	/* We have the file, so slurp the whole thing in at once */	value = xmalloc(object->s->st_size + 1);	if (fread(value, 1, object->s->st_size, fp) != object->s->st_size) {		log_errno(object->url->filename);		xfree(value);		return;	}	fclose(fp);	value[object->s->st_size] = '\0';	/* be nice */	add_AVList(template->list, T_FULLTEXT, value, object->s->st_size);	xfree(value);		/* don't need this memory anymore */}#ifdef USE_QUICKSUM#define MAX_REGEX 32		/* max number of regular expressions per type *//* *  For each type, define all of the regular expressions and their *  associated attributes. */struct quicksums {	char *type;	char *attribute[MAX_REGEX];	char *regex[MAX_REGEX];	regex_t compiled[MAX_REGEX];	/* compiled version of regex */};struct quicksums *qs[MAX_TYPES];/* *  When summarizing data, allocate a structure for each attribute that *  lets us control the value buffer. 
*/struct avbuf {	char *attribute;	Buffer *b;};struct avbuf vbuf[MAX_REGEX];	/* value buffers */#if defined(USE_POSIX_REGEX)#define do_match(s, c) (regexec((c), (s), 0, 0, 0) == 0)#else#error "unsupported"#endifstatic void init_quicksum(){	FILE *fp;	char buf[BUFSIZ], *type, *attr, *regex, *p, *s;	int i, j, done;	memset(qs, '\0', MAX_TYPES * sizeof(struct quicksums *));	if ((fp = fopen(quicksum_file, "r")) == NULL) {		log_errno(quicksum_file);		return;	}	while (fgets(buf, BUFSIZ, fp)) {		if (buf[0] == '#')			continue;	/* skip comments */		type = p = buf;		while (*++p != '\t');	/* skip type name */		*p++ = '\0';		while (isspace(*p))			p++;	/* skip whitespace */		attr = p;		while (!isspace(*p))			p++;	/* skip whitespace */		*p++ = '\0';		while (isspace(*p))			p++;	/* skip whitespace */		regex = p;		if ((s = strrchr(regex, '\n')) != NULL)			*s = '\0';	/* remove newline */		for (i = done = 0; qs[i] != NULL; i++) {			if (!strcmp(qs[i]->type, type)) {	/* add to type */				/* find attribute's place */				for (j = 0; qs[i]->attribute[j] != NULL; j++);				qs[i]->attribute[j] = strdup(attr);				qs[i]->regex[j] = strdup(regex);#if defined(USE_POSIX_REGEX)				if (regcomp(&qs[i]->compiled[j], qs[i]->regex[j],					    USE_RE_SYNTAX)) {					fatal("init_quicksum: FILE: %s TYPE: %s COMPILE_ERROR: %s\n",					      quicksum_file, buf, qs[i]->regex[j]);				}#endif				if (j + 1 < MAX_REGEX) {					qs[i]->attribute[j + 1] = NULL;					qs[i]->regex[j + 1] = NULL;				} else					errorlog("WARNING! 
INCREASE MAX_REGEX");				done = 1;			}		}		if (!done) {	/* new type */			qs[i] = xmalloc(sizeof(struct quicksums));			qs[i]->type = strdup(type);			qs[i]->attribute[0] = strdup(attr);			qs[i]->regex[0] = strdup(regex);#if defined(USE_POSIX_REGEX)			if (regcomp(&qs[i]->compiled[0], qs[i]->regex[0],				    USE_RE_SYNTAX)) {				fatal("init_quicksum: FILE: %s TYPE: %s COMPILE_ERROR: %s\n",				      quicksum_file, buf, qs[i]->regex[0]);			}#endif			qs[i]->attribute[1] = NULL;			qs[i]->regex[1] = NULL;		}	}	fclose(fp);	/* Reset the buffers, then allocate the buffers */	for (i = 0; i < MAX_REGEX; i++) {		vbuf[i].attribute = NULL;		vbuf[i].b = create_buffer(BUFSIZ);	}	if (debug_ok(64, 1)) {		for (i = 0; qs[i] != NULL; i++) {			Log("Type: %s\n", qs[i]->type);			for (j = 0; qs[i]->attribute[j]; j++)				Log("Attribute: %s --> RE: %s\n",				    qs[i]->attribute[j], qs[i]->regex[j]);		}	}}static void finish_quicksum(){	int i, j;	for (i = 0; qs[i] != NULL; i++) {		if (qs[i]->type)			xfree(qs[i]->type);#if defined(USE_POSIX_REGEX)		for (j = 0; qs[i]->attribute[j] != NULL; j++)			regfree(&qs[i]->compiled[j]);#endif		for (j = 0; qs[i]->attribute[j] != NULL; j++)			xfree(qs[i]->attribute[j]);		for (j = 0; qs[i]->regex[j] != NULL; j++)			xfree(qs[i]->regex[j]);		xfree(qs[i]);	}	memset(qs, '\0', MAX_TYPES * sizeof(struct quicksums *));	for (i = 0; i < MAX_REGEX; i++) {		if (vbuf[i].attribute) {			xfree(vbuf[i].attribute);			vbuf[i].attribute = NULL;		}		if (vbuf[i].b) {			free_buffer(vbuf[i].b);			vbuf[i].b = NULL;		}	}}/* *  generate_quicksum() - Quickly summarizes object and addes attributes to *  template. */static void generate_quicksum(template, object)     Template *template;     DataObject *object;{	char buf[BUFSIZ];	FILE *fp;	int i, j, curqs, found;	Debug(64, 1, ("generate_quicksum(%s, %s)\n", object->type, object->url->url));	/* Can we quicksum the object?  
If so, find the object's type */	for (i = 0; qs[i] != NULL; i++) {		if (!strcmp(qs[i]->type, object->type))			break;	}	if (qs[i] == NULL)		return;	curqs = i;	/* Try opening the file to summarize */	if ((fp = fopen(object->url->filename, "r")) == NULL) {		log_errno(object->url->filename);		return;	}	/* Reset the buffers */	for (i = 0; i < MAX_REGEX; i++) {		vbuf[i].attribute = NULL;	}	for (i = 0; qs[curqs]->attribute[i] != NULL; i++) {		/* check to see if attribute is in value buffer */		for (found = j = 0; vbuf[j].attribute; j++) {			if (!strcmp(vbuf[j].attribute,				qs[curqs]->attribute[i])) {				found = 1;				break;			}		}		if (!found) {			for (j = 0; vbuf[j].attribute != NULL; j++);	/* Find first spot */			vbuf[j].attribute = strdup(qs[curqs]->attribute[i]);		}	}	/* Now summarize the file and write saved lines to vbuf */	while (fgets(buf, BUFSIZ, fp)) {		for (j = 0; qs[curqs]->attribute[j] != NULL; j++) {			if (do_match(buf, &qs[curqs]->compiled[j])) {				for (found = i = 0; vbuf[i].attribute; i++) {					if (!strcmp(qs[curqs]->attribute[j],						vbuf[i].attribute)) {						found = 1;						break;	/* Find vbuf to use */					}				}				if (found) {					add_buffer(vbuf[i].b, buf, strlen(buf));				}			}		}	}	fclose(fp);	/* Add values to template */	for (i = 0; vbuf[i].attribute; i++) {		if (vbuf[i].b->length > 0) {			add_AVList(template->list, vbuf[i].attribute,			    vbuf[i].b->data, vbuf[i].b->length);		}	}	/* Clean up */	for (i = 0; i < MAX_REGEX; i++) {		if (vbuf[i].attribute) {			xfree(vbuf[i].attribute);			vbuf[i].attribute = NULL;		}		shrink_buffer(vbuf[i].b);	}}/* *  can_quicksum() - Returns non-zero if generate_quicksum() can process type; *  returns 0 otherwise; */static int can_quicksum(type)     char *type;{	int i;	for (i = 0; qs[i] != NULL; i++)		if (!strcmp(qs[i]->type, type))			return (1);	return (0);}#endif /* USE_QUICKSUM *//* *  mkdescription() - Generates a Description for the Template. 
*/static void mkdescription(t)     Template *t;{	AVPair *avp;	int i, j, gotdata, n;	char *s;	if (t == NULL || t->list == NULL)		return;	/* See if the Summarizer already generated one */	if (extract_AVPair(t->list, T_DESCRIPTION) != NULL)		return;	/* Try to build a Description attribute based on other fields */	/* These heuristics only apply if the Summarizer failed to make one */	avp = extract_AVPair(t->list, T_ABSTRACT);	if (avp == NULL)		avp = extract_AVPair(t->list, T_PARTTEXT);	if (avp == NULL)		avp = extract_AVPair(t->list, "body");  /* By SGML summarizer */	if (avp == NULL)		avp = extract_AVPair(t->list, T_TITLE); /* Last resort */	/* Cannot find any data to use, or not enough */	if (avp == NULL || avp->vsize < 2)		return;	/* Extract a reasonable chunk of the data */	if ( (s = xmalloc(BUFSIZ / 4 + 1)) == NULL)		return;	for (i = 0, j = 0, gotdata = 0; i < avp->vsize; i++) {		if (j >= BUFSIZ / 4)			break;		if (isspace(avp->value[i])) {			/* Squash white space into a single blank / newline */			if (gotdata)				s[j++] = (avp->value[i] == '\n' ? '\n' : ' ');			gotdata = 0;		} else {			s[j++] = avp->value[i];			gotdata = 1;		}	}	s[j] = '\0';	if (j > 0) {		/* We know for sure that T_DESCRIPTION isn't in t->list */		FAST_add_AVList(t->list, T_DESCRIPTION, s, j);	}	xfree(s);}/* *  mkkeywords() - Generates a keywords list for the Template. */static void mkkeywords(t)     Template *t;{	AVPair *avp;	char *s;	int have_keys = 0;	if (t == NULL || t->list == NULL)		return;	/*	 *  Make canonical Keywords list, using attribute Keywords	 *  or Partial-Text, or Description, Abstract, or Title.	 
*/	if ((avp = extract_AVPair(t->list, T_KEYS)) != NULL)		have_keys = 1;	if (avp == NULL)		avp = extract_AVPair(t->list, T_PARTTEXT);	if (avp == NULL)		avp = extract_AVPair(t->list, "Description");	if (avp == NULL)		avp = extract_AVPair(t->list, T_ABSTRACT);	if (avp == NULL)		avp = extract_AVPair(t->list, T_TITLE);	if (avp == NULL)		return;		/* don't make any modifications */	if ((s = mkwordlist(avp->value, avp->vsize)) == NULL)		return;		/* don't make any modifications */	if (have_keys) {		xfree(avp->value);		avp->value = strdup(s);		avp->vsize = strlen(s);	} else {		/* We know for sure that T_KEYS isn't in t->list */		FAST_add_AVList(t->list, T_KEYS, s, strlen(s));	}	xfree(s);	return;}/* *  mkgid() - Verifies that the Template contains the Gatherer *  Identification attributes:  Gatherer-Name, Gatherer-Host, and *  Gatherer-Version. */static void mkgid(t)     Template *t;{	AVPair *avp;	if (t == NULL || t->list == NULL || gatherer_id == NULL)		return;	if ((avp = extract_AVPair(t->list, T_GHOST)) == NULL) {		add_AVList(t->list, T_GHOST, gatherer_id->host,		    strlen(gatherer_id->host));	}	if ((avp = extract_AVPair(t->list, T_GVERSION)) == NULL) {		add_AVList(t->list, T_GVERSION, gatherer_id->version,		    strlen(gatherer_id->version));	}	if ((avp = extract_AVPair(t->list, T_GNAME)) == NULL) {		add_AVList(t->list, T_GNAME, gatherer_id->name,		    strlen(gatherer_id->name));	}	return;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -