⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 region.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. *//* From mail received from Bill Camargo and Darren Hardy in June 1994 */#include <stdio.h>#include "region.h"/* * Exports the following routines. Any filtering/attr-val parsing mechanism * can be integrated into glimpse and glimpseindex with this interface. */char * /* attrname = */ attr_id_to_name(/* int attrid */);int /* attrid = */ attr_name_to_id(/* char *attrname */);int attr_dump_names(/* char *filename */);int attr_load_names(/* char *filename */);int attr_free_table(/* void */);int region_initialize(/* void */);int region_destroy(/* void */);int region_create(/* char *filename */);int /* attrid = */ region_identify(/* int offset_in_file, int len_of_region */);#if	BG_DEBUGextern int memory_usage;#endif	/* BG_DEBUG*/#if	STRUCTURED_QUERIESprintsp(){	int	x;	printf("stack at %x\n", &x);}/*****************************************************************************/#define ATTR_HASH_TABLE_SIZE	256	/* must be a power of 16=multiple of 4 bits */#define ATTR_HASH_TABLE_MASK	0xff	/* bits that mask off the bits in TABLE_SIZE */#define ATTR_HASH_STEP_SIZE	2	/* #of nibbles that make up TABLE_SIZE */attr_element_t	*attr_hash_table[ATTR_HASH_TABLE_SIZE];char	**attr_name_table = NULL;int	attr_num = 0;int	attr_maxid = 0;/* English language characters have all info in lowest 4 bits */intattr_hash_index(word, len)	char	*word;	int	len;{	int	i=0, j, index = 0, temp;	for (i=0; i+ATTR_HASH_STEP_SIZE<=len; i+=ATTR_HASH_STEP_SIZE) {		temp = 0;		for (j=0; j<ATTR_HASH_STEP_SIZE; j++)			temp = (temp << 4) | word[i+j] & 0x0f;		index = (index + temp) & ATTR_HASH_TABLE_MASK;	}	temp = 0;	for (j=0; i+j<len; j++)		temp = (temp << 4) | word[i+j] & 0x0f;	index = (index + temp) & ATTR_HASH_TABLE_MASK;	return index;}char *attr_id_to_name(id)	int	id;{#if	0	printf("id = %d\n", id);#endif	/*0*/	if ((attr_name_table == NULL) || (id > attr_maxid)) return NULL;	else return attr_name_table[id];}/* * returns the attribute 
number associated with name, 0 for no attribute -- * NOTE: name may not be null terminated and you are not allowed to alter it. * called during indexing and search. */intattr_name_to_id(name, len)	char	*name;	int	len;{	int		index = attr_hash_index(name, len);	attr_element_t	*e = attr_hash_table[index];#if	0	char		c = name[len];	name[len] = '\0';	fprintf(stderr, "attr=%s @ %d?\n", name, index);	fflush(stderr);	name[len] = c;#endif	/*0*/	while(e != NULL) {		if (!strncmp(e->attribute, name, len)) break;		else e = e->next;	}	if (e!=NULL) {#if	0		fprintf(stderr, "foundid=%d\n", e->attributeid);#endif	/*0*/		return e->attributeid;	}	return 0;}/* * returns the attribute number (> 0) for the attribute "name". It adds the * name as a newly seen attribute if it doesn't exist already (using #tables). * called in region_create, which is called during indexing. */attr_insert_name(name, len)	char	*name;	int	len;{	int		index = attr_hash_index(name, len);	attr_element_t	**pe = &attr_hash_table[index], *e;	while(*pe != NULL) {		if (!strcmp((*pe)->attribute, name)) break;		else pe = &(*pe)->next;	}	if (*pe!=NULL) return (*pe)->attributeid;	e = (attr_element_t *)my_malloc(sizeof(attr_element_t));	e->attribute = (char *)my_malloc(len + 2);	strncpy(e->attribute, name, len + 1);	e->attributeid = (++attr_num);	e->next = NULL;	*pe = e;#if	0	fprintf(stderr, "inserting %s %d\n", name, attr_num);#endif	/*0*/	return e->attributeid;}/* * frees current hash table of attr-value pairs. * called after dump in indexing, and at end of search (after previous load). 
*/intattr_free_table(){	int	i;	attr_element_t *e, *temp;	for (i=0; i<ATTR_HASH_TABLE_SIZE; i++) {		e = attr_hash_table[i];		while (e != NULL) {			temp = e->next;#if	BG_DEBUG			memory_usage -= strlen(e->attribute) + 2;#endif	/*BG_DEBUG*/			my_free(e->attribute, 0);			my_free(e, sizeof(attr_element_t));			e = temp;		}		attr_hash_table[i] = NULL;	}	if (attr_name_table != NULL) {		my_free(attr_name_table, sizeof(attr_element_t *) * ATTR_HASH_TABLE_SIZE);		attr_name_table = NULL;	}	return 0;}/* Looks for embedded attributes and copies the real attribute into dest */attr_extract(dest, src)	char	*dest, *src;{	char	*oldsrc = src;check_again:	if (!strncmp("embed<", src, 6) || !strncmp("Embed<", src, 6) || !strncmp("EMBED<", src, 6)) {		src += 6;		while ((*src != '>') && (*src != '\0')) src++;		if (*src == '\0') {			strcpy(dest, oldsrc);			return;		}		while (!isalnum(*(unsigned char *)src)) src ++;	/* assuming type names are .. */		oldsrc = src;		goto check_again;	}	strcpy(dest, src);	return;}/* * dumps the attribute-list into a file name (id, name, \n) * into the file specified and then destroys the hash table. * Returns #of attributes dumped into the file, -1 if error. * called at the end of indexing. */intattr_dump_names(filename)	char	*filename;{	int	i=0;	int	ret = -1;	FILE	*fp = fopen(filename, "w");	attr_element_t *e;#if	0	printf("in dump attr\n");#endif	/*0*/	if (fp == NULL) return -1;	ret = 0;	for (i=0; i<ATTR_HASH_TABLE_SIZE; i++) {		e = attr_hash_table[i];		while (e != NULL) {			fprintf(fp, "%d,%s ", e->attributeid, e->attribute);			e = e->next;			ret ++;		}		fputc('\n', fp);	}	fflush(fp);	fclose(fp);	return ret;}/* * constructs a hash-table of attributes by reading them from the file. * Returns #of attributes read from the file, -1 if error. * Does not recompute hash-indices of attributes. * called before searching for attr=val pairs. 
*/intattr_load_names(filename)	char	*filename;{	int	index = 0, ret = 0;	FILE	*fp = fopen(filename, "r");	attr_element_t *e;	int	c = 0;	char	temp[1024];	/* max attr name */	char	buffer[1024+32];/* max attr id pair */	int	i;	int	id;	attr_maxid = 0;	memset(attr_hash_table, '\0', sizeof(attr_element_t *) * ATTR_HASH_TABLE_SIZE);	if (fp == NULL) return -1;	while ((c = getc(fp)) != EOF) {		if (c == '\n') {			index ++;			continue;		}		ungetc(c, fp);		/* fscanf screws up fp and skips over trailing space characters (\t,\n, ) */		i=0;		while ((c=getc(fp)) != ' ') buffer[i++] = c;		buffer[i] = '\0';#if	0		printf("buffer=%s\n", buffer);#endif	/*0*/		sscanf(buffer, "%d,%1023s", &id, temp);		temp[1023] = '\0';#if	0		printf("read attr=%s,%d @ %d\n", temp, id, index);#endif	/*0*/		if (id <= 0) continue;		e = (attr_element_t *)my_malloc(sizeof(attr_element_t));		e->attributeid = id;		if (id > attr_maxid) attr_maxid = id;		e->attribute = (char *)my_malloc(strlen(temp) + 2);		strcpy(e->attribute, temp);		e->next = attr_hash_table[index];		attr_hash_table[index] = e;		ret ++;		if (index >= ATTR_HASH_TABLE_SIZE - 1) break;	}	fclose(fp);	attr_name_table = (char **)my_malloc(sizeof(char *) * (ret=(ret >= (attr_maxid + 1) ? ret : (attr_maxid + 1))));	memset(attr_name_table, '\0', sizeof(char *) * ret);	for (i=0; i<ATTR_HASH_TABLE_SIZE; i++) {		e = attr_hash_table[i];		while (e!=NULL) {			attr_name_table[e->attributeid] = e->attribute;			e = e->next;		}	}	return ret;}/***************************************************************************/region_t *current_regions, *nextpos;	/* nextpos is hint into list *//* * Called during indexing before region_create. * returns 0. 
*/intregion_initialize(){	attr_num = 0;	attr_name_table = NULL;	memset(attr_hash_table, '\0', sizeof(attr_element_t *) * ATTR_HASH_TABLE_SIZE);	current_regions = nextpos = NULL;	return 0;}/* * creates a data structure containing the list of attributes * which occur at increasing offsets in the given file -- future * region_identify() calls use the "current" data structure. * returns 0 if success, -1 if it cannot open the file. */intregion_create(name)	char	*name;{	FILE	*fp;	AVList	*al;	region_t *prl, *rl, *lastrl;	Template *t;	char	temp[1024];	current_regions = nextpos = NULL;	if ((fp = my_fopen(name, "r")) == NULL) return -1;	init_parse_template_file(fp);	lastrl = NULL;	while ((t = parse_template()) != NULL) {		/* do insertion sort of list returned by parse_template using offsets */		if ((t->url != NULL) && (strlen(t->url) > 0)) {			rl = (region_t *)my_malloc(sizeof(region_t));			/* Darren Hardy's Voodo :-) */                        /* The SOIF looks like this:  @TTYPE { URL\n */                        /* t->offset points to the @ */                        /* rl->offset points to the space before URL */                        /* rl->length includes the entire URL */                        rl->offset = t->offset + strlen(t->template_type) + 3;                        rl->length = strlen(t->url) + 1;			rl->attributeid = attr_insert_name("url", 3);			if ((lastrl != NULL) && (lastrl->offset <= rl->offset)) {	/* go forward */				prl = lastrl;				while (prl->next != NULL) {					if (prl->next->offset > rl->offset) {						rl->prev = prl;						rl->next = prl->next;						prl->next->prev = rl;						prl->next = rl;						lastrl = rl;						break;					}					else prl = prl->next;				}				if (prl->next == NULL) {					rl->next = NULL;					rl->prev = prl;					prl->next = rl;					lastrl = rl;				}			}			else {	/* must go backwards and find the right place to insert */				prl = lastrl;				while (prl != NULL) {					if (prl->offset < rl->offset) {						rl->prev = prl;						rl->next = 
prl->next;						if (prl->next != NULL)							prl->next->prev = rl;						prl->next = rl;						lastrl = rl;						break;					}					else prl = prl->prev;				}				if (prl == NULL) {					rl->next = current_regions;					if (current_regions != NULL) current_regions->prev = rl;					rl->prev = NULL;					current_regions = rl;					lastrl = rl;				}			}#if	0			printf("region url=[%d,%d]\n", rl->offset, rl->offset+rl->length);#endif	/*0*/		}		al = t->list;		while(al != NULL) {			rl = (region_t *)my_malloc(sizeof(region_t));			rl->offset = al->data->offset;			rl->length = al->data->vsize;			attr_extract(temp, al->data->attribute);			rl->attributeid = attr_insert_name(temp, strlen(temp));			if ((lastrl != NULL) && (lastrl->offset <= rl->offset)) {	/* go forward */				prl = lastrl;				while (prl->next != NULL) {					if (prl->next->offset > rl->offset) {						rl->prev = prl;						rl->next = prl->next;						prl->next->prev = rl;						prl->next = rl;						lastrl = rl;						break;					}					else prl = prl->next;				}				if (prl->next == NULL) {					rl->next = NULL;					rl->prev = prl;					prl->next = rl;					lastrl = rl;				}			}			else {	/* must go backwards and find the right place to insert */				prl = lastrl;				while (prl != NULL) {					if (prl->offset < rl->offset) {						rl->prev = prl;						rl->next = prl->next;						if (prl->next != NULL)							prl->next->prev = rl;						prl->next = rl;						lastrl = rl;						break;					}					else prl = prl->prev;				}				if (prl == NULL) {					rl->next = current_regions;					if (current_regions != NULL) current_regions->prev = rl;					rl->prev = NULL;					current_regions = rl;					lastrl = rl;				}			}#if	0			printf("region %s=[%d,%d]\n", al->data->attribute, rl->offset, rl->offset+rl->length);#endif	/*0*/			al = al->next;		}		free_template(t);	}	finish_parse_template();	nextpos = current_regions;	fclose(fp);	return 0;}/* * frees the data structure created for the current file above. * returns 0. 
*/intregion_destroy(){	region_t *rl = current_regions, *trl;	while (rl != NULL) {		trl = rl;		rl = rl->next;		free(trl);	}	current_regions = nextpos = NULL;	return 0;}/* * returns attribute number [1..num_attr] which covers (inclusive) * the region * [offset, offset+len] in the "current" file, 0 if none. * called during indexing after region_create, and search after * attr_load_names. Do not need sophisticated interval trees here! */intregion_identify(offset, len)	int	offset, len;{	region_t *rl;	if (nextpos == NULL) nextpos = current_regions;	rl = nextpos;	while (rl!=NULL) {		if (rl->offset > offset + len)			goto backwards;			/* definitely before: can be earlier region OR hole */		else if ((rl->offset <= offset) && (rl->offset + rl->length >= offset + len))			return rl->attributeid;		/* definitely within */		else if (rl->offset + rl->length < offset)			nextpos = rl = rl->next;	/* definitely after: later region */		else return 0;				/* overlapping: error */	}	return 0;					/* reached end of file */backwards:	while (rl!=NULL) {		if (rl->offset > offset + len)			nextpos = rl = rl->prev;	/* definitely before: earlier region */		else if ((rl->offset <= offset) && (rl->length + rl->length >= offset + len))			return rl->attributeid;		/* definitely within */		else if (rl->offset + rl->length < offset)			return 0;			/* hole */		else return 0;				/* overlapping: error */	}	return 0;					/* reached end of file */}#else	/*STRUCTURED_QUERIES*/int attr_num = 0;char *attr_id_to_name(id)	int	id;{	return NULL;}int attr_name_to_id(name)	char	*name;{	return 0;}int attr_dump_names(name)	char	*name;{	return 0;}int attr_load_names(name)	char	*name;{	return 0;}int attr_free_table(){	return 0;}int region_initialize(){	return 0;}int region_desrtroy(){	return 0;}int region_create(name)	char	*name;{	return 0;}int region_destroy(){	return 0;}int region_identify(offset, len)	int	offset, len;{	return 0;}#endif	/*STRUCTURED_QUERIES*/

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -