⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 split.c

📁 harvest是一个下载html网页的机器人
💻 C
📖 第 1 页 / 共 2 页
字号:
/* Copyright (c) 1994 Burra Gopal, Udi Manber.  All Rights Reserved. */
#include "glimpse.h"

extern CHAR *getword();
extern int checksg();
extern int D;
extern CHAR GProgname[MAXNAME];
extern FILE *debug;
extern int StructuredIndex;
extern int WHOLEFILESCOPE;
extern int ByteLevelIndex;
extern int ComplexBoolean;
extern int foundattr;
extern int foundnot;

/*
 * Scan [begin, end] for the next distinguishing token of a flat
 * (non-nested) boolean pattern and return a pointer to it.  The text from
 * the caller's starting position up to the returned pointer is the current
 * pattern (the whole text, not just the "words" in it).
 *
 *	begin	first character to examine
 *	end	last character to examine (inclusive)
 *	prev	parse-state flags left by the previous call
 *	next	out: updated parse-state flags; NOT written on a parse error
 *
 * Distinguishing tokens:
 *	','	OR separator; a parse error if AND_EXP is already set
 *	';'	AND separator; a parse error if OR_EXP is already set
 *	'='	attribute/value separator, only when StructuredIndex > 0
 * A backslash hides the character that follows it.
 *
 * Returns the position of the token, a position past the scanned range when
 * no token was found, `end' when begin > end on entry, or NULL after
 * printing a diagnostic on a parse error.
 */
CHAR *
parse_flat(begin, end, prev, next)
	CHAR	*begin;
	CHAR	*end;
	int	prev;
	int	*next;
{
	if (begin > end) {
		*next = prev;
		return end;
	}

	/* a finished sub-expression closes the attribute it belonged to */
	if (prev & ENDSUB_EXP) prev &= ~ATTR_EXP;
	/* an attribute name was seen last time, so what follows is its value */
	if ((prev & ATTR_EXP) && !(prev & VAL_EXP)) prev |= VAL_EXP;

	while (begin <= end) {
		if (*begin == ',') {
			prev |= OR_EXP;
			prev |= VAL_EXP;
			prev |= ENDSUB_EXP;
			if (prev & AND_EXP) {	/* ',' and ';' cannot be mixed in a flat pattern */
				fprintf(stderr, "%s: parse error at character '%c'\n", GProgname, *begin);
				return NULL;
			}
			*next = prev;
			return begin;
		}
		else if (*begin == ';') {
			prev |= AND_EXP;
			prev |= VAL_EXP;
			prev |= ENDSUB_EXP;
			if (prev & OR_EXP) {	/* ',' and ';' cannot be mixed in a flat pattern */
				fprintf(stderr, "%s: parse error at character '%c'\n", GProgname, *begin);
				return NULL;
			}
			*next = prev;
			return begin;
		}
		else if (*begin == '=') {
			if (StructuredIndex > 0) {
				if (prev & ATTR_EXP) {
					fprintf(stderr, "%s: syntax error: only ',' and ';' can follow 'attribute=value'\n", GProgname);
					return NULL;
				}
				prev |= ATTR_EXP;	/* remains an ATTR_EXP until a new ',' OR ';' */
				prev &= ~VAL_EXP;
				*next = prev;
				return begin;
			}
			/*
			 * Unstructured index: '=' is just another character, so it is
			 * consumed by the single advance below.  (Advancing here as well
			 * would silently skip the character after '=', hiding a ',' or
			 * ';' that immediately follows it.)
			 */
		}
		else if (*begin == '\\') begin++;	/* skip two things */
		begin++;
	}

	*next = prev;
	return begin;
}

/*
 * Split the flat boolean pattern GPattern (GM characters long) into leaf
 * terminals and build the pattern to be handed to agrep in APattern.
 *
 *	GPattern	pattern text; it is truncated in place when an unknown
 *			attribute name is reported
 *	GM		length of GPattern
 *	APattern	out: agrep pattern; its first MAXPAT bytes are cleared here
 *	terminals	out: one LEAF entry is filled in per value found
 *	pnum_terminals	in/out: number of used entries in terminals; it is NOT
 *			reset here, new leaves are appended
 *	pGParse		out: flat parse flags as maintained by parse_flat()
 *			(doesn't interpret it as a tree)
 *	num_attr	attribute ids must lie in 1 .. num_attr-1
 *
 * Side effects: sets the global foundattr when an attribute name is seen and
 * allocates each leaf value with malloc(); the leaves are left with the
 * caller.
 *
 * Returns the new value of *pnum_terminals, or -1 after printing a
 * diagnostic (parse error, unknown or over-long attribute name, too many
 * terms, pattern too long for APattern, or allocation failure).
 */
int
split_pattern_flat(GPattern, GM, APattern, terminals, pnum_terminals, pGParse, num_attr)
	CHAR	*GPattern;
	int	GM;
	CHAR	*APattern;
	ParseTree terminals[];
	int	*pnum_terminals;
	int	*pGParse;	/* doesn't interpret it as a tree */
	int	num_attr;
{
	int   j, k = 0, l = 0, len = 0;
	int   seglen;
	int   current_attr;
	CHAR  *buffer;
	CHAR  *buffer_pat;
	CHAR  *buffer_end;
	CHAR  *value;
	ParseTree *leaf;
	char  tempbuf[MAX_LINE_LEN];

	memset(APattern, '\0', MAXPAT);
	buffer = GPattern;
	buffer_end = buffer + GM;
	j = 0;
	*pGParse = 0;
	current_attr = 0;
	foundattr = 0;

	/*
	 * buffer is the running pointer, buffer_pat is the place where
	 * the distinguishing delimiter was found, buffer_end is the end.
	 */
	while ((buffer_pat = parse_flat(buffer, buffer_end, *pGParse, pGParse)) != NULL) {
		/* there is no pattern until after the distinguishing delimiter position: some agrep garbage */
		if (buffer_pat <= buffer) {
			buffer = buffer_pat + 1;
			if (buffer_pat >= buffer_end) break;
			continue;
		}
		seglen = (int)(buffer_pat - buffer);

		if ((*pGParse & ATTR_EXP) && !(*pGParse & VAL_EXP)) {	/* fresh attribute */
			foundattr = 1;
			/* the name is copied into a fixed-size buffer: refuse what cannot fit */
			if (seglen >= MAX_LINE_LEN) {
				fprintf(stderr, "%s: attribute name too long\n", GProgname);
				return -1;
			}
			memcpy(tempbuf, buffer, seglen);
			tempbuf[seglen] = '\0';
			len = strlen(tempbuf);
			/* drop each escaping backslash but keep the character it protects */
			for (k = 0; k < len; k++) {
				if (tempbuf[k] == '\\') {
					for (l = k; l < len; l++)
						tempbuf[l] = tempbuf[l+1];
					len--;
				}
			}
			if (((current_attr = attr_name_to_id(tempbuf, len)) <= 0) || (current_attr >= num_attr)) {
				buffer[seglen] = '\0';	/* cut the pattern so only the name is printed */
				fprintf(stderr, "%s: unknown attribute name '%s'\n", GProgname, buffer);
				return -1;
			}
			buffer = buffer_pat + 1;	/* immediate next character after distinguishing delimiter */
			if (buffer_pat >= buffer_end) break;
			continue;
		}
		else {	/* attribute's value OR raw-value */
			if (*pnum_terminals >= MAXNUM_PAT) {
				fprintf(stderr, "%s: boolean expression has too many terms\n", GProgname);
				return -1;
			}
			/*
			 * Either branch below stores seglen+1 bytes at APattern[j] and
			 * the final terminator may follow them; only MAXPAT bytes of
			 * APattern are known to exist (see the memset above).
			 */
			if (j + seglen + 2 > MAXPAT) {
				fprintf(stderr, "%s: pattern too long\n", GProgname);
				return -1;
			}
			value = (CHAR *)malloc(seglen + 2);
			if (value == NULL) {
				fprintf(stderr, "%s: malloc failure\n", GProgname);
				return -1;
			}
			memcpy(value, buffer, seglen);	/* without distinguishing delimiter */
			value[seglen] = '\0';

			leaf = &terminals[*pnum_terminals];
			leaf->op = 0;
			leaf->type = LEAF;
			leaf->terminalindex = *pnum_terminals;
			leaf->data.leaf.attribute = (unsigned char *)current_attr;	/* default is no structure */
			leaf->data.leaf.value = value;

			if (foundattr || WHOLEFILESCOPE) {
				memcpy(&APattern[j], buffer, seglen);
				j += seglen;	/* NOT including the distinguishing delimiter at buffer_pat, or '\0' */
				/* NOTE(review): when buffer_pat lies beyond buffer_end this looks past the scanned range -- confirm GPattern is padded */
				APattern[j++] = (*(buffer_pat + 1) == '\0' ? '\0' : ',');	/* always search for OR, do filtering at the end */
#if	BG_DEBUG
				fprintf(debug, "current_attr = %d, val = %s\n", current_attr, leaf->data.leaf.value);
#endif	/*BG_DEBUG*/
			}
			else {
				memcpy(&APattern[j], buffer, seglen + 1);
				j += seglen + 1;	/* including the distinguishing delimiter at buffer_pat, or '\0' */
			}
			(*pnum_terminals)++;
		}
		if (*pGParse & ENDSUB_EXP) current_attr = 0;	/* remains 0 until next fresh attribute */
		if (buffer_pat >= buffer_end) break;
		buffer = buffer_pat + 1;
	}
	if (buffer_pat == NULL) return -1;	/* got out of while loop because of NULL rather than break */
	APattern[j] = '\0';

	if (foundattr || WHOLEFILESCOPE) {	/* then search must always be OR since scope is over whole files */
		for (j = 0; APattern[j] != '\0'; j++) {
			if (APattern[j] == '\\') {
				/* skip the escaped character, but never step over the terminator */
				if (APattern[j+1] != '\0') j++;
			}
			else if (APattern[j] == ';') APattern[j] = ',';
		}
	}

	return(*pnum_terminals);
}

extern int is_complex_boolean();	/* use the one in agrep/asplit.c */
extern int get_token_bool();	/* use the one in agrep/asplit.c */

/* Spaces ARE significant: 'a1=v1' and 'a1=v1 ' and 'a1 =v1' etc. 
are NOT identical */intget_attribute_value(pattr, pval, tokenbuf, tokenlen, num_attr)	int	*pattr, tokenlen;	CHAR	**pval, *tokenbuf;{	CHAR	tempbuf[MAXNAME];	int	i = 0, j = 0, k = 0, l = 0;	while (i < tokenlen) {		if (tokenbuf[i] == '\\') {			tempbuf[j++] = tokenbuf[i++];			tempbuf[j++] = tokenbuf[i++];		}		else if (StructuredIndex) {			if (tokenbuf[i] == '=') {				i++;	/* skip over = : now @ 1st char of value */				tempbuf[j] = '\0';				for (k=0; k<j; k++) {					if (tempbuf[k] == '\\') {						for (l=k; l<j; l++)							tempbuf[l] = tempbuf[l+1];						j --;					}				}				if ( ((*pattr = attr_name_to_id(tempbuf, j)) <= 0) || (*pattr >= num_attr) ) {	/* named a non-existent attribute */					fprintf(stderr, "%s: unknown attribute name '%s'\n", GProgname, tempbuf);					return 0;				}				*pval = (CHAR *)malloc(tokenlen - i + 2);				memcpy(*pval, &tokenbuf[i], tokenlen - i);				(*pval)[tokenlen - i] = '\0';				foundattr = 1;				return 1;			}			else tempbuf[j++] = tokenbuf[i++];	/* consider = as just another char */		}		else tempbuf[j++] = tokenbuf[i++];	/* no attribute parsing */	}	/* Not a structured expression */	tempbuf[j] = '\0';	*pval = (CHAR *)malloc(j + 2);	memcpy(*pval, tempbuf, j);	(*pval)[j] = '\0';	return 1;}extern destroy_tree();	/* use the one in agrep/asplit.c *//* * Recursive descent; C-style => AND + OR have equal priority => must bracketize expressions appropriately or will go left->right. * Also strips out attribute names since agrep doesn't understand them: copies resulting pattern for agrep-ing into apattern. * Grammar: * 	E = {E} | ~a | ~{E} | E ; E | E , E | a * Parser: *	One look ahead at each literal will tell you what to do. *	~ has highest priority, ; and , have equal priority (left to right associativity), ~~ is not allowed. 
*/ParseTree *parse_tree(buffer, len, bufptr, apattern, apatptr, terminals, pnum_terminals, num_attr)	CHAR	*buffer;	int	len;	int	*bufptr;	CHAR	*apattern;	int	*apatptr;	ParseTree terminals[];	int	*pnum_terminals;	int	num_attr;{	int	token, tokenlen;	CHAR	tokenbuf[MAXNAME];	int	oldtokenlen;	CHAR	oldtokenbuf[MAXNAME];	ParseTree *t, *n, *leftn;	token = get_token_bool(buffer, len, bufptr, tokenbuf, &tokenlen);	switch(token)	{	case	'{':	/* (exp) */		apattern[(*apatptr)++] = '{';		if ((t = parse_tree(buffer, len, bufptr, apattern, apatptr, terminals, pnum_terminals, num_attr)) == NULL) return NULL;		if ((token = get_token_bool(buffer, len, bufptr, tokenbuf, &tokenlen)) != '}') {			fprintf(stderr, "%s: parse error at offset %d\n", GProgname, *bufptr);			destroy_tree(t);			return (NULL);		}		apattern[(*apatptr)++] = '}';		if ((token = get_token_bool(buffer, len, bufptr, tokenbuf, &tokenlen)) == 'e') return t;		switch(token)		{		/* must find boolean infix operator */		case ',':		case ';':			apattern[(*apatptr)++] = token;			leftn = t;			if ((t = parse_tree(buffer, len, bufptr, apattern, apatptr, terminals, pnum_terminals, num_attr)) == NULL) return NULL;			n = (ParseTree *)malloc(sizeof(ParseTree));			n->op = (token == ';') ? 
ANDPAT : ORPAT ;			n->type = INTERNAL;			n->data.internal.left = leftn;			n->data.internal.right = t;			return n;		/* or end of parent sub expression */		case '}':			unget_token_bool(bufptr, tokenlen);	/* part of someone else who called me */			return t;		default:			destroy_tree(t);			fprintf(stderr, "%s: parse error at offset %d\n", GProgname, *bufptr);			return NULL;		}	/* Go one level deeper */	case	'~':	/* not exp */		foundnot = 1;		apattern[(*apatptr)++] = '~';		if ((token = get_token_bool(buffer, len, bufptr, tokenbuf, &tokenlen)) == 'e') return NULL;		switch(token)		{		case 'a':			if (*pnum_terminals >= MAXNUM_PAT) {				fprintf(stderr, "%s: pattern expression too long (> %d terms)\n", GProgname, MAXNUM_PAT);				return NULL;			}			n = &terminals[*pnum_terminals];

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -