⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 maskgen.c

📁 harvest是一个下载html网页的机器人
💻 C
字号:
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal.  All Rights Reserved. */#include "agrep.h"extern unsigned D_endpos, endposition, Init1, wildmask;extern Mask[], Bit[], Init[], NO_ERR_MASK;extern int AND, REGEX, NOUPPER, D_length;extern unsigned char Progname[];extern int agrep_initialfd;extern int EXITONERROR;extern int errno;intmaskgen(Pattern, D)unsigned char *Pattern; int D;{	struct term { 		int flag; 		unsigned char class[WORD];	} 	position[WORD+10];	unsigned char c;	int i, j, k, l, M, OR=0, EVEN = 0, base, No_error;#ifdef	DEBUG	fprintf(stderr, "maskgen: len=%d, pat=%s, D=%d\n", strlen(Pattern), Pattern, D);#endif	for(i=0; i<WORD; i++) position[i].class[0] = '\0';	for(i=0; i<WORD; i++) position[i].flag = 0;	wildmask = NO_ERR_MASK = endposition = 0;	No_error = 0;	if ((M = strlen(Pattern)) <= 0) return 0;	if(NOUPPER) {		for(i=0; i<M; i++) if(isalpha((int)Pattern[i])) 			if (isupper((int)Pattern[i])) Pattern[i] = tolower((int)Pattern[i]);	}#ifdef DEBUG	for(i=0; i<M; i++) printf(" %d", Pattern[i]);	printf("\n");#endif	for (i=0, j=1; i< M; i++)	{		switch (Pattern[i])		{		case WILDCD : 			if(REGEX) {				position[j].class[0] = '.';				position[j].class[1] = '.';				position[j++].class[2] = '\0'; 				break;			}			wildmask = wildmask | Bit[j-1]; 			break;		case STAR   : 			break; 		case ORSYM  : 			break; 		case LPARENT: 			break;		case RPARENT: 			break;		case LANGLE : 			No_error = ON; 			EVEN++;			break;		case RANGLE : 			No_error = OFF; 			EVEN--;			if(EVEN < 0) {				fprintf(stderr, "%s: unmatched '<', '>' (use \\<, \\> to search for <, >)\n", Progname);				if (!EXITONERROR) {					errno = AGREP_ERROR;					return -1;				}				else exit(2);			}			break;		case LRANGE : 			if(No_error == ON) NO_ERR_MASK = NO_ERR_MASK | Bit[j]; 			i=i+1; 			if (Pattern[i] == NOTSYM) { 				position[j].flag = Compl; 				i++; 			}			k=0;			while (Pattern[i] != RRANGE && i < M)			{ 				if(Pattern[i] == HYPHEN) 				{ 					position[j].class[k-1] = Pattern[i+1]; 					i=i+2; 				}				else { 
					position[j].class[k] = position[j].class[k+1] = Pattern[i];					k = k+2; 					i++;				}			}			if(i == M) {				fprintf(stderr, "%s: unmatched '[', ']' (use \\[, \\] to search for [, ])\n", Progname);				if (!EXITONERROR) {					errno = AGREP_ERROR;					return -1;				}				else exit(2);			}			position[j].class[k] = '\0';			j++; 			break;		case RRANGE : 			fprintf(stderr, "%s: unmatched '[', ']' (use \\[, \\] to search for [, ])\n", Progname);			if (!EXITONERROR) {				errno = AGREP_ERROR;				return -1;			}			else exit(2);			break;     		case ORPAT  : 			if(REGEX == ON || AND == ON) {				fprintf(stderr, "illegal pattern: cannot handle OR (',') and AND (';')/regular-expressions simultaneously\n");				if (!EXITONERROR) {					errno = AGREP_ERROR;					return -1;				}				else exit(2);			}			OR = ON;			position[j].flag = 2; 			position[j].class[0] = '\0';			endposition = endposition | Bit[j++]; 			break;		case ANDPAT : 			position[j].flag = 2; 			position[j].class[0] = '\0'; 			if(j > D_length) AND = ON;			if(OR || (REGEX == ON && j>D_length)) {				fprintf(stderr, "illegal pattern: cannot handle AND (';') and OR (',')/regular-expressions simultaneously\n");				if (!EXITONERROR) {					errno = AGREP_ERROR;					return -1;				}				else exit(2);			}			endposition = endposition | Bit[j++]; 			break;			/*				case ' '    : if (Pattern[i-1] == ORPAT || Pattern[i-1] == ANDPAT) break;							  if(No_error == ON) NO_ERR_MASK = NO_ERR_MASK | Bit[j];							  position[j].flag = 0;							  position[j].class[0] = position[j].class[1] = Pattern[i];							  position[j++].class[2] = '\0';  break;			*/		case '\n'   : 			NO_ERR_MASK = NO_ERR_MASK | Bit[j];			position[j].class[0] = position[j].class[1] = '\n';			position[j++].class[2] = '\0'; 			break;		case WORDB  : 			NO_ERR_MASK = NO_ERR_MASK | Bit[j];			position[j].class[0] = 1;			position[j].class[1] = 47;			position[j].class[2] = 58;			position[j].class[3] = 64;			position[j].class[4] = 91;			position[j].class[5] = 96;			
position[j].class[6] = 123;			position[j].class[7] = 127;			position[j++].class[8] = '\0';			break;    		case NNLINE : 			NO_ERR_MASK |= Bit[j];			position[j].class[0] = position[j].class[1] = '\n';			position[j].class[2] = position[j].class[3] = NNLINE;			position[j++].class[4] = '\0';			break;		default : 			if(No_error == ON) NO_ERR_MASK = NO_ERR_MASK | Bit[j];			position[j].flag = 0;			position[j].class[0] = position[j].class[1] = Pattern[i];			position[j++].class[2] = '\0'; 		}		if(j > WORD) {			fprintf(stderr, "%s: pattern too long (has > %d chars)\n", Progname, WORD);			if (!EXITONERROR) {				errno = AGREP_ERROR;				return -1;			}			else exit(2);		}	}	if (EVEN != 0) {		fprintf(stderr, "%s: unmatched '<', '>' (use \\<, \\> to search for <, >)\n", Progname);		if (!EXITONERROR) {			errno = AGREP_ERROR;			return -1;		}		else exit(2);	}	M = j - 1;	base = WORD - M;	wildmask = (wildmask >> base);	endposition = (endposition >> base);	NO_ERR_MASK = (NO_ERR_MASK >> 1) & (~Bit[1]);	NO_ERR_MASK = ~NO_ERR_MASK >> (base-1);	for (i=1; i<= WORD - M ; i++) Init[0] = Init[0] | Bit[i];	Init[0] = Init[0] | endposition;	/* not necessary for INit[i], i>0, */	/* but at every begining of the matching process append one					no-match character to initialize the error vectors */	endposition = ( endposition << 1 ) + 1;	Init1 = (Init[0] | wildmask | endposition) ;	D_endpos = ( endposition >> ( M - D_length ) ) << ( M - D_length);	endposition = endposition ^ D_endpos;#ifdef DEBUG	printf("endposition: %o\n", endposition);	printf("no_err_mask: %o\n", NO_ERR_MASK);#endif	for(c=0, i=0; i < MAXSYM; c++, i++)	{		for (k=1, l=0; k<=M ; k++, l=0)  {			while (position[k].class[l] != '\0') {				if (position[k].class[l] == NOCARE && (c != '\n' || REGEX) ) 				{  					Mask[c] = Mask[c] | Bit[base + k]; 					break; 				}				if (c >= position[k].class[l] && c <= position[k].class[l+1])				{  					Mask[c] = Mask[c] | Bit[base + k]; 					break; 				}				l = l + 2;  			}			if (position[k].flag == Compl) 
Mask[c] = Mask[c] ^ Bit[base+k];		}	}	if(NOUPPER) for(i=0; i<MAXSYM; i++)		if (isupper(i)) Mask[i] = Mask[tolower(i)]; 	return(M);}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -